From 047904ef73d42eccb33328089642ae6ffe20318d Mon Sep 17 00:00:00 2001 From: myf272609 Date: Mon, 26 Sep 2022 11:55:06 +0800 Subject: [PATCH 01/23] [to #42322933] fix init issues for multi-style cartoon models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 修复多风格模型pipeline初始化问题 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10249429 --- modelscope/pipelines/cv/image_cartoon_pipeline.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/modelscope/pipelines/cv/image_cartoon_pipeline.py b/modelscope/pipelines/cv/image_cartoon_pipeline.py index f34be618..72fda989 100644 --- a/modelscope/pipelines/cv/image_cartoon_pipeline.py +++ b/modelscope/pipelines/cv/image_cartoon_pipeline.py @@ -39,10 +39,13 @@ class ImageCartoonPipeline(Pipeline): super().__init__(model=model, **kwargs) with device_placement(self.framework, self.device_name): self.facer = FaceAna(self.model) - self.sess_anime_head = self.load_sess( - os.path.join(self.model, 'cartoon_h.pb'), 'model_anime_head') - self.sess_anime_bg = self.load_sess( - os.path.join(self.model, 'cartoon_bg.pb'), 'model_anime_bg') + with tf.Graph().as_default(): + self.sess_anime_head = self.load_sess( + os.path.join(self.model, 'cartoon_h.pb'), + 'model_anime_head') + self.sess_anime_bg = self.load_sess( + os.path.join(self.model, 'cartoon_bg.pb'), + 'model_anime_bg') self.box_width = 288 global_mask = cv2.imread(os.path.join(self.model, 'alpha.jpg')) From 5e4894870bf56585f294f24bc485d97ab1420e4e Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Mon, 26 Sep 2022 12:23:28 +0800 Subject: [PATCH 02/23] [to #42322933]add t5 model / text2text generation task Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10191736 * add T5 for generation --- modelscope/metainfo.py | 3 + modelscope/models/nlp/T5/__init__.py | 21 + modelscope/models/nlp/T5/configuration_t5.py | 174 ++ modelscope/models/nlp/T5/modeling_t5.py | 2003 +++++++++++++++++ .../models/nlp/T5/t5_for_text_generation.py | 56 + modelscope/models/nlp/__init__.py | 3 +- modelscope/outputs.py | 7 + modelscope/pipelines/nlp/__init__.py | 4 +- .../nlp/text2text_generation_pipeline.py | 87 + modelscope/preprocessors/__init__.py | 3 +- modelscope/preprocessors/nlp/__init__.py | 2 + modelscope/preprocessors/nlp/nlp_base.py | 35 + modelscope/utils/constant.py | 1 + tests/pipelines/test_text2text_generation.py | 61 + 14 files changed, 2457 insertions(+), 3 deletions(-) create mode 100644 modelscope/models/nlp/T5/__init__.py create mode 100644 modelscope/models/nlp/T5/configuration_t5.py create mode 100644 modelscope/models/nlp/T5/modeling_t5.py create mode 100644 modelscope/models/nlp/T5/t5_for_text_generation.py create mode 100644 modelscope/pipelines/nlp/text2text_generation_pipeline.py create mode 100644 tests/pipelines/test_text2text_generation.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 80a522b2..29a35fbe 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -65,6 +65,7 @@ class Models(object): plug = 'plug' bert_for_ds = 'bert-for-document-segmentation' ponet = 'ponet' + T5 = 'T5' # audio models sambert_hifigan = 'sambert-hifigan' @@ -179,6 +180,7 @@ class Pipelines(object): part_of_speech = 'part-of-speech' named_entity_recognition = 'named-entity-recognition' text_generation = 'text-generation' + text2text_generation = 'text2text-generation' sentiment_analysis = 'sentiment-analysis' sentiment_classification = 'sentiment-classification' text_classification = 
'text-classification' @@ -280,6 +282,7 @@ class Preprocessors(object): cross_encoder_tokenizer = 'cross-encoder-tokenizer' bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer' text_gen_tokenizer = 'text-gen-tokenizer' + text2text_gen_preprocessor = 'text2text-gen-preprocessor' token_cls_tokenizer = 'token-cls-tokenizer' ner_tokenizer = 'ner-tokenizer' nli_tokenizer = 'nli-tokenizer' diff --git a/modelscope/models/nlp/T5/__init__.py b/modelscope/models/nlp/T5/__init__.py new file mode 100644 index 00000000..7c1cea36 --- /dev/null +++ b/modelscope/models/nlp/T5/__init__.py @@ -0,0 +1,21 @@ +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .t5_for_text_generation import T5ForConditionalGeneration + +else: + _import_structure = { + 't5_for_text_generation': ['T5ForConditionalGeneration'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/T5/configuration_t5.py b/modelscope/models/nlp/T5/configuration_t5.py new file mode 100644 index 00000000..117a6bc1 --- /dev/null +++ b/modelscope/models/nlp/T5/configuration_t5.py @@ -0,0 +1,174 @@ +# Copyright 2020, The T5 Authors and HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" T5 model configuration""" +from typing import Mapping + +from transformers.configuration_utils import PretrainedConfig +from transformers.onnx import OnnxSeq2SeqConfigWithPast + +from modelscope.utils.logger import get_logger + +logger = get_logger(__name__) + + +class T5Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`T5Model`] or a [`TFT5Model`]. It is used to + instantiate a T5 model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the T5 + [t5-small](https://huggingface.co/t5-small) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Arguments: + vocab_size (`int`, *optional*, defaults to 32128): + Vocabulary size of the T5 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`T5Model`] or [`TFT5Model`]. + d_model (`int`, *optional*, defaults to 512): + Size of the encoder layers and the pooler layer. + d_kv (`int`, *optional*, defaults to 64): + Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model // + num_heads`. + d_ff (`int`, *optional*, defaults to 2048): + Size of the intermediate feed forward layer in each `T5Block`. + num_layers (`int`, *optional*, defaults to 6): + Number of hidden layers in the Transformer encoder. + num_decoder_layers (`int`, *optional*): + Number of hidden layers in the Transformer decoder. 
Will use the same value as `num_layers` if not set. + num_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + relative_attention_num_buckets (`int`, *optional*, defaults to 32): + The number of buckets to use for each attention layer. + relative_attention_max_distance (`int`, *optional*, defaults to 128): + The maximum distance of the longer sequences for the bucket separation. + dropout_rate (`float`, *optional*, defaults to 0.1): + The ratio for all dropout layers. + layer_norm_eps (`float`, *optional*, defaults to 1e-6): + The epsilon used by the layer normalization layers. + initializer_factor (`float`, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + feed_forward_proj (`string`, *optional*, defaults to `"relu"`): + Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`. T5v1.1 uses the + `"gated-gelu"` feed forward projection. Original T5 uses `"relu"`. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + """ + model_type = 't5' + keys_to_ignore_at_inference = ['past_key_values'] + attribute_map = { + 'hidden_size': 'd_model', + 'num_attention_heads': 'num_heads', + 'num_hidden_layers': 'num_layers' + } + + def __init__(self, + vocab_size=32128, + d_model=512, + d_kv=64, + d_ff=2048, + num_layers=6, + num_decoder_layers=None, + num_heads=8, + relative_attention_num_buckets=32, + relative_attention_max_distance=128, + dropout_rate=0.1, + layer_norm_epsilon=1e-6, + initializer_factor=1.0, + feed_forward_proj='relu', + is_encoder_decoder=True, + use_cache=True, + pad_token_id=0, + eos_token_id=1, + **kwargs): + self.vocab_size = vocab_size + self.d_model = d_model + self.d_kv = d_kv + self.d_ff = d_ff + self.num_layers = num_layers + self.num_decoder_layers = (num_decoder_layers if num_decoder_layers + is not None else self.num_layers + ) # default = symmetry + self.num_heads = num_heads + self.relative_attention_num_buckets = relative_attention_num_buckets + self.relative_attention_max_distance = relative_attention_max_distance + self.dropout_rate = dropout_rate + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_factor = initializer_factor + self.feed_forward_proj = feed_forward_proj + self.use_cache = use_cache + + act_info = self.feed_forward_proj.split('-') + self.dense_act_fn = act_info[-1] + self.is_gated_act = act_info[0] == 'gated' + + if len(act_info) > 1 and act_info[0] != 'gated' or len(act_info) > 2: + raise ValueError( + f'`feed_forward_proj`: {feed_forward_proj} is not a valid activation function of the dense layer.' + 'Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. 
' + "'gated-gelu' or 'relu'") + + # for backwards compatibility + if feed_forward_proj == 'gated-gelu': + self.dense_act_fn = 'gelu_new' + + super().__init__( + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + **kwargs, + ) + + +class T5OnnxConfig(OnnxSeq2SeqConfigWithPast): + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + common_inputs = { + 'input_ids': { + 0: 'batch', + 1: 'encoder_sequence' + }, + 'attention_mask': { + 0: 'batch', + 1: 'encoder_sequence' + }, + } + if self.use_past: + common_inputs['attention_mask'][ + 1] = 'past_encoder_sequence + sequence' + common_inputs['decoder_input_ids'] = {0: 'batch'} + common_inputs['decoder_attention_mask'] = { + 0: 'batch', + 1: 'past_decoder_sequence + sequence' + } + else: + common_inputs['decoder_input_ids'] = { + 0: 'batch', + 1: 'decoder_sequence' + } + common_inputs['decoder_attention_mask'] = { + 0: 'batch', + 1: 'decoder_sequence' + } + + if self.use_past: + self.fill_with_past_key_values_(common_inputs, direction='inputs') + + return common_inputs + + @property + def default_onnx_opset(self) -> int: + return 13 diff --git a/modelscope/models/nlp/T5/modeling_t5.py b/modelscope/models/nlp/T5/modeling_t5.py new file mode 100644 index 00000000..da50741e --- /dev/null +++ b/modelscope/models/nlp/T5/modeling_t5.py @@ -0,0 +1,2003 @@ +# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
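Editor's aside on the configuration file above: `T5Config.__init__` splits `feed_forward_proj` into a gating flag and an activation name, with a backwards-compatibility special case for `'gated-gelu'`. A minimal sketch of the resulting attributes follows; the direct submodule import path is an assumption based on the file layout added by this patch, since `T5/__init__.py` only lazily exposes `T5ForConditionalGeneration`.

```python
# Illustrative only -- not part of the patch.
from modelscope.models.nlp.T5.configuration_t5 import T5Config  # assumed import path

cfg = T5Config(feed_forward_proj='gated-gelu')
# act_info == ['gated', 'gelu'] -> gated activation; the backwards-compatibility
# branch then remaps the dense activation to 'gelu_new'.
assert cfg.is_gated_act and cfg.dense_act_fn == 'gelu_new'

cfg = T5Config(feed_forward_proj='relu')
assert not cfg.is_gated_act and cfg.dense_act_fn == 'relu'

# Any other multi-part value, e.g. 'swish-gated', raises the ValueError shown above.
```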
+""" PyTorch T5 model.""" + +import copy +import math +import os +import warnings +from typing import Optional, Tuple, Union + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss +from torch.utils.checkpoint import checkpoint +from transformers.activations import ACT2FN +from transformers.modeling_outputs import ( + BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, Seq2SeqModelOutput) +from transformers.modeling_utils import (PreTrainedModel, + find_pruneable_heads_and_indices, + prune_linear_layer) +from transformers.utils import (DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, + add_start_docstrings_to_model_forward, + is_torch_fx_proxy, replace_return_docstrings) +from transformers.utils.model_parallel_utils import (assert_device_map, + get_device_map) + +from modelscope.utils.logger import get_logger +from .configuration_t5 import T5Config + +logger = get_logger(__name__) + +_CONFIG_FOR_DOC = 'T5Config' +_TOKENIZER_FOR_DOC = 'T5Tokenizer' +_CHECKPOINT_FOR_DOC = 't5-small' + +#################################################### +# This dict contains ids and associated url +# for the pretrained weights provided with the models +#################################################### +T5_PRETRAINED_MODEL_ARCHIVE_LIST = [ + 't5-small', + 't5-base', + 't5-large', + 't5-3b', + 't5-11b', + # See all T5 models at https://huggingface.co/models?filter=t5 +] + + +#################################################### +# This is a conversion method from TF 1.0 to PyTorch +# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28 +#################################################### +def load_tf_weights_in_t5(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + 'Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see ' + 'https://www.tensorflow.org/install/ for installation instructions.' 
+ ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f'Converting TensorFlow checkpoint from {tf_path}') + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + tf_weights = {} + for name, shape in init_vars: + logger.info(f'Loading TF weight {name} with shape {shape}') + array = tf.train.load_variable(tf_path, name) + names.append(name) + tf_weights[name] = array + + for txt_name in names: + name = txt_name.split('/') + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any(n in [ + 'adam_v', 'adam_m', 'AdamWeightDecayOptimizer', + 'AdamWeightDecayOptimizer_1', 'global_step' + ] for n in name): + logger.info(f"Skipping {'/'.join(name)}") + tf_weights.pop(txt_name, None) + continue + if '_slot_' in name[-1]: + logger.info(f"Skipping {'/'.join(name)}") + tf_weights.pop(txt_name, None) + continue + pointer = model + array = tf_weights[txt_name] + + for m_name in name: + if re.fullmatch(r'[A-Za-z]+_\d+', m_name): + scope_names = re.split(r'_(\d+)', m_name) + else: + scope_names = [m_name] + if scope_names[0] in ['kernel', 'scale', 'embedding']: + pointer = getattr(pointer, 'weight') + elif scope_names[0] == 'self_attention': + pointer = getattr(pointer, 'layer') + pointer = pointer[0] + elif scope_names[0] == 'enc_dec_attention': + pointer = getattr(pointer, 'layer') + pointer = pointer[1] + elif scope_names[0] == 'dense_relu_dense': + pointer = getattr(pointer, 'layer') + pointer = pointer[2] + elif scope_names[0] == 'rms_norm': + if hasattr(pointer, 'layer_norm'): + pointer = getattr(pointer, 'layer_norm') + elif hasattr(pointer, 'final_layer_norm'): + pointer = getattr(pointer, 'final_layer_norm') + elif scope_names[0] == 'scale': + pointer = getattr(pointer, 'weight') + elif scope_names[0] == 'output_bias' or scope_names[0] == 'beta': + pointer = getattr(pointer, 'bias') + elif scope_names[0] == 'squad': + pointer = getattr(pointer, 'classifier') + elif scope_names[0] == 'decoder' and name[1] == 'logits': + continue + elif scope_names[0] == 'logits': + pointer = getattr(pointer, 'lm_head') + elif scope_names[0] == 'wi' and len( + scope_names) > 1 and scope_names[1].isdigit(): + pointer = getattr(pointer, f'wi_{scope_names[1]}') + continue + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info(f"Skipping {'/'.join(name)}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if scope_names[0] not in ['kernel', 'scale', 'embedding']: + pointer = getattr(pointer, 'weight') + if scope_names[0] != 'embedding': + logger.info( + f'Transposing numpy weight of shape {array.shape} for {name}') + array = np.transpose(array) + try: + assert ( + pointer.shape == array.shape + ), f'Pointer shape {pointer.shape} and array shape {array.shape} mismatched' + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info(f'Initialize PyTorch weight {name}') + pointer.data = torch.from_numpy(array.astype(np.float32)) + tf_weights.pop(txt_name, None) + + logger.info( + f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}." 
+ ) + return model + + +#################################################### +# PyTorch Models are constructed by sub-classing +# - torch.nn.Module for the layers and +# - PreTrainedModel for the models (it-self a sub-class of nn.Module) +#################################################### +PARALLELIZE_DOCSTRING = r""" + This is an experimental feature and is a subject to change at a moment's notice. + + Uses a device map to distribute attention modules of the model across several devices. If no device map is given, + it will evenly distribute blocks across all devices. + + Args: + device_map (`Dict[int, list]`, optional, defaults to None): + A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always + automatically mapped to the first device (for esoteric reasons). That means that the first device should + have fewer attention modules mapped to it than other devices. For reference, the t5 models have the + following number of attention modules: + + - t5-small: 6 + - t5-base: 12 + - t5-large: 24 + - t5-3b: 24 + - t5-11b: 24 + + Example: + + ```python + # Here is an example of a device map on a machine with 4 GPUs + # using t5-3b, which has a total of 24 attention modules: + model = T5ForConditionalGeneration.from_pretrained("t5-3b") + device_map = { + 0: [0, 1, 2], + 1: [3, 4, 5, 6, 7, 8, 9], + 2: [10, 11, 12, 13, 14, 15, 16], + 3: [17, 18, 19, 20, 21, 22, 23], + } + model.parallelize(device_map) + ``` +""" +DEPARALLELIZE_DOCSTRING = r""" + Moves the model to cpu from a model parallel state. + + Example: + + ```python + # On a 4 GPU machine with t5-3b: + model = T5ForConditionalGeneration.from_pretrained("t5-3b") + device_map = { + 0: [0, 1, 2], + 1: [3, 4, 5, 6, 7, 8, 9], + 2: [10, 11, 12, 13, 14, 15, 16], + 3: [17, 18, 19, 20, 21, 22, 23], + } + model.parallelize(device_map) # Splits the model across several devices + model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache() + ``` +""" + + +class T5LayerNorm(nn.Module): + + def __init__(self, hidden_size, eps=1e-6): + """ + Construct a layernorm module in the T5 style. No bias and no subtraction of mean. + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + + # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean + # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated + # w/o mean and there is no bias. 
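# (Editor's note, illustrative only) Concretely, for a feature vector x = [3.0, 4.0]
# the root mean square is sqrt((9 + 16) / 2) ≈ 3.536, so the lines below produce
# ≈ [0.849, 1.131] before the learned per-channel weight is applied.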
Additionally we want to make sure that the accumulation for + # half-precision inputs is done in fp32 + + variance = hidden_states.to(torch.float32).pow(2).mean( + -1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + + self.variance_epsilon) + + # convert into half-precision if necessary + if self.weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(self.weight.dtype) + + return self.weight * hidden_states + + +try: + from apex.normalization import FusedRMSNorm + + T5LayerNorm = FusedRMSNorm # noqa + + logger.info( + 'Discovered apex.normalization.FusedRMSNorm - will use it instead of T5LayerNorm' + ) +except ImportError: + # using the normal T5LayerNorm + pass +except Exception: + logger.warning( + 'discovered apex but it failed to load, falling back to T5LayerNorm') + pass + + +class T5DenseReluDense(nn.Module): + + def __init__(self, config: T5Config): + super().__init__() + self.wi = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward(self, hidden_states): + hidden_states = self.wi(hidden_states) + hidden_states = nn.functional.relu(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states + + +class T5DenseGatedGeluDense(nn.Module): + + def __init__(self, config: T5Config): + super().__init__() + self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) + self.dropout = nn.Dropout(config.dropout_rate) + self.gelu_act = ACT2FN['gelu_new'] + + def forward(self, hidden_states): + hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) + hidden_linear = self.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states + + +class T5LayerFF(nn.Module): + + def __init__(self, config: T5Config): + super().__init__() + if config.feed_forward_proj == 'relu': + self.DenseReluDense = T5DenseReluDense(config) + elif config.feed_forward_proj == 'gated-gelu': + self.DenseReluDense = T5DenseGatedGeluDense(config) + else: + raise ValueError( + f'{self.config.feed_forward_proj} is not supported. 
Choose between `relu` and `gated-gelu`' + ) + + self.layer_norm = T5LayerNorm( + config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward(self, hidden_states): + forwarded_states = self.layer_norm(hidden_states) + forwarded_states = self.DenseReluDense(forwarded_states) + hidden_states = hidden_states + self.dropout(forwarded_states) + return hidden_states + + +class T5Attention(nn.Module): + + def __init__(self, config: T5Config, has_relative_attention_bias=False): + super().__init__() + self.is_decoder = config.is_decoder + self.has_relative_attention_bias = has_relative_attention_bias + self.relative_attention_num_buckets = config.relative_attention_num_buckets + self.relative_attention_max_distance = config.relative_attention_max_distance + self.d_model = config.d_model + self.key_value_proj_dim = config.d_kv + self.n_heads = config.num_heads + self.dropout = config.dropout_rate + self.inner_dim = self.n_heads * self.key_value_proj_dim + + # Mesh TensorFlow initialization to avoid scaling before softmax + self.q = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.k = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.v = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.o = nn.Linear(self.inner_dim, self.d_model, bias=False) + + if self.has_relative_attention_bias: + self.relative_attention_bias = nn.Embedding( + self.relative_attention_num_buckets, self.n_heads) + self.pruned_heads = set() + self.gradient_checkpointing = False + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads) + # Prune linear layers + self.q = prune_linear_layer(self.q, index) + self.k = prune_linear_layer(self.k, index) + self.v = prune_linear_layer(self.v, index) + self.o = prune_linear_layer(self.o, index, dim=1) + # Update hyper params + self.n_heads = self.n_heads - len(heads) + self.inner_dim = self.key_value_proj_dim * self.n_heads + self.pruned_heads = self.pruned_heads.union(heads) + + @staticmethod + def _relative_position_bucket(relative_position, + bidirectional=True, + num_buckets=32, + max_distance=128): + """ + Adapted from Mesh Tensorflow: + https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 + + Translate relative position to a bucket number for relative attention. The relative position is defined as + memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to + position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for + small absolute relative_position and larger buckets for larger absolute relative_positions. All relative + positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. 
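For example, with the defaults (num_buckets=32, max_distance=128) in the bidirectional case the buckets are split into 16 per direction: distances with absolute value below 8 each get their own exact bucket, while larger distances up to max_distance share logarithmically sized buckets.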
+ This should allow for more graceful generalization to longer sequences than the model has been trained on + + Args: + relative_position: an int32 Tensor + bidirectional: a boolean - whether the attention is bidirectional + num_buckets: an integer + max_distance: an integer + + Returns: + a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) + """ + relative_buckets = 0 + if bidirectional: + num_buckets //= 2 + relative_buckets += (relative_position > 0).to( + torch.long) * num_buckets + relative_position = torch.abs(relative_position) + else: + relative_position = -torch.min(relative_position, + torch.zeros_like(relative_position)) + # now relative_position is in the range [0, inf) + + # half of the buckets are for exact increments in positions + max_exact = num_buckets // 2 + is_small = relative_position < max_exact + + # The other half of the buckets are for logarithmically bigger bins in + # positions up to max_distance + relateive_pos_log = torch.log(relative_position.float() / max_exact) + max_dis_log = math.log(max_distance / max_exact) + origin_relative_position = relateive_pos_log / max_dis_log * ( + num_buckets - max_exact) + relative_postion_if_large = max_exact + origin_relative_position.to( + torch.long) + relative_postion_if_large = torch.min( + relative_postion_if_large, + torch.full_like(relative_postion_if_large, num_buckets - 1)) + + relative_buckets += torch.where(is_small, relative_position, + relative_postion_if_large) + return relative_buckets + + def compute_bias(self, query_length, key_length): + """Compute binned relative position bias""" + context_position = torch.arange( + query_length, + dtype=torch.long, + device=self.relative_attention_bias.weight.device)[:, None] + memory_position = torch.arange( + key_length, + dtype=torch.long, + device=self.relative_attention_bias.weight.device)[None, :] + relative_position = memory_position - context_position # shape (query_length, key_length) + relative_position_bucket = self._relative_position_bucket( + relative_position, # shape (query_length, key_length) + bidirectional=(not self.is_decoder), + num_buckets=self.relative_attention_num_buckets, + max_distance=self.relative_attention_max_distance, + ) + values = self.relative_attention_bias( + relative_position_bucket + ) # shape (query_length, key_length, num_heads) + values = values.permute([2, 0, 1]).unsqueeze( + 0) # shape (1, num_heads, query_length, key_length) + return values + + def forward( + self, + hidden_states, + mask=None, + key_value_states=None, + position_bias=None, + past_key_value=None, + layer_head_mask=None, + query_length=None, + use_cache=False, + output_attentions=False, + ): + """ + Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). + """ + # Input is (batch_size, seq_length, dim) + # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) + # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) + batch_size, seq_length = hidden_states.shape[:2] + + real_seq_length = seq_length + + if past_key_value is not None: + assert ( + len(past_key_value) == 2 + ), f'past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states' + real_seq_length += past_key_value[0].shape[ + 2] if query_length is None else query_length + + key_length = real_seq_length if key_value_states is None else key_value_states.shape[ + 1] + + def shape(states): + """projection""" + return states.view(batch_size, -1, self.n_heads, + self.key_value_proj_dim).transpose(1, 2) + + def unshape(states): + """reshape""" + return states.transpose(1, 2).contiguous().view( + batch_size, -1, self.inner_dim) + + def project(hidden_states, proj_layer, key_value_states, + past_key_value): + """projects hidden states correctly to key/query states""" + if key_value_states is None: + # self-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(hidden_states)) + elif past_key_value is None: + # cross-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(key_value_states)) + + if past_key_value is not None: + if key_value_states is None: + # self-attn + # (batch_size, n_heads, key_length, dim_per_head) + hidden_states = torch.cat([past_key_value, hidden_states], + dim=2) + else: + # cross-attn + hidden_states = past_key_value + return hidden_states + + # get query states + query_states = shape(self.q( + hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) + + # get key/value states + key_states = project( + hidden_states, self.k, key_value_states, + past_key_value[0] if past_key_value is not None else None) + value_states = project( + hidden_states, self.v, key_value_states, + past_key_value[1] if past_key_value is not None else None) + + # compute scores + scores = torch.matmul( + query_states, key_states.transpose(3, 2) + ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + + if position_bias is None: + if not self.has_relative_attention_bias: + position_bias = torch.zeros( + (1, self.n_heads, real_seq_length, key_length), + device=scores.device, + dtype=scores.dtype) + if self.gradient_checkpointing and self.training: + position_bias.requires_grad = True + else: + position_bias = self.compute_bias(real_seq_length, key_length) + + # if key and values are already calculated + # we want only the last query position bias + if past_key_value is not None: + position_bias = position_bias[:, :, -hidden_states.size(1):, :] + + if mask is not None: + position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) + + scores += position_bias + attn_weights = nn.functional.softmax( + scores.float(), dim=-1).type_as( + scores) # (batch_size, n_heads, seq_length, key_length) + attn_weights = nn.functional.dropout( + attn_weights, p=self.dropout, training=self.training + ) # (batch_size, n_heads, seq_length, key_length) + + # Mask heads if we want to + if layer_head_mask is not None: + attn_weights = attn_weights * layer_head_mask + + attn_output = unshape(torch.matmul( + attn_weights, value_states)) # (batch_size, seq_length, dim) + attn_output = self.o(attn_output) + + present_key_value_state = (key_states, + value_states) if (self.is_decoder + and use_cache) else None + outputs = (attn_output, ) + (present_key_value_state, ) + ( + position_bias, ) + + if output_attentions: + outputs = outputs + (attn_weights, ) + return outputs + + +class T5LayerSelfAttention(nn.Module): + + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.SelfAttention = T5Attention( + config, has_relative_attention_bias=has_relative_attention_bias) + 
self.layer_norm = T5LayerNorm( + config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + layer_head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + ): + normed_hidden_states = self.layer_norm(hidden_states) + attention_output = self.SelfAttention( + normed_hidden_states, + mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + self.dropout(attention_output[0]) + outputs = (hidden_states, + ) + attention_output[1:] # add attentions if we output them + return outputs + + +class T5LayerCrossAttention(nn.Module): + + def __init__(self, config): + super().__init__() + self.EncDecAttention = T5Attention( + config, has_relative_attention_bias=False) + self.layer_norm = T5LayerNorm( + config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + hidden_states, + key_value_states, + attention_mask=None, + position_bias=None, + layer_head_mask=None, + past_key_value=None, + use_cache=False, + query_length=None, + output_attentions=False, + ): + normed_hidden_states = self.layer_norm(hidden_states) + attention_output = self.EncDecAttention( + normed_hidden_states, + mask=attention_mask, + key_value_states=key_value_states, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=past_key_value, + use_cache=use_cache, + query_length=query_length, + output_attentions=output_attentions, + ) + layer_output = hidden_states + self.dropout(attention_output[0]) + outputs = (layer_output, + ) + attention_output[1:] # add attentions if we output them + return outputs + + +class T5Block(nn.Module): + + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.is_decoder = config.is_decoder + self.layer = nn.ModuleList() + self.layer.append( + T5LayerSelfAttention( + config, + has_relative_attention_bias=has_relative_attention_bias)) + if self.is_decoder: + self.layer.append(T5LayerCrossAttention(config)) + + self.layer.append(T5LayerFF(config)) + + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + layer_head_mask=None, + cross_attn_layer_head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, + ): + + if past_key_value is not None: + if not self.is_decoder: + logger.warning( + '`past_key_values` is passed to the encoder. Please make sure this is intended.' + ) + expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 + + if len(past_key_value) != expected_num_past_key_values: + raise ValueError( + f'There should be {expected_num_past_key_values} past states. ' + f"{'2 (past / key) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" + f'Got {len(past_key_value)} past key / value states') + + self_attn_past_key_value = past_key_value[:2] + cross_attn_past_key_value = past_key_value[2:] + else: + self_attn_past_key_value, cross_attn_past_key_value = None, None + + self_attention_outputs = self.layer[0]( + hidden_states, + attention_mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=self_attn_past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states, present_key_value_state = self_attention_outputs[:2] + attention_outputs = self_attention_outputs[ + 2:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf( + hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp( + hidden_states, min=-clamp_value, max=clamp_value) + + do_cross_attention = self.is_decoder and encoder_hidden_states is not None + if do_cross_attention: + # the actual query length is unknown for cross attention + # if using past key value states. Need to inject it here + if present_key_value_state is not None: + query_length = present_key_value_state[0].shape[2] + else: + query_length = None + + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + query_length=query_length, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = cross_attention_outputs[0] + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf( + hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp( + hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states + if present_key_value_state is not None: + present_key_value_state = present_key_value_state + cross_attention_outputs[ + 1] + + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[2:] + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf( + hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp( + hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states, ) + + if use_cache: + outputs = outputs + (present_key_value_state, ) + attention_outputs + else: + outputs = outputs + attention_outputs + + # hidden-states, present_key_value_states, (self-attention position + # bias), (self-attention weights), (cross-attention position bias), + # (cross-attention weights) + return outputs + + +class T5PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface + for downloading and loading pretrained models. 
+ """ + + config_class = T5Config + load_tf_weights = load_tf_weights_in_t5 + base_model_prefix = 'transformer' + is_parallelizable = True + supports_gradient_checkpointing = True + + @property + def dummy_inputs(self): + input_ids = torch.tensor(DUMMY_INPUTS) + input_mask = torch.tensor(DUMMY_MASK) + dummy_inputs = { + 'decoder_input_ids': input_ids, + 'input_ids': input_ids, + 'decoder_attention_mask': input_mask, + } + return dummy_inputs + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_factor # Used for testing weights initialization + if isinstance(module, T5LayerNorm): + module.weight.data.fill_(factor * 1.0) + elif isinstance(module, + (T5Model, T5ForConditionalGeneration, T5EncoderModel)): + # Mesh TensorFlow embeddings initialization See + # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 + module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0) + elif isinstance(module, T5DenseReluDense): + # Mesh TensorFlow FF initialization See + # https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56 + # and + # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89 + module.wi.weight.data.normal_( + mean=0.0, std=factor * ((self.config.d_model)**-0.5)) + if hasattr(module.wi, 'bias') and module.wi.bias is not None: + module.wi.bias.data.zero_() + module.wo.weight.data.normal_( + mean=0.0, std=factor * ((self.config.d_ff)**-0.5)) + if hasattr(module.wo, 'bias') and module.wo.bias is not None: + module.wo.bias.data.zero_() + elif isinstance(module, T5DenseGatedGeluDense): + module.wi_0.weight.data.normal_( + mean=0.0, std=factor * ((self.config.d_model)**-0.5)) + if hasattr(module.wi_0, 'bias') and module.wi_0.bias is not None: + module.wi_0.bias.data.zero_() + module.wi_1.weight.data.normal_( + mean=0.0, std=factor * ((self.config.d_model)**-0.5)) + if hasattr(module.wi_1, 'bias') and module.wi_1.bias is not None: + module.wi_1.bias.data.zero_() + module.wo.weight.data.normal_( + mean=0.0, std=factor * ((self.config.d_ff)**-0.5)) + if hasattr(module.wo, 'bias') and module.wo.bias is not None: + module.wo.bias.data.zero_() + elif isinstance(module, T5Attention): + # Mesh TensorFlow attention initialization to avoid scaling before + # softmax See + # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136 + d_model = self.config.d_model + key_value_proj_dim = self.config.d_kv + n_heads = self.config.num_heads + module.q.weight.data.normal_( + mean=0.0, std=factor * ((d_model * key_value_proj_dim)**-0.5)) + module.k.weight.data.normal_( + mean=0.0, std=factor * (d_model**-0.5)) + module.v.weight.data.normal_( + mean=0.0, std=factor * (d_model**-0.5)) + module.o.weight.data.normal_( + mean=0.0, std=factor * ((n_heads * key_value_proj_dim)**-0.5)) + if module.has_relative_attention_bias: + module.relative_attention_bias.weight.data.normal_( + mean=0.0, std=factor * ((d_model)**-0.5)) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (T5Attention, T5Stack)): + module.gradient_checkpointing = value + + def _shift_right(self, input_ids): + decoder_start_token_id = self.config.decoder_start_token_id + pad_token_id = self.config.pad_token_id + + assert ( + decoder_start_token_id is not None + ), 'self.model.config.decoder_start_token_id has to be defined.' 
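# (Editor's note, illustrative only) Given labels [[a, b, c]] and decoder start id s,
# the shift below yields [[s, a, b]]; any -100 ignore-index values in the shifted
# ids are then replaced by pad_token_id.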
+ + # shift inputs to the right + if is_torch_fx_proxy(input_ids): + # Item assignment is not supported natively for proxies. + shifted_input_ids = torch.full(input_ids.shape[:-1] + (1, ), + decoder_start_token_id) + shifted_input_ids = torch.cat( + [shifted_input_ids, input_ids[..., :-1]], dim=-1) + else: + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = decoder_start_token_id + + assert pad_token_id is not None, 'self.model.config.pad_token_id has to be defined.' + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + assert torch.all(shifted_input_ids >= 0).item( + ), 'Verify that `shifted_input_ids` has only positive values' + + return shifted_input_ids + + +class T5Stack(T5PreTrainedModel): + + def __init__(self, config, embed_tokens=None): + super().__init__(config) + + self.embed_tokens = embed_tokens + self.is_decoder = config.is_decoder + + self.block = nn.ModuleList([ + T5Block(config, has_relative_attention_bias=bool(i == 0)) + for i in range(config.num_layers) + ]) + self.final_layer_norm = T5LayerNorm( + config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + # Initialize weights and apply final processing + self.post_init() + # Model parallel + self.model_parallel = False + self.device_map = None + self.gradient_checkpointing = False + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + # Check validity of device_map + self.device_map = ( + get_device_map(len(self.block), range(torch.cuda.device_count())) + if device_map is None else device_map) + assert_device_map(self.device_map, len(self.block)) + self.model_parallel = True + self.first_device = 'cpu' if 'cpu' in self.device_map.keys( + ) else 'cuda:' + str(min(self.device_map.keys())) + self.last_device = 'cuda:' + str(max(self.device_map.keys())) + # Load onto devices + for k, v in self.device_map.items(): + for layer in v: + cuda_device = 'cuda:' + str(k) + self.block[layer] = self.block[layer].to(cuda_device) + + # Set embed_tokens to first layer + self.embed_tokens = self.embed_tokens.to(self.first_device) + # Set final layer norm to last device + self.final_layer_norm = self.final_layer_norm.to(self.last_device) + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def deparallelize(self): + self.model_parallel = False + self.device_map = None + self.first_device = 'cpu' + self.last_device = 'cpu' + for i in range(len(self.block)): + self.block[i] = self.block[i].to('cpu') + self.embed_tokens = self.embed_tokens.to('cpu') + self.final_layer_norm = self.final_layer_norm.to('cpu') + torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, new_embeddings): + self.embed_tokens = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + inputs_embeds=None, + head_mask=None, + cross_attn_head_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + # Model parallel + if self.model_parallel: + torch.cuda.set_device(self.first_device) + self.embed_tokens = self.embed_tokens.to(self.first_device) + use_cache = use_cache if use_cache is not None else self.config.use_cache + output_attentions = output_attentions if output_attentions is not None 
else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + err_msg_prefix = 'decoder_' if self.is_decoder else '' + raise ValueError( + f'You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time' + ) + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + err_msg_prefix = 'decoder_' if self.is_decoder else '' + raise ValueError( + f'You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds' + ) + + if inputs_embeds is None: + assert self.embed_tokens is not None, 'You have to initialize the model with valid token embeddings' + inputs_embeds = self.embed_tokens(input_ids) + + batch_size, seq_length = input_shape + + # required mask seq length can be calculated via length of past + mask_seq_length = past_key_values[0][0].shape[ + 2] + seq_length if past_key_values is not None else seq_length + + if use_cache is True: + assert self.is_decoder, f'`use_cache` can only be set to `True` if {self} is used as a decoder' + + if attention_mask is None: + attention_mask = torch.ones(batch_size, mask_seq_length).to( + inputs_embeds.device) + if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None: + encoder_seq_length = encoder_hidden_states.shape[1] + encoder_attention_mask = torch.ones( + batch_size, + encoder_seq_length, + device=inputs_embeds.device, + dtype=torch.long) + + # initialize past_key_values with `None` if past does not exist + if past_key_values is None: + past_key_values = [None] * len(self.block) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
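# (Editor's note) get_extended_attention_mask is inherited from the transformers
# base model: it expands a [batch_size, seq_length] 0/1 mask into a broadcastable
# [batch_size, 1, 1, seq_length] float mask with 0.0 for visible positions and a
# large negative value for masked ones (and for decoder stacks it also folds in the
# causal mask), so it can simply be added to the attention scores.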
+ extended_attention_mask = self.get_extended_attention_mask( + attention_mask, input_shape, inputs_embeds.device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( + ) + encoder_hidden_shape = (encoder_batch_size, + encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=inputs_embeds.device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, + self.config.num_layers) + present_key_value_states = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions + and self.is_decoder) else None + position_bias = None + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) + + for i, (layer_module, + past_key_value) in enumerate(zip(self.block, past_key_values)): + layer_head_mask = head_mask[i] + cross_attn_layer_head_mask = cross_attn_head_mask[i] + # Model parallel + if self.model_parallel: + torch.cuda.set_device(hidden_states.device) + # Ensure that attention_mask is always on the same device as hidden_states + if attention_mask is not None: + attention_mask = attention_mask.to(hidden_states.device) + if position_bias is not None: + position_bias = position_bias.to(hidden_states.device) + if encoder_hidden_states is not None: + encoder_hidden_states = encoder_hidden_states.to( + hidden_states.device) + if encoder_extended_attention_mask is not None: + encoder_extended_attention_mask = encoder_extended_attention_mask.to( + hidden_states.device) + if encoder_decoder_position_bias is not None: + encoder_decoder_position_bias = encoder_decoder_position_bias.to( + hidden_states.device) + if layer_head_mask is not None: + layer_head_mask = layer_head_mask.to(hidden_states.device) + if cross_attn_layer_head_mask is not None: + cross_attn_layer_head_mask = cross_attn_layer_head_mask.to( + hidden_states.device) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning( + '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' 
+ ) + use_cache = False + + def create_custom_forward(module): + + def custom_forward(*inputs): + return tuple( + module(*inputs, use_cache, output_attentions)) + + return custom_forward + + layer_outputs = checkpoint( + create_custom_forward(layer_module), + hidden_states, + extended_attention_mask, + position_bias, + encoder_hidden_states, + encoder_extended_attention_mask, + encoder_decoder_position_bias, + layer_head_mask, + cross_attn_layer_head_mask, + None, # past_key_value is always None with gradient checkpointing + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask=extended_attention_mask, + position_bias=position_bias, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + encoder_decoder_position_bias=encoder_decoder_position_bias, + layer_head_mask=layer_head_mask, + cross_attn_layer_head_mask=cross_attn_layer_head_mask, + past_key_value=past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + + # layer_outputs is a tuple with: hidden-states, key-value-states, + # (self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) + if use_cache is False: + layer_outputs = layer_outputs[:1] + ( + None, ) + layer_outputs[1:] + + hidden_states, present_key_value_state = layer_outputs[:2] + + # We share the position biases between the layers - the first layer + # store them layer_outputs = hidden-states, key-value-states + # (self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) + position_bias = layer_outputs[2] + if self.is_decoder and encoder_hidden_states is not None: + encoder_decoder_position_bias = layer_outputs[ + 4 if output_attentions else 3] + # append next layer key value states + if use_cache: + present_key_value_states = present_key_value_states + ( + present_key_value_state, ) + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[3], ) + if self.is_decoder: + all_cross_attentions = all_cross_attentions + ( + layer_outputs[5], ) + + # Model Parallel: If it's the last layer for that device, put things on the next device + if self.model_parallel: + for k, v in self.device_map.items(): + if i == v[-1] and 'cuda:' + str(k) != self.last_device: + hidden_states = hidden_states.to('cuda:' + str(k + 1)) + + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + if not return_dict: + return tuple(v for v in [ + hidden_states, + present_key_value_states, + all_hidden_states, + all_attentions, + all_cross_attentions, + ] if v is not None) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=present_key_value_states, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) + + +T5_START_DOCSTRING = r""" + + The T5 model was proposed in [Exploring the Limits of Transfer Learning with + a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by + Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, + Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder + transformer pre-trained in a text-to-text denoising generative setting. + + This model inherits from [`PreTrainedModel`]. 
Check the superclass + documentation for the generic methods the library implements for all its + model (such as downloading or saving, resizing the input embeddings, pruning + heads etc.) + + This model is also a PyTorch + [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch + documentation for all matter related to general usage and behavior. + + Parameters: + config ([`T5Config`]): Model configuration class with all the parameters + of the model. + Initializing with a config file does not load the weights associated + with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model + weights. +""" + +T5_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. T5 is a model + with relative position embeddings so you should be able to pad the + inputs on both the right and the left. + + Indices can be obtained using [`T5Tokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] + for detail. + + [What are input IDs?](../glossary#input-ids) + + To know more on how to prepare `input_ids` for pretraining take a + look a [T5 Training](./t5#training). + attention_mask (`torch.FloatTensor` of shape `(batch_size, + sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask + values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, + target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`T5Tokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] + for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + T5 uses the `pad_token_id` as the starting token for + `decoder_input_ids` generation. If `past_key_values` is used, + optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + To know more on how to prepare `decoder_input_ids` for pretraining + take a look at [T5 Training](./t5#training). + decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, + target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in + `decoder_input_ids`. Causal mask will also be used by default. + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, + num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules in the + encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or + `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules in the + decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or + `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in + the decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*, + `optional`: *attentions*) `last_hidden_state` of shape `(batch_size, + sequence_length, hidden_size)` is a sequence of hidden states at the + output of the last layer of the encoder. Used in the cross-attention + of the decoder. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length + `config.n_layers` with each tuple having 4 tensors of shape + `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention + blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only + the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead + of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to + directly pass an embedded representation. This is useful if you want + more control over how to convert `input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, + target_sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `decoder_input_ids` you can choose to + directly pass an embedded representation. If `past_key_values` is + used, optionally only the last `decoder_inputs_embeds` have to be + input (see `past_key_values`). This is useful if you want more + control over how to convert `decoder_input_ids` indices into + associated vectors than the model's internal embedding lookup + matrix. + + If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, + `decoder_inputs_embeds` takes the value of `inputs_embeds`. + + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned + and can be used to speed up decoding (see `past_key_values`). + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention + layers. See `attentions` under returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See + `hidden_states` under returned tensors for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain + tuple. +""" + +T5_ENCODER_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. T5 is a model + with relative position embeddings so you should be able to pad the + inputs on both the right and the left. + + Indices can be obtained using [`T5Tokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] + for detail. + + To know more on how to prepare `input_ids` for pretraining take a + look a [T5 Training](./t5#training). + attention_mask (`torch.FloatTensor` of shape `(batch_size, + sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask + values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, + num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask + values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to + directly pass an embedded representation. This is useful if you want + more control over how to convert `input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention + layers. See `attentions` under returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See + `hidden_states` under returned tensors for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain + tuple. +""" + +# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask +__HEAD_MASK_WARNING_MSG = """ +The input argument `head_mask` was split into two arguments `head_mask` and +`decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`, +but this feature is deprecated and will be removed in future versions. If you do +not want to use any `decoder_head_mask` now, please set `decoder_head_mask = +torch.ones(num_layers, num_heads)`. +""" + + +@add_start_docstrings( + 'The bare T5 Model transformer outputting raw hidden-states without any specific head on top.', + T5_START_DOCSTRING, +) +class T5Model(T5PreTrainedModel): + _keys_to_ignore_on_load_missing = [ + r'encoder\.embed_tokens\.weight', + r'decoder\.embed_tokens\.weight', + ] + _keys_to_ignore_on_load_unexpected = [ + r'decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight', + ] + + def __init__(self, config: T5Config): + super().__init__(config) + self.shared = nn.Embedding(config.vocab_size, config.d_model) + + encoder_config = copy.deepcopy(config) + encoder_config.is_decoder = False + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = T5Stack(encoder_config, self.shared) + + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + decoder_config.is_encoder_decoder = False + decoder_config.num_layers = config.num_decoder_layers + self.decoder = T5Stack(decoder_config, self.shared) + + # Initialize weights and apply final processing + self.post_init() + + # Model parallel + self.model_parallel = False + self.device_map = None + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + self.device_map = ( + get_device_map( + len(self.encoder.block), range(torch.cuda.device_count())) + if device_map is None else device_map) + assert_device_map(self.device_map, len(self.encoder.block)) + self.encoder.parallelize(self.device_map) + self.decoder.parallelize(self.device_map) + self.model_parallel = True + + @add_start_docstrings(DEPARALLELIZE_DOCSTRING) + def deparallelize(self): + self.encoder.deparallelize() + self.decoder.deparallelize() + self.encoder = self.encoder.to('cpu') + self.decoder = self.decoder.to('cpu') + self.model_parallel = False + self.device_map = None + 
torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.set_input_embeddings(new_embeddings) + self.decoder.set_input_embeddings(new_embeddings) + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of + heads to prune in this layer} See base class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) + @replace_return_docstrings( + output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + decoder_head_mask: Optional[torch.FloatTensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + decoder_inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]: + r""" + Returns: + + Example: + + ```python >>> from transformers import T5Tokenizer, T5Model + + >>> tokenizer = T5Tokenizer.from_pretrained("t5-small") + >>> model = T5Model.from_pretrained("t5-small") + + >>> input_ids = tokenizer( + ... 
"Studies have been shown that owning a dog is good for you", return_tensors="pt" + >>> ).input_ids # Batch size 1 + >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 + + >>> # forward pass + >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask + if head_mask is not None and decoder_head_mask is None: + if self.config.num_layers == self.config.num_decoder_layers: + warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) + decoder_head_mask = head_mask + + # Encode if needed (training, first prediction pass) + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] + if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] + if len(encoder_outputs) > 2 else None, + ) + + hidden_states = encoder_outputs[0] + if self.model_parallel: + torch.cuda.set_device(self.decoder.first_device) + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.decoder.first_device) + hidden_states = hidden_states.to(self.decoder.first_device) + if decoder_input_ids is not None: + decoder_input_ids = decoder_input_ids.to( + self.decoder.first_device) + if attention_mask is not None: + attention_mask = attention_mask.to(self.decoder.first_device) + if decoder_attention_mask is not None: + decoder_attention_mask = decoder_attention_mask.to( + self.decoder.first_device) + + # Decode + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + inputs_embeds=decoder_inputs_embeds, + past_key_values=past_key_values, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings("""T5 Model with a `language modeling` head on top.""", + T5_START_DOCSTRING) +class T5ForConditionalGeneration(T5PreTrainedModel): + _keys_to_ignore_on_load_missing = [ + r'encoder\.embed_tokens\.weight', + r'decoder\.embed_tokens\.weight', + r'lm_head\.weight', + ] + _keys_to_ignore_on_load_unexpected = [ + 
r'decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight', + ] + + def __init__(self, config: T5Config): + super().__init__(config) + self.model_dim = config.d_model + + self.shared = nn.Embedding(config.vocab_size, config.d_model) + + encoder_config = copy.deepcopy(config) + encoder_config.is_decoder = False + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = T5Stack(encoder_config, self.shared) + + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + decoder_config.is_encoder_decoder = False + decoder_config.num_layers = config.num_decoder_layers + self.decoder = T5Stack(decoder_config, self.shared) + + self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + # Model parallel + self.model_parallel = False + self.device_map = None + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + self.device_map = ( + get_device_map( + len(self.encoder.block), range(torch.cuda.device_count())) + if device_map is None else device_map) + assert_device_map(self.device_map, len(self.encoder.block)) + self.encoder.parallelize(self.device_map) + self.decoder.parallelize(self.device_map) + self.lm_head = self.lm_head.to(self.decoder.first_device) + self.model_parallel = True + + @add_start_docstrings(DEPARALLELIZE_DOCSTRING) + def deparallelize(self): + self.encoder.deparallelize() + self.decoder.deparallelize() + self.encoder = self.encoder.to('cpu') + self.decoder = self.decoder.to('cpu') + self.lm_head = self.lm_head.to('cpu') + self.model_parallel = False + self.device_map = None + torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.set_input_embeddings(new_embeddings) + self.decoder.set_input_embeddings(new_embeddings) + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def get_output_embeddings(self): + return self.lm_head + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) + @replace_return_docstrings( + output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + decoder_head_mask: Optional[torch.FloatTensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. + Indices should be in `[-100, 0, ..., config.vocab_size - 1]`. 
All
+            labels set to `-100` are ignored (masked), the loss is only computed
+            for labels in `[0, ..., config.vocab_size]`
+
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+        >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        >>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+        >>> # training
+        >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
+        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
+        >>> outputs = model(input_ids=input_ids, labels=labels)
+        >>> loss = outputs.loss
+        >>> logits = outputs.logits
+
+        >>> # inference
+        >>> input_ids = tokenizer(
+        ...     "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
+        >>> ).input_ids  # Batch size 1
+        >>> outputs = model.generate(input_ids)
+        >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+        >>> # studies have shown that owning a dog is good for you.
+        ```"""
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
+        if head_mask is not None and decoder_head_mask is None:
+            if self.config.num_layers == self.config.num_decoder_layers:
+                warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
+                decoder_head_mask = head_mask
+
+        # Encode if needed (training, first prediction pass)
+        if encoder_outputs is None:
+            # Convert encoder inputs in embeddings if needed
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                head_mask=head_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1]
+                if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2]
+                if len(encoder_outputs) > 2 else None,
+            )
+
+        hidden_states = encoder_outputs[0]
+
+        if self.model_parallel:
+            torch.cuda.set_device(self.decoder.first_device)
+
+        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
+            # get decoder inputs from shifting lm labels to the right
+            decoder_input_ids = self._shift_right(labels)
+
+        # Set device for model parallelism
+        if self.model_parallel:
+            torch.cuda.set_device(self.decoder.first_device)
+            hidden_states = hidden_states.to(self.decoder.first_device)
+            if decoder_input_ids is not None:
+                decoder_input_ids = decoder_input_ids.to(
+                    self.decoder.first_device)
+            if attention_mask is not None:
+                attention_mask = attention_mask.to(self.decoder.first_device)
+            if decoder_attention_mask is not None:
+                decoder_attention_mask = decoder_attention_mask.to(
+                    self.decoder.first_device)
+
+        # Decode
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            inputs_embeds=decoder_inputs_embeds,
+            past_key_values=past_key_values,
+            encoder_hidden_states=hidden_states,
+            encoder_attention_mask=attention_mask,
+            head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = decoder_outputs[0]
+
+        # Set device for model
parallelism + if self.model_parallel: + torch.cuda.set_device(self.encoder.first_device) + self.lm_head = self.lm_head.to(self.encoder.first_device) + sequence_output = sequence_output.to(self.lm_head.weight.device) + + if self.config.tie_word_embeddings: + # Rescale output before projecting on vocab See + # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 + sequence_output = sequence_output * (self.model_dim**-0.5) + + lm_logits = self.lm_head(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss(ignore_index=-100) + loss = loss_fct( + lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) + # TODO(thom): Add z_loss + # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 + + if not return_dict: + output = (lm_logits, ) + decoder_outputs[1:] + encoder_outputs + return ((loss, ) + output) if loss is not None else output + + return Seq2SeqLMOutput( + loss=loss, + logits=lm_logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + def prepare_inputs_for_generation(self, + input_ids, + past=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs): + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + 'decoder_input_ids': input_ids, + 'past_key_values': past, + 'encoder_outputs': encoder_outputs, + 'attention_mask': attention_mask, + 'head_mask': head_mask, + 'decoder_head_mask': decoder_head_mask, + 'cross_attn_head_mask': cross_attn_head_mask, + 'use_cache': use_cache, + } + + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return self._shift_right(labels) + + def _reorder_cache(self, past, beam_idx): + # if decoder past is not included in output + # speedy decoding is disabled and no need to reorder + if past is None: + logger.warning( + 'You might want to consider setting `use_cache=True` to speed up decoding' + ) + return past + + reordered_decoder_past = () + for layer_past_states in past: + # get the correct batch idx from layer past batch dim + # batch dim of `past` is at 2nd position + reordered_layer_past_states = () + for layer_past_state in layer_past_states: + # need to set correct `past` for each of the four key / value states + reordered_layer_past_states = reordered_layer_past_states + ( + layer_past_state.index_select( + 0, beam_idx.to(layer_past_state.device)), ) + + assert reordered_layer_past_states[0].shape == layer_past_states[ + 0].shape + assert len(reordered_layer_past_states) == len(layer_past_states) + + reordered_decoder_past = reordered_decoder_past + ( + reordered_layer_past_states, ) + return reordered_decoder_past + + +@add_start_docstrings( + "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.", + T5_START_DOCSTRING, +) +class T5EncoderModel(T5PreTrainedModel): + authorized_missing_keys = [ + r'encoder\.embed_tokens\.weight', + ] + + def __init__(self, config: T5Config): + super().__init__(config) + self.shared = 
nn.Embedding(config.vocab_size, config.d_model)
+
+        encoder_config = copy.deepcopy(config)
+        encoder_config.use_cache = False
+        encoder_config.is_encoder_decoder = False
+        self.encoder = T5Stack(encoder_config, self.shared)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+
+    @add_start_docstrings(PARALLELIZE_DOCSTRING)
+    def parallelize(self, device_map=None):
+        self.device_map = (
+            get_device_map(
+                len(self.encoder.block), range(torch.cuda.device_count()))
+            if device_map is None else device_map)
+        assert_device_map(self.device_map, len(self.encoder.block))
+        self.encoder.parallelize(self.device_map)
+        self.model_parallel = True
+
+    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
+    def deparallelize(self):
+        self.encoder.deparallelize()
+        self.encoder = self.encoder.to('cpu')
+        self.model_parallel = False
+        self.device_map = None
+        torch.cuda.empty_cache()
+
+    def get_input_embeddings(self):
+        return self.shared
+
+    def set_input_embeddings(self, new_embeddings):
+        self.shared = new_embeddings
+        self.encoder.set_input_embeddings(new_embeddings)
+
+    def get_encoder(self):
+        return self.encoder
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of
+        heads to prune in this layer} See base class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @add_start_docstrings_to_model_forward(T5_ENCODER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(
+        output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]:
+        r"""
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import T5Tokenizer, T5EncoderModel
+
+        >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        >>> model = T5EncoderModel.from_pretrained("t5-small")
+        >>> input_ids = tokenizer(
+        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
+        >>> ).input_ids  # Batch size 1
+        >>> outputs = model(input_ids=input_ids)
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        encoder_outputs = self.encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        return encoder_outputs
diff --git a/modelscope/models/nlp/T5/t5_for_text_generation.py b/modelscope/models/nlp/T5/t5_for_text_generation.py
new file mode 100644
index 00000000..27f077d8
--- /dev/null
+++ b/modelscope/models/nlp/T5/t5_for_text_generation.py
@@ -0,0 +1,56 @@
+from typing import Optional, Tuple
+
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Tensor, TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+from .modeling_t5 import T5Config
+from .modeling_t5 import T5ForConditionalGeneration as T5ForGeneration
+
+
+@MODELS.register_module(
+    group_key=Tasks.text2text_generation,
+    module_name=Models.T5,
+)
+class T5ForConditionalGeneration(TorchModel):
+
+    def __init__(self, model_dir=None, *args, **kwargs):
+        """Initialize the text2text generation model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+            *args, **kwargs: extra arguments forwarded to the `TorchModel`
+                base class constructor.
+        """
+        super().__init__(model_dir, *args, **kwargs)
+        self.model = T5ForGeneration.from_pretrained(model_dir)
+        self.generate = self.model.generate
+        self.config = self.model.config
+
+    def forward(self,
+                input_ids: Optional[torch.LongTensor] = None,
+                attention_mask: Optional[torch.FloatTensor] = None,
+                decoder_input_ids: Optional[torch.LongTensor] = None,
+                decoder_attention_mask: Optional[torch.BoolTensor] = None,
+                head_mask: Optional[torch.FloatTensor] = None,
+                decoder_head_mask: Optional[torch.FloatTensor] = None,
+                cross_attn_head_mask: Optional[torch.Tensor] = None,
+                encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+                past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+                inputs_embeds: Optional[torch.FloatTensor] = None,
+                decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+                labels: Optional[torch.LongTensor] = None,
+                use_cache: Optional[bool] = None,
+                output_attentions: Optional[bool] = None,
+                output_hidden_states: Optional[bool] = None,
+                return_dict: Optional[bool] = None,
+                **kwargs):
+        # Delegate to the bound method of the wrapped model; do not pass
+        # `self` again, otherwise it would be consumed as `input_ids`.
+        return self.model.forward(
+            input_ids, attention_mask, decoder_input_ids,
+            decoder_attention_mask, head_mask, decoder_head_mask,
+            cross_attn_head_mask, encoder_outputs, past_key_values,
+            inputs_embeds, decoder_inputs_embeds, labels, use_cache,
+            output_attentions, output_hidden_states, return_dict, **kwargs)
diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py
index 443cb214..152a32dc 100644
--- a/modelscope/models/nlp/__init__.py
+++ b/modelscope/models/nlp/__init__.py
@@ -32,7 +32,7 @@ if TYPE_CHECKING:
     from .token_classification import SbertForTokenClassification
     from .sentence_embedding import SentenceEmbedding
     from .passage_ranking import PassageRanking
-
+    from .T5 import T5ForConditionalGeneration
 else:
     _import_structure = {
         'backbones': ['SbertModel'],
@@ -68,6
+68,7 @@ else: 'table_question_answering': ['TableQuestionAnswering'], 'sentence_embedding': ['SentenceEmbedding'], 'passage_ranking': ['PassageRanking'], + 'T5': ['T5ForConditionalGeneration'], } import sys diff --git a/modelscope/outputs.py b/modelscope/outputs.py index b3eb9ad8..a80cbf33 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -390,12 +390,19 @@ TASK_OUTPUTS = { Tasks.text_error_correction: [OutputKeys.OUTPUT], Tasks.sentence_embedding: [OutputKeys.TEXT_EMBEDDING, OutputKeys.SCORES], Tasks.passage_ranking: [OutputKeys.SCORES], + # text generation result for single sample # { # "text": "this is the text generated by a model." # } Tasks.text_generation: [OutputKeys.TEXT], + # text generation result for single sample + # { + # "text": "北京" + # } + Tasks.text2text_generation: [OutputKeys.TEXT], + # fill mask result for single sample # { # "text": "this is the text which masks filled by model." diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index b5c53f82..a8edc21a 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -12,7 +12,7 @@ if TYPE_CHECKING: from .document_segmentation_pipeline import DocumentSegmentationPipeline from .faq_question_answering_pipeline import FaqQuestionAnsweringPipeline from .fill_mask_pipeline import FillMaskPipeline - from .fill_mask_ponet_pipeline import FillMaskPoNetPreprocessor + from .fill_mask_ponet_pipeline import FillMaskPonetPipeline from .information_extraction_pipeline import InformationExtractionPipeline from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline from .pair_sentence_classification_pipeline import PairSentenceClassificationPipeline @@ -22,6 +22,7 @@ if TYPE_CHECKING: from .text_classification_pipeline import TextClassificationPipeline from .text_error_correction_pipeline import TextErrorCorrectionPipeline from .text_generation_pipeline import TextGenerationPipeline + from .text2text_generation_pipeline import Text2TextGenerationPipeline from .token_classification_pipeline import TokenClassificationPipeline from .translation_pipeline import TranslationPipeline from .word_segmentation_pipeline import WordSegmentationPipeline @@ -54,6 +55,7 @@ else: 'text_classification_pipeline': ['TextClassificationPipeline'], 'text_error_correction_pipeline': ['TextErrorCorrectionPipeline'], 'text_generation_pipeline': ['TextGenerationPipeline'], + 'text2text_generation_pipeline': ['Text2TextGenerationPipeline'], 'token_classification_pipeline': ['TokenClassificationPipeline'], 'translation_pipeline': ['TranslationPipeline'], 'word_segmentation_pipeline': ['WordSegmentationPipeline'], diff --git a/modelscope/pipelines/nlp/text2text_generation_pipeline.py b/modelscope/pipelines/nlp/text2text_generation_pipeline.py new file mode 100644 index 00000000..9ccd00f4 --- /dev/null +++ b/modelscope/pipelines/nlp/text2text_generation_pipeline.py @@ -0,0 +1,87 @@ +from typing import Any, Dict, Optional, Union + +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models.base import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import Text2TextGenerationPreprocessor +from modelscope.utils.constant import Tasks + +__all__ = ['Text2TextGenerationPipeline'] + + +@PIPELINES.register_module( + Tasks.text2text_generation, module_name=Pipelines.text2text_generation) +class 
Text2TextGenerationPipeline(Pipeline):
+
+    def __init__(
+            self,
+            model: Union[Model, str],
+            preprocessor: Optional[Text2TextGenerationPreprocessor] = None,
+            first_sequence='sentence',
+            **kwargs):
+        """Use `model` and `preprocessor` to create a text2text generation pipeline for prediction.
+
+        Args:
+            model (str or Model): Supply either a local model dir that supports the text2text generation task,
+                a model id from the model hub, or a torch model instance.
+            preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits
+                the model if supplied.
+            first_sequence: The key to read the first sentence in.
+            sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value.
+
+            NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence'
+            param will have no effect.
+
+        Example:
+            >>> from modelscope.pipelines import pipeline
+            >>> pipeline_ins = pipeline(task='text2text-generation',
+            >>>    model='damo/t5-cn-base-test')
+            >>> sentence1 = '中国的首都位于。'
+            >>> print(pipeline_ins(sentence1))
+            >>> # Or use the dict input:
+            >>> print(pipeline_ins({'sentence': sentence1}))
+
+            To view other examples please check tests/pipelines/test_text2text_generation.py.
+        """
+        model = model if isinstance(model,
+                                    Model) else Model.from_pretrained(model)
+        if preprocessor is None:
+            preprocessor = Text2TextGenerationPreprocessor(
+                model.model_dir,
+                sequence_length=kwargs.pop('sequence_length', 128))
+        self.tokenizer = preprocessor.tokenizer
+        model.eval()
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+
+        forward_params['min_length'] = forward_params.get(
+            'min_length', self.model.config.min_length)
+        forward_params['max_length'] = forward_params.get(
+            'max_length', self.model.config.max_length)
+
+        with torch.no_grad():
+            output_ids = self.model.generate(**inputs, **forward_params)
+        return {'output_ids': output_ids}
+
+    def postprocess(self, inputs: Dict[str, Tensor],
+                    **postprocess_params) -> Dict[str, str]:
+        """Process the prediction results.
+
+        Args:
+            inputs (Dict[str, Tensor]): the model outputs containing 'output_ids'.
+
+        Returns:
+            Dict[str, str]: the prediction results
+        """
+        output = self.tokenizer.decode(
+            inputs['output_ids'][0],
+            skip_special_tokens=True,
+        )
+        return {OutputKeys.TEXT: output}
diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py
index ba03a35e..e37b3324 100644
--- a/modelscope/preprocessors/__init__.py
+++ b/modelscope/preprocessors/__init__.py
@@ -24,7 +24,7 @@ if TYPE_CHECKING:
         TextErrorCorrectionPreprocessor, FaqQuestionAnsweringPreprocessor,
         SequenceLabelingPreprocessor, RelationExtractionPreprocessor,
         DocumentSegmentationPreprocessor, FillMaskPoNetPreprocessor,
-        PassageRankingPreprocessor,
+        PassageRankingPreprocessor, Text2TextGenerationPreprocessor,
         WordSegmentationBlankSetToLabelPreprocessor)
     from .space import (DialogIntentPredictionPreprocessor,
                         DialogModelingPreprocessor,
@@ -57,6 +57,7 @@ else:
             'TextErrorCorrectionPreprocessor', 'FaqQuestionAnsweringPreprocessor',
             'SequenceLabelingPreprocessor', 'RelationExtractionPreprocessor',
+            'Text2TextGenerationPreprocessor',
             'WordSegmentationBlankSetToLabelPreprocessor',
             'DocumentSegmentationPreprocessor', 'FillMaskPoNetPreprocessor'
         ],
diff --git
a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index eee5e80f..f305df27 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -9,6 +9,7 @@ if TYPE_CHECKING: Tokenize, SequenceClassificationPreprocessor, TextGenerationPreprocessor, TokenClassificationPreprocessor, SingleSentenceClassificationPreprocessor, + Text2TextGenerationPreprocessor, PairSentenceClassificationPreprocessor, FillMaskPreprocessor, ZeroShotClassificationPreprocessor, NERPreprocessor, FaqQuestionAnsweringPreprocessor, SequenceLabelingPreprocessor, @@ -27,6 +28,7 @@ else: 'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor', 'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor', 'RelationExtractionPreprocessor', + 'Text2TextGenerationPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', 'DocumentSegmentationPreprocessor', 'FillMaskPoNetPreprocessor' ], diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index 0a2495af..d294f517 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -26,6 +26,7 @@ __all__ = [ 'Tokenize', 'SequenceClassificationPreprocessor', 'TextGenerationPreprocessor', 'TokenClassificationPreprocessor', 'PairSentenceClassificationPreprocessor', + 'Text2TextGenerationPreprocessor', 'SingleSentenceClassificationPreprocessor', 'FillMaskPreprocessor', 'ZeroShotClassificationPreprocessor', 'NERPreprocessor', 'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor', @@ -442,6 +443,40 @@ class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase): return features +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.text2text_gen_preprocessor) +class Text2TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in text generation. 
+ """ + + def __init__(self, + model_dir: str, + tokenizer=None, + mode=ModeKeys.INFERENCE, + **kwargs): + self.tokenizer = self.build_tokenizer( + model_dir) if tokenizer is None else tokenizer + kwargs['truncation'] = kwargs.get('truncation', 'do_not_truncate') + kwargs['padding'] = kwargs.get('padding', False) + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + False) + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + super().__init__(model_dir, pair=False, mode=mode, **kwargs) + + def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: + text_a, _, _ = self.parse_text_and_label(data) + + inputs = self.tokenizer( + text_a, + return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, + **self.tokenize_kwargs) + + # This is produced by tokenizers but is an invalid generate kwargs + if 'token_type_ids' in inputs: + del inputs['token_type_ids'] + return inputs + + @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.text_gen_tokenizer) class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index d6b0da40..4c5d2f41 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -97,6 +97,7 @@ class NLPTasks(object): token_classification = 'token-classification' conversational = 'conversational' text_generation = 'text-generation' + text2text_generation = 'text2text-generation' task_oriented_conversation = 'task-oriented-conversation' dialog_intent_prediction = 'dialog-intent-prediction' dialog_state_tracking = 'dialog-state-tracking' diff --git a/tests/pipelines/test_text2text_generation.py b/tests/pipelines/test_text2text_generation.py new file mode 100644 index 00000000..04cecf93 --- /dev/null +++ b/tests/pipelines/test_text2text_generation.py @@ -0,0 +1,61 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
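Taken together, the preprocessor above and the `Text2TextGenerationPipeline` added earlier implement a tokenize → generate → decode loop. A rough equivalent using only the underlying Hugging Face classes, for orientation (the `t5-small` checkpoint and the prompt are stand-ins, not the model shipped with this task):

```python
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small').eval()

# Preprocess: PyTorch tensors for generate(); drop token_type_ids if present,
# since generate() does not accept it.
inputs = tokenizer('translate English to German: The house is wonderful.',
                   return_tensors='pt')
inputs.pop('token_type_ids', None)

# Forward: no gradients at inference; length bounds go in as generate kwargs.
with torch.no_grad():
    output_ids = model.generate(**inputs, max_length=128)

# Postprocess: decode the first sequence, skipping special tokens.
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```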
+import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.nlp import T5ForConditionalGeneration +from modelscope.pipelines import pipeline +from modelscope.pipelines.nlp import Text2TextGenerationPipeline +from modelscope.preprocessors import Text2TextGenerationPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.model_id = 'damo/t5-cn-base-test' + self.input = '中国的首都位于。' + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_T5(self): + cache_path = snapshot_download(self.model_id) + model = T5ForConditionalGeneration(cache_path) + preprocessor = Text2TextGenerationPreprocessor(cache_path) + pipeline1 = Text2TextGenerationPipeline(model, preprocessor) + pipeline2 = pipeline( + Tasks.text2text_generation, model=model, preprocessor=preprocessor) + print( + f'pipeline1: {pipeline1(self.input)}\npipeline2: {pipeline2(self.input)}' + ) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_pipeline_with_model_instance(self): + model = Model.from_pretrained(self.model_id) + preprocessor = Text2TextGenerationPreprocessor(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.text2text_generation, + model=model, + preprocessor=preprocessor) + print(pipeline_ins(self.input)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_pipeline_with_model_id(self): + pipeline_ins = pipeline( + task=Tasks.text2text_generation, model=self.model_id) + print(pipeline_ins(self.input)) + + @unittest.skip( + 'only for test cases, there is no default official model yet') + def test_run_pipeline_without_model_id(self): + pipeline_ins = pipeline(task=Tasks.text2text_generation) + print(pipeline_ins(self.input)) + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() From 4dbdc45963a769d43afe2d75c1ebc7964c359c9d Mon Sep 17 00:00:00 2001 From: "hanyuan.chy" Date: Mon, 26 Sep 2022 13:23:32 +0800 Subject: [PATCH 03/23] test(data): add test data Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10246518 --- data/test/videos/Walking.54138969.mp4 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/test/videos/Walking.54138969.mp4 b/data/test/videos/Walking.54138969.mp4 index 1716695f..d4355290 100644 --- a/data/test/videos/Walking.54138969.mp4 +++ b/data/test/videos/Walking.54138969.mp4 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7b8f50a0537bfe7e082c5ad91b2b7ece61a0adbeb7489988e553909276bf920c -size 44217644 +oid sha256:7663f9a32ea57086bf66c4b9e9ebe0fd418986c67716c7be02ca917e72ddc0ba +size 8155895 From b876839d51b81a14e6caaba87d6fb0c9f646a0c8 Mon Sep 17 00:00:00 2001 From: "shuying.shu" Date: Mon, 26 Sep 2022 14:03:35 +0800 Subject: [PATCH 04/23] [to #42322933]adjust output form adjust output form for movie scene segmentation demo Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10244194 --- .../models/cv/movie_scene_segmentation/model.py | 4 ++-- .../cv/movie_scene_segmentation/utils/save_op.py | 13 ++++++------- modelscope/outputs.py | 11 +++++------ 
.../cv/movie_scene_segmentation_pipeline.py | 4 ++-- 4 files changed, 15 insertions(+), 17 deletions(-) diff --git a/modelscope/models/cv/movie_scene_segmentation/model.py b/modelscope/models/cv/movie_scene_segmentation/model.py index 676b5ac1..1232d427 100644 --- a/modelscope/models/cv/movie_scene_segmentation/model.py +++ b/modelscope/models/cv/movie_scene_segmentation/model.py @@ -162,11 +162,11 @@ class MovieSceneSegmentationModel(TorchModel): thres = self.cfg.pipeline.save_threshold anno_dict = get_pred_boundary(pred_dict, thres) - scene_dict, scene_list = pred2scene(self.shot2keyf, anno_dict) + scene_dict_lst, scene_list = pred2scene(self.shot2keyf, anno_dict) if self.cfg.pipeline.save_split_scene: re_dir = scene2video(inputs['input_video_pth'], scene_list, thres) print(f'Split scene video saved to {re_dir}') - return len(scene_list), scene_dict + return len(scene_list), scene_dict_lst def preprocess(self, inputs): logger.info('Begin shot detect......') diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py index cf26d21a..6361c056 100644 --- a/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py +++ b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py @@ -21,16 +21,15 @@ def get_pred_boundary(pred_dict, threshold=0.5): def pred2scene(shot2keyf, anno_dict): scene_list, pair_list = get_demo_scene_list(shot2keyf, anno_dict) - scene_dict = {} + scene_dict_lst = [] assert len(scene_list) == len(pair_list) for scene_ind, scene_item in enumerate(scene_list): - scene_dict.update( - {scene_ind: { - 'shot': pair_list[scene_ind], - 'frame': scene_item - }}) + scene_dict_lst.append({ + 'shot': pair_list[scene_ind], + 'frame': scene_item + }) - return scene_dict, scene_list + return scene_dict_lst, scene_list def scene2video(source_movie_fn, scene_list, thres): diff --git a/modelscope/outputs.py b/modelscope/outputs.py index a80cbf33..052d4f33 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -38,7 +38,7 @@ class OutputKeys(object): HISTORY = 'history' TIMESTAMPS = 'timestamps' SPLIT_VIDEO_NUM = 'split_video_num' - SPLIT_META_DICT = 'split_meta_dict' + SPLIT_META_LIST = 'split_meta_list' TASK_OUTPUTS = { @@ -293,18 +293,17 @@ TASK_OUTPUTS = { # movide scene segmentation result for a single video # { # "split_video_num":3, - # "split_meta_dict": - # { - # scene_id: + # "split_meta_list": + # [ # { # "shot": [0,1,2], # "frame": [start_frame, end_frame] # } - # } + # ] # # } Tasks.movie_scene_segmentation: - [OutputKeys.SPLIT_VIDEO_NUM, OutputKeys.SPLIT_META_DICT], + [OutputKeys.SPLIT_VIDEO_NUM, OutputKeys.SPLIT_META_LIST], # ============ nlp tasks =================== diff --git a/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py b/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py index b5acf17a..6704e4c0 100644 --- a/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py +++ b/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py @@ -60,9 +60,9 @@ class MovieSceneSegmentationPipeline(Pipeline): def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: data = {'input_video_pth': self.input_video_pth, 'feat': inputs} - video_num, meta_dict = self.model.postprocess(data) + video_num, meta_lst = self.model.postprocess(data) result = { OutputKeys.SPLIT_VIDEO_NUM: video_num, - OutputKeys.SPLIT_META_DICT: meta_dict + OutputKeys.SPLIT_META_LIST: meta_lst } return result From bd4127bc27120f460f90f5f75832d8d3830e5b06 Mon Sep 17 00:00:00 2001 
From: "tianchu.gtc" Date: Mon, 26 Sep 2022 15:49:35 +0800 Subject: [PATCH 05/23] =?UTF-8?q?[to=20#42322933]segformer=20=E6=8E=A5?= =?UTF-8?q?=E5=85=A5demo=E6=8E=A5=E5=8F=A3=E6=9B=B4=E6=94=B9=20=20=20=20?= =?UTF-8?q?=20=20=20=20=20Link:=20https://code.alibaba-inc.com/Ali-MaaS/Ma?= =?UTF-8?q?aS-lib/codereview/10253628?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../easycv_pipelines/segmentation_pipeline.py | 24 ++++++++++++++ .../test_segmentation_pipeline.py | 32 ++++++++++--------- 2 files changed, 41 insertions(+), 15 deletions(-) diff --git a/modelscope/pipelines/cv/easycv_pipelines/segmentation_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/segmentation_pipeline.py index 2182e3b3..bd09fc9b 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/segmentation_pipeline.py +++ b/modelscope/pipelines/cv/easycv_pipelines/segmentation_pipeline.py @@ -1,5 +1,10 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any + +import numpy as np + from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys from modelscope.pipelines.builder import PIPELINES from modelscope.utils.constant import Tasks from .base import EasyCVPipeline @@ -21,3 +26,22 @@ class EasyCVSegmentationPipeline(EasyCVPipeline): model_file_pattern=model_file_pattern, *args, **kwargs) + + def __call__(self, inputs) -> Any: + outputs = self.predict_op(inputs) + + semantic_result = outputs[0]['seg_pred'] + + ids = np.unique(semantic_result)[::-1] + legal_indices = ids != len(self.predict_op.CLASSES) # for VOID label + ids = ids[legal_indices] + segms = (semantic_result[None] == ids[:, None, None]) + masks = [it.astype(np.int) for it in segms] + labels_txt = np.array(self.predict_op.CLASSES)[ids].tolist() + + results = { + OutputKeys.MASKS: masks, + OutputKeys.LABELS: labels_txt, + OutputKeys.SCORES: [0.999 for _ in range(len(labels_txt))] + } + return results diff --git a/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py b/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py index 80ab36a6..5f6dac4b 100644 --- a/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py +++ b/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py @@ -2,30 +2,34 @@ import unittest from distutils.version import LooseVersion +import cv2 import easycv import numpy as np from PIL import Image +from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import semantic_seg_masks_to_image +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class EasyCVSegmentationPipelineTest(unittest.TestCase): - +class EasyCVSegmentationPipelineTest(unittest.TestCase, + DemoCompatibilityCheck): img_path = 'data/test/images/image_segmentation.jpg' - def _internal_test_(self, model_id): - img = np.asarray(Image.open(self.img_path)) + def setUp(self) -> None: + self.task = Tasks.image_segmentation + self.model_id = 'damo/cv_segformer-b0_image_semantic-segmentation_coco-stuff164k' + def _internal_test_(self, model_id): semantic_seg = pipeline(task=Tasks.image_segmentation, model=model_id) outputs = semantic_seg(self.img_path) - self.assertEqual(len(outputs), 1) - - results = outputs[0] - self.assertListEqual( - list(img.shape)[:2], list(results['seg_pred'].shape)) + draw_img = semantic_seg_masks_to_image(outputs[OutputKeys.MASKS]) + cv2.imwrite('result.jpg', draw_img) + 
print('test ' + model_id + ' DONE') def _internal_test_batch_(self, model_id, num_samples=2, batch_size=2): # TODO: support in the future @@ -49,37 +53,35 @@ class EasyCVSegmentationPipelineTest(unittest.TestCase): def test_segformer_b0(self): model_id = 'damo/cv_segformer-b0_image_semantic-segmentation_coco-stuff164k' self._internal_test_(model_id) - self._internal_test_batch_(model_id) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_segformer_b1(self): model_id = 'damo/cv_segformer-b1_image_semantic-segmentation_coco-stuff164k' self._internal_test_(model_id) - self._internal_test_batch_(model_id) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_segformer_b2(self): model_id = 'damo/cv_segformer-b2_image_semantic-segmentation_coco-stuff164k' self._internal_test_(model_id) - self._internal_test_batch_(model_id) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_segformer_b3(self): model_id = 'damo/cv_segformer-b3_image_semantic-segmentation_coco-stuff164k' self._internal_test_(model_id) - self._internal_test_batch_(model_id) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_segformer_b4(self): model_id = 'damo/cv_segformer-b4_image_semantic-segmentation_coco-stuff164k' self._internal_test_(model_id) - self._internal_test_batch_(model_id) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_segformer_b5(self): model_id = 'damo/cv_segformer-b5_image_semantic-segmentation_coco-stuff164k' self._internal_test_(model_id) - self._internal_test_batch_(model_id) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() if __name__ == '__main__': From f844f73b03ed5c47ef6e32ec9359c8984af8a02a Mon Sep 17 00:00:00 2001 From: "leyuan.hjy" Date: Mon, 26 Sep 2022 15:52:03 +0800 Subject: [PATCH 06/23] =?UTF-8?q?[to=20#42322933]=E4=BF=AE=E5=A4=8Dnano?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E5=88=9D=E5=A7=8B=E5=8C=96/=E5=A2=9E?= =?UTF-8?q?=E5=8A=A0=E6=96=87=E4=BB=B6copyright=E4=BF=A1=E6=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修复nano模型初始化/增加文件copyright信息 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10247456 --- .../cv/realtime_object_detection/realtime_detector.py | 7 ++++++- .../yolox/exp/default/yolox_nano.py | 3 ++- .../pipelines/cv/realtime_object_detection_pipeline.py | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/modelscope/models/cv/realtime_object_detection/realtime_detector.py b/modelscope/models/cv/realtime_object_detection/realtime_detector.py index b147f769..2b4b3f8c 100644 --- a/modelscope/models/cv/realtime_object_detection/realtime_detector.py +++ b/modelscope/models/cv/realtime_object_detection/realtime_detector.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
import argparse import logging as logger import os @@ -48,6 +49,7 @@ class RealtimeDetector(TorchModel): self.nmsthre = self.exp.nmsthre self.test_size = self.exp.test_size self.preproc = ValTransform(legacy=False) + self.label_mapping = self.config['labels'] def inference(self, img): with torch.no_grad(): @@ -81,5 +83,8 @@ class RealtimeDetector(TorchModel): bboxes = outputs[0][:, 0:4].cpu().numpy() / self.ratio scores = outputs[0][:, 5].cpu().numpy() labels = outputs[0][:, 6].cpu().int().numpy() + pred_label_names = [] + for lab in labels: + pred_label_names.append(self.label_mapping[lab]) - return bboxes, scores, labels + return bboxes, scores, pred_label_names diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_nano.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_nano.py index 330eef16..7bada485 100644 --- a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_nano.py +++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_nano.py @@ -42,5 +42,6 @@ class YoloXNanoExp(YoloXExp): act=self.act, depthwise=True) self.model = YOLOX(backbone, head) - + self.model.apply(init_yolo) + self.model.head.initialize_biases(1e-2) return self.model diff --git a/modelscope/pipelines/cv/realtime_object_detection_pipeline.py b/modelscope/pipelines/cv/realtime_object_detection_pipeline.py index 629720d1..9f558f88 100644 --- a/modelscope/pipelines/cv/realtime_object_detection_pipeline.py +++ b/modelscope/pipelines/cv/realtime_object_detection_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os.path as osp from typing import Any, Dict, List, Union From 65cce5b9976db9873ceb3fa1687903546f679e0d Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Mon, 26 Sep 2022 16:12:17 +0800 Subject: [PATCH 07/23] [to #44902165] bump version to 0.4.5 --- modelscope/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/version.py b/modelscope/version.py index 9a8e054a..68eb9b68 100644 --- a/modelscope/version.py +++ b/modelscope/version.py @@ -1 +1 @@ -__version__ = '0.4.4' +__version__ = '0.4.5' From c498d88d48a8c8cdd85c963322795914dabc9f42 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Mon, 26 Sep 2022 17:38:13 +0800 Subject: [PATCH 08/23] [to #42322933] add license declaration 1. 
add license declaration Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10216802 --- .../metrics/sequence_classification_metric.py | 2 ++ modelscope/metrics/text_generation_metric.py | 2 ++ .../metrics/token_classification_metric.py | 2 ++ .../models/multi_modal/mplug/clip/__init__.py | 2 ++ .../models/multi_modal/mplug/predictor.py | 16 +++++++++++++ .../models/multi_modal/mplug_for_all_tasks.py | 2 ++ modelscope/models/nlp/backbones/structbert.py | 1 + .../nlp/bart_for_text_error_correction.py | 1 + .../nlp/bert_for_sequence_classification.py | 1 + .../models/nlp/csanmt_for_translation.py | 3 +++ .../nlp/gpt3/gpt3_for_text_generation.py | 1 + modelscope/models/nlp/gpt3/modeling_gpt3.py | 1 + .../nlp/heads/infromation_extraction_head.py | 5 +--- .../nlp/heads/sequence_classification_head.py | 1 + .../nlp/heads/token_classification_head.py | 1 + .../models/nlp/heads/torch_pretrain_head.py | 1 + modelscope/models/nlp/masked_language.py | 3 +-- .../nlp/nncrf_for_named_entity_recognition.py | 6 +++-- .../models/nlp/palm_v2/modeling_palm.py | 16 +++++++++++++ .../nlp/palm_v2/palm_for_text_generation.py | 1 + modelscope/models/nlp/passage_ranking.py | 2 ++ modelscope/models/nlp/sentence_embedding.py | 4 ++-- .../models/nlp/sequence_classification.py | 2 ++ .../nlp/task_models/information_extraction.py | 5 +--- .../task_models/sequence_classification.py | 1 + .../models/nlp/task_models/task_model.py | 1 + .../nlp/task_models/token_classification.py | 1 + modelscope/models/nlp/token_classification.py | 2 ++ .../nlp/dialog_state_tracking_pipeline.py | 2 ++ .../nlp/distributed_plug_pipeline.py | 2 ++ .../nlp/faq_question_answering_pipeline.py | 2 ++ .../pipelines/nlp/fill_mask_pipeline.py | 2 ++ .../nlp/information_extraction_pipeline.py | 5 ++-- .../nlp/named_entity_recognition_pipeline.py | 2 ++ .../pair_sentence_classification_pipeline.py | 2 ++ .../pipelines/nlp/passage_ranking_pipeline.py | 2 ++ .../nlp/sentence_embedding_pipeline.py | 2 ++ .../sequence_classification_pipeline_base.py | 2 ++ ...single_sentence_classification_pipeline.py | 2 ++ .../nlp/text_error_correction_pipeline.py | 2 ++ .../pipelines/nlp/text_generation_pipeline.py | 2 ++ .../nlp/token_classification_pipeline.py | 2 ++ .../pipelines/nlp/translation_pipeline.py | 2 ++ .../nlp/word_segmentation_pipeline.py | 2 ++ .../nlp/zero_shot_classification_pipeline.py | 2 ++ modelscope/preprocessors/__init__.py | 3 ++- modelscope/preprocessors/nlp/__init__.py | 1 + modelscope/preprocessors/nlp/nlp_base.py | 24 ++++++++++++------- .../nlp/csanmt_translation_trainer.py | 2 ++ .../trainers/nlp/passage_ranking_trainer.py | 2 ++ .../nlp/sequence_classification_trainer.py | 2 ++ .../nlp/space/dialog_intent_trainer.py | 2 ++ .../nlp/space/dialog_modeling_trainer.py | 2 ++ .../nlp/space/metrics/metrics_tracker.py | 4 +--- modelscope/trainers/nlp_trainer.py | 2 ++ 55 files changed, 139 insertions(+), 28 deletions(-) diff --git a/modelscope/metrics/sequence_classification_metric.py b/modelscope/metrics/sequence_classification_metric.py index d795d8a2..51a829ef 100644 --- a/modelscope/metrics/sequence_classification_metric.py +++ b/modelscope/metrics/sequence_classification_metric.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ from typing import Dict import numpy as np diff --git a/modelscope/metrics/text_generation_metric.py b/modelscope/metrics/text_generation_metric.py index 6bdcbc58..f154281d 100644 --- a/modelscope/metrics/text_generation_metric.py +++ b/modelscope/metrics/text_generation_metric.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Dict from modelscope.metainfo import Metrics diff --git a/modelscope/metrics/token_classification_metric.py b/modelscope/metrics/token_classification_metric.py index 53d13b6a..05b72170 100644 --- a/modelscope/metrics/token_classification_metric.py +++ b/modelscope/metrics/token_classification_metric.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import importlib from typing import Dict, List, Optional, Union diff --git a/modelscope/models/multi_modal/mplug/clip/__init__.py b/modelscope/models/multi_modal/mplug/clip/__init__.py index 05826f46..e6007a04 100644 --- a/modelscope/models/multi_modal/mplug/clip/__init__.py +++ b/modelscope/models/multi_modal/mplug/clip/__init__.py @@ -1 +1,3 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from .clip import load_from_config diff --git a/modelscope/models/multi_modal/mplug/predictor.py b/modelscope/models/multi_modal/mplug/predictor.py index c976baa1..6375d1d7 100755 --- a/modelscope/models/multi_modal/mplug/predictor.py +++ b/modelscope/models/multi_modal/mplug/predictor.py @@ -1,3 +1,19 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from __future__ import print_function import torch diff --git a/modelscope/models/multi_modal/mplug_for_all_tasks.py b/modelscope/models/multi_modal/mplug_for_all_tasks.py index d61fea10..64a7dd7b 100644 --- a/modelscope/models/multi_modal/mplug_for_all_tasks.py +++ b/modelscope/models/multi_modal/mplug_for_all_tasks.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os.path as osp from typing import Dict, List diff --git a/modelscope/models/nlp/backbones/structbert.py b/modelscope/models/nlp/backbones/structbert.py index f47900c3..74735520 100644 --- a/modelscope/models/nlp/backbones/structbert.py +++ b/modelscope/models/nlp/backbones/structbert.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from modelscope.metainfo import Models from modelscope.models.base import TorchModel from modelscope.models.builder import BACKBONES diff --git a/modelscope/models/nlp/bart_for_text_error_correction.py b/modelscope/models/nlp/bart_for_text_error_correction.py index 2339f221..27abedb5 100644 --- a/modelscope/models/nlp/bart_for_text_error_correction.py +++ b/modelscope/models/nlp/bart_for_text_error_correction.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
import os.path as osp from typing import Any, Dict diff --git a/modelscope/models/nlp/bert_for_sequence_classification.py b/modelscope/models/nlp/bert_for_sequence_classification.py index 75105f36..2b1a3b3b 100644 --- a/modelscope/models/nlp/bert_for_sequence_classification.py +++ b/modelscope/models/nlp/bert_for_sequence_classification.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os from typing import Any, Dict diff --git a/modelscope/models/nlp/csanmt_for_translation.py b/modelscope/models/nlp/csanmt_for_translation.py index 83b58060..4bac8e6d 100644 --- a/modelscope/models/nlp/csanmt_for_translation.py +++ b/modelscope/models/nlp/csanmt_for_translation.py @@ -1,3 +1,6 @@ +# Part of the implementation is borrowed and modified from THUMT, +# publicly available at https://github.com/THUNLP-MT/THUMT +# Copyright 2017-2022 The Alibaba MT Team Authors. All rights reserved. import math from collections import namedtuple from typing import Dict diff --git a/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py b/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py index fe1402e8..d686ea30 100644 --- a/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py +++ b/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from typing import Dict from modelscope.metainfo import Models diff --git a/modelscope/models/nlp/gpt3/modeling_gpt3.py b/modelscope/models/nlp/gpt3/modeling_gpt3.py index 69e9ba7c..498d15de 100644 --- a/modelscope/models/nlp/gpt3/modeling_gpt3.py +++ b/modelscope/models/nlp/gpt3/modeling_gpt3.py @@ -1,3 +1,4 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/modelscope/models/nlp/heads/infromation_extraction_head.py b/modelscope/models/nlp/heads/infromation_extraction_head.py index cf957834..6c3388f0 100644 --- a/modelscope/models/nlp/heads/infromation_extraction_head.py +++ b/modelscope/models/nlp/heads/infromation_extraction_head.py @@ -1,13 +1,10 @@ -from typing import Dict - +# Copyright (c) Alibaba, Inc. and its affiliates. import torch -import torch.nn.functional as F from torch import nn from modelscope.metainfo import Heads from modelscope.models.base import TorchHead from modelscope.models.builder import HEADS -from modelscope.outputs import OutputKeys from modelscope.utils.constant import Tasks diff --git a/modelscope/models/nlp/heads/sequence_classification_head.py b/modelscope/models/nlp/heads/sequence_classification_head.py index e608f035..fb03b7ff 100644 --- a/modelscope/models/nlp/heads/sequence_classification_head.py +++ b/modelscope/models/nlp/heads/sequence_classification_head.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from typing import Dict import torch diff --git a/modelscope/models/nlp/heads/token_classification_head.py b/modelscope/models/nlp/heads/token_classification_head.py index 481524ae..ace3deac 100644 --- a/modelscope/models/nlp/heads/token_classification_head.py +++ b/modelscope/models/nlp/heads/token_classification_head.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import Dict import torch diff --git a/modelscope/models/nlp/heads/torch_pretrain_head.py b/modelscope/models/nlp/heads/torch_pretrain_head.py index 6ff6c96f..fb54637b 100644 --- a/modelscope/models/nlp/heads/torch_pretrain_head.py +++ b/modelscope/models/nlp/heads/torch_pretrain_head.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from typing import Dict import torch diff --git a/modelscope/models/nlp/masked_language.py b/modelscope/models/nlp/masked_language.py index 4f466c23..514a04cd 100644 --- a/modelscope/models/nlp/masked_language.py +++ b/modelscope/models/nlp/masked_language.py @@ -1,6 +1,5 @@ -from typing import Any, Dict, Optional, Union +# Copyright (c) Alibaba, Inc. and its affiliates. -import numpy as np from transformers import BertForMaskedLM as BertForMaskedLMTransformer from modelscope.metainfo import Models diff --git a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py b/modelscope/models/nlp/nncrf_for_named_entity_recognition.py index 37216510..62198ed2 100644 --- a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py +++ b/modelscope/models/nlp/nncrf_for_named_entity_recognition.py @@ -1,3 +1,7 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. All rights reserved. +# The CRF implementation borrows mostly from AllenNLP CRF module (https://github.com/allenai/allennlp) +# and pytorch-crf (https://github.com/kmkurn/pytorch-crf) with some modifications. + import os from typing import Any, Dict, List, Optional @@ -208,8 +212,6 @@ class CRF(nn.Module): Learning*. Morgan Kaufmann. pp. 282–289. .. _Viterbi algorithm: https://en.wikipedia.org/wiki/Viterbi_algorithm - The implementation borrows mostly from AllenNLP CRF module (https://github.com/allenai/allennlp) - and pytorch-crf (https://github.com/kmkurn/pytorch-crf) with some modifications. """ def __init__(self, num_tags: int, batch_first: bool = False) -> None: diff --git a/modelscope/models/nlp/palm_v2/modeling_palm.py b/modelscope/models/nlp/palm_v2/modeling_palm.py index 99b00454..f395ebd4 100644 --- a/modelscope/models/nlp/palm_v2/modeling_palm.py +++ b/modelscope/models/nlp/palm_v2/modeling_palm.py @@ -1,3 +1,19 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import codecs import copy import math diff --git a/modelscope/models/nlp/palm_v2/palm_for_text_generation.py b/modelscope/models/nlp/palm_v2/palm_for_text_generation.py index ae92427e..2c37afd6 100644 --- a/modelscope/models/nlp/palm_v2/palm_for_text_generation.py +++ b/modelscope/models/nlp/palm_v2/palm_for_text_generation.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import Dict, List from modelscope.metainfo import Models diff --git a/modelscope/models/nlp/passage_ranking.py b/modelscope/models/nlp/passage_ranking.py index 68bca231..2a06ce45 100644 --- a/modelscope/models/nlp/passage_ranking.py +++ b/modelscope/models/nlp/passage_ranking.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict import numpy as np diff --git a/modelscope/models/nlp/sentence_embedding.py b/modelscope/models/nlp/sentence_embedding.py index 955c0e53..340c133f 100644 --- a/modelscope/models/nlp/sentence_embedding.py +++ b/modelscope/models/nlp/sentence_embedding.py @@ -1,7 +1,7 @@ -import os +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict -import json import numpy as np from modelscope.metainfo import Models diff --git a/modelscope/models/nlp/sequence_classification.py b/modelscope/models/nlp/sequence_classification.py index e8802dbd..a8930e68 100644 --- a/modelscope/models/nlp/sequence_classification.py +++ b/modelscope/models/nlp/sequence_classification.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from abc import abstractmethod from torch import nn diff --git a/modelscope/models/nlp/task_models/information_extraction.py b/modelscope/models/nlp/task_models/information_extraction.py index 20a44787..4792d07c 100644 --- a/modelscope/models/nlp/task_models/information_extraction.py +++ b/modelscope/models/nlp/task_models/information_extraction.py @@ -1,7 +1,7 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict import numpy as np -import torch from modelscope.metainfo import TaskModels from modelscope.models.builder import MODELS @@ -9,9 +9,6 @@ from modelscope.models.nlp.task_models.task_model import \ SingleBackboneTaskModelBase from modelscope.outputs import OutputKeys from modelscope.utils.constant import Tasks -from modelscope.utils.hub import parse_label_mapping -from modelscope.utils.tensor_utils import (torch_nested_detach, - torch_nested_numpify) __all__ = ['InformationExtractionModel'] diff --git a/modelscope/models/nlp/task_models/sequence_classification.py b/modelscope/models/nlp/task_models/sequence_classification.py index 80bfd476..43a96327 100644 --- a/modelscope/models/nlp/task_models/sequence_classification.py +++ b/modelscope/models/nlp/task_models/sequence_classification.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os from typing import Any, Dict diff --git a/modelscope/models/nlp/task_models/task_model.py b/modelscope/models/nlp/task_models/task_model.py index 104b4c32..e93dd5f6 100644 --- a/modelscope/models/nlp/task_models/task_model.py +++ b/modelscope/models/nlp/task_models/task_model.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os.path import re from abc import ABC diff --git a/modelscope/models/nlp/task_models/token_classification.py b/modelscope/models/nlp/task_models/token_classification.py index 29679838..5c22098f 100644 --- a/modelscope/models/nlp/task_models/token_classification.py +++ b/modelscope/models/nlp/task_models/token_classification.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import Any, Dict import numpy as np diff --git a/modelscope/models/nlp/token_classification.py b/modelscope/models/nlp/token_classification.py index 0be921d0..c3723a61 100644 --- a/modelscope/models/nlp/token_classification.py +++ b/modelscope/models/nlp/token_classification.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from abc import abstractmethod from typing import Dict diff --git a/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py b/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py index 0d2c96d7..79d32ace 100644 --- a/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py +++ b/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict, Union from modelscope.metainfo import Pipelines diff --git a/modelscope/pipelines/nlp/distributed_plug_pipeline.py b/modelscope/pipelines/nlp/distributed_plug_pipeline.py index 202e6213..e5c05e86 100644 --- a/modelscope/pipelines/nlp/distributed_plug_pipeline.py +++ b/modelscope/pipelines/nlp/distributed_plug_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict import torch diff --git a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py index 65831a17..1d46d8fd 100644 --- a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict, Union import torch diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py index db6b61c6..12f4b80f 100644 --- a/modelscope/pipelines/nlp/fill_mask_pipeline.py +++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os from typing import Any, Dict, Optional, Union diff --git a/modelscope/pipelines/nlp/information_extraction_pipeline.py b/modelscope/pipelines/nlp/information_extraction_pipeline.py index 4cb138d6..07223d07 100644 --- a/modelscope/pipelines/nlp/information_extraction_pipeline.py +++ b/modelscope/pipelines/nlp/information_extraction_pipeline.py @@ -1,11 +1,12 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict, Optional, Union import torch from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import (Preprocessor, RelationExtractionPreprocessor) diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py index 8fbdde86..467d7aba 100644 --- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py +++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ from typing import Any, Dict, Optional, Union import torch diff --git a/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py index 5248db8c..bdb75c73 100644 --- a/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py +++ b/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Union from modelscope.models.base import Model diff --git a/modelscope/pipelines/nlp/passage_ranking_pipeline.py b/modelscope/pipelines/nlp/passage_ranking_pipeline.py index c03e7b93..1d818ac0 100644 --- a/modelscope/pipelines/nlp/passage_ranking_pipeline.py +++ b/modelscope/pipelines/nlp/passage_ranking_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict, Optional, Union import torch diff --git a/modelscope/pipelines/nlp/sentence_embedding_pipeline.py b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py index 3ef6d06b..16dedb2e 100644 --- a/modelscope/pipelines/nlp/sentence_embedding_pipeline.py +++ b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict, Optional, Union import torch diff --git a/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py b/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py index 28bbc732..3d8e8fea 100644 --- a/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py +++ b/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict, Union import numpy as np diff --git a/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py index 844c6839..0a2f6d25 100644 --- a/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py +++ b/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Union from ...metainfo import Pipelines diff --git a/modelscope/pipelines/nlp/text_error_correction_pipeline.py b/modelscope/pipelines/nlp/text_error_correction_pipeline.py index b63d8d36..8e9bf85d 100644 --- a/modelscope/pipelines/nlp/text_error_correction_pipeline.py +++ b/modelscope/pipelines/nlp/text_error_correction_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict, Optional, Union import torch diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py index 3d27ffa9..ea35763f 100644 --- a/modelscope/pipelines/nlp/text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text_generation_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict, Optional, Union import torch diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py index 804f8146..aabf48d8 100644 --- a/modelscope/pipelines/nlp/token_classification_pipeline.py +++ b/modelscope/pipelines/nlp/token_classification_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ from typing import Any, Dict, Optional, Union import torch diff --git a/modelscope/pipelines/nlp/translation_pipeline.py b/modelscope/pipelines/nlp/translation_pipeline.py index e4893577..eb7f7f74 100644 --- a/modelscope/pipelines/nlp/translation_pipeline.py +++ b/modelscope/pipelines/nlp/translation_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py index 7e8b22bc..9d4bb67f 100644 --- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py +++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict, Optional, Union import torch diff --git a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py index 38c0ee77..fc7051c7 100644 --- a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py +++ b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict, Union import torch diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index e37b3324..b4be1845 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -24,7 +24,8 @@ if TYPE_CHECKING: TextErrorCorrectionPreprocessor, FaqQuestionAnsweringPreprocessor, SequenceLabelingPreprocessor, RelationExtractionPreprocessor, DocumentSegmentationPreprocessor, FillMaskPoNetPreprocessor, - PassageRankingPreprocessor, Text2TextGenerationPreprocessor, + PassageRankingPreprocessor, SentenceEmbeddingPreprocessor, + Text2TextGenerationPreprocessor, WordSegmentationBlankSetToLabelPreprocessor) from .space import (DialogIntentPredictionPreprocessor, DialogModelingPreprocessor, diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index f305df27..8e75ae98 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -15,6 +15,7 @@ if TYPE_CHECKING: FaqQuestionAnsweringPreprocessor, SequenceLabelingPreprocessor, RelationExtractionPreprocessor, DocumentSegmentationPreprocessor, FillMaskPoNetPreprocessor, PassageRankingPreprocessor, + SentenceEmbeddingPreprocessor, WordSegmentationBlankSetToLabelPreprocessor) else: diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index d294f517..d6325eed 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -23,16 +23,24 @@ from modelscope.utils.type_assert import type_assert logger = get_logger() __all__ = [ - 'Tokenize', 'SequenceClassificationPreprocessor', - 'TextGenerationPreprocessor', 'TokenClassificationPreprocessor', + 'Tokenize', + 'SequenceClassificationPreprocessor', + 'TextGenerationPreprocessor', + 'TokenClassificationPreprocessor', 'PairSentenceClassificationPreprocessor', 'Text2TextGenerationPreprocessor', - 'SingleSentenceClassificationPreprocessor', 'FillMaskPreprocessor', - 'ZeroShotClassificationPreprocessor', 'NERPreprocessor', - 'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor', - 'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor', - 'RelationExtractionPreprocessor', 'DocumentSegmentationPreprocessor', - 'FillMaskPoNetPreprocessor' + 
'SingleSentenceClassificationPreprocessor', + 'FillMaskPreprocessor', + 'ZeroShotClassificationPreprocessor', + 'NERPreprocessor', + 'SentenceEmbeddingPreprocessor', + 'PassageRankingPreprocessor', + 'FaqQuestionAnsweringPreprocessor', + 'SequenceLabelingPreprocessor', + 'RelationExtractionPreprocessor', + 'DocumentSegmentationPreprocessor', + 'FillMaskPoNetPreprocessor', + 'WordSegmentationBlankSetToLabelPreprocessor', ] diff --git a/modelscope/trainers/nlp/csanmt_translation_trainer.py b/modelscope/trainers/nlp/csanmt_translation_trainer.py index 62ae91a8..c93599c7 100644 --- a/modelscope/trainers/nlp/csanmt_translation_trainer.py +++ b/modelscope/trainers/nlp/csanmt_translation_trainer.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os.path as osp from typing import Dict, Optional diff --git a/modelscope/trainers/nlp/passage_ranking_trainer.py b/modelscope/trainers/nlp/passage_ranking_trainer.py index e54c2904..711fd0c4 100644 --- a/modelscope/trainers/nlp/passage_ranking_trainer.py +++ b/modelscope/trainers/nlp/passage_ranking_trainer.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import time from dataclasses import dataclass from typing import Any, Callable, Dict, List, Optional, Tuple, Union diff --git a/modelscope/trainers/nlp/sequence_classification_trainer.py b/modelscope/trainers/nlp/sequence_classification_trainer.py index 64fd59b4..ec46e037 100644 --- a/modelscope/trainers/nlp/sequence_classification_trainer.py +++ b/modelscope/trainers/nlp/sequence_classification_trainer.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import time from typing import Dict, Optional, Tuple, Union diff --git a/modelscope/trainers/nlp/space/dialog_intent_trainer.py b/modelscope/trainers/nlp/space/dialog_intent_trainer.py index c559ee5b..2e59cd80 100644 --- a/modelscope/trainers/nlp/space/dialog_intent_trainer.py +++ b/modelscope/trainers/nlp/space/dialog_intent_trainer.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os import time from typing import Callable, Dict, Optional, Tuple, Union diff --git a/modelscope/trainers/nlp/space/dialog_modeling_trainer.py b/modelscope/trainers/nlp/space/dialog_modeling_trainer.py index 6bdd8a3a..726404d4 100644 --- a/modelscope/trainers/nlp/space/dialog_modeling_trainer.py +++ b/modelscope/trainers/nlp/space/dialog_modeling_trainer.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os import time from typing import Callable, Dict, Optional, Tuple, Union diff --git a/modelscope/trainers/nlp/space/metrics/metrics_tracker.py b/modelscope/trainers/nlp/space/metrics/metrics_tracker.py index 865600d3..340077a6 100644 --- a/modelscope/trainers/nlp/space/metrics/metrics_tracker.py +++ b/modelscope/trainers/nlp/space/metrics/metrics_tracker.py @@ -1,6 +1,4 @@ -""" -MetricsTracker class -""" +# Copyright (c) Alibaba, Inc. and its affiliates. import math from collections import defaultdict diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py index 4a14be31..b54aa666 100644 --- a/modelscope/trainers/nlp_trainer.py +++ b/modelscope/trainers/nlp_trainer.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ import os from typing import Callable, Optional, Tuple, Union From c8be0e8b7837ef4d31c8a8c33d9238b0516a5d15 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Tue, 27 Sep 2022 09:45:19 +0800 Subject: [PATCH 09/23] [to #44902165] remove device placement for image cartoon to avoid full gpu memory usage Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10260495 --- modelscope/pipelines/cv/image_cartoon_pipeline.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/modelscope/pipelines/cv/image_cartoon_pipeline.py b/modelscope/pipelines/cv/image_cartoon_pipeline.py index 72fda989..787aa06d 100644 --- a/modelscope/pipelines/cv/image_cartoon_pipeline.py +++ b/modelscope/pipelines/cv/image_cartoon_pipeline.py @@ -37,15 +37,12 @@ class ImageCartoonPipeline(Pipeline): model: model id on modelscope hub. """ super().__init__(model=model, **kwargs) - with device_placement(self.framework, self.device_name): - self.facer = FaceAna(self.model) - with tf.Graph().as_default(): - self.sess_anime_head = self.load_sess( - os.path.join(self.model, 'cartoon_h.pb'), - 'model_anime_head') - self.sess_anime_bg = self.load_sess( - os.path.join(self.model, 'cartoon_bg.pb'), - 'model_anime_bg') + self.facer = FaceAna(self.model) + with tf.Graph().as_default(): + self.sess_anime_head = self.load_sess( + os.path.join(self.model, 'cartoon_h.pb'), 'model_anime_head') + self.sess_anime_bg = self.load_sess( + os.path.join(self.model, 'cartoon_bg.pb'), 'model_anime_bg') self.box_width = 288 global_mask = cv2.imread(os.path.join(self.model, 'alpha.jpg')) From 26df8f198820c3c079e38c8fdb94c2fd4d836581 Mon Sep 17 00:00:00 2001 From: "wendi.hwd" Date: Tue, 27 Sep 2022 15:01:05 +0800 Subject: [PATCH 10/23] [to #42322933]add semantic-segmentation task output is numpy mask for demo-service Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10265856 --- modelscope/models/cv/salient_detection/salient_model.py | 3 ++- modelscope/outputs.py | 6 ++++++ .../pipelines/cv/image_salient_detection_pipeline.py | 8 ++------ modelscope/utils/constant.py | 1 + tests/pipelines/test_salient_detection.py | 5 ++--- 5 files changed, 13 insertions(+), 10 deletions(-) diff --git a/modelscope/models/cv/salient_detection/salient_model.py b/modelscope/models/cv/salient_detection/salient_model.py index 6e617f58..73c3c3fb 100644 --- a/modelscope/models/cv/salient_detection/salient_model.py +++ b/modelscope/models/cv/salient_detection/salient_model.py @@ -14,7 +14,8 @@ from modelscope.utils.constant import ModelFile, Tasks from .models import U2NET -@MODELS.register_module(Tasks.image_segmentation, module_name=Models.detection) +@MODELS.register_module( + Tasks.semantic_segmentation, module_name=Models.detection) class SalientDetection(TorchModel): def __init__(self, model_dir: str, *args, **kwargs): diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 052d4f33..b19f7e43 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -151,6 +151,12 @@ TASK_OUTPUTS = { Tasks.image_segmentation: [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.MASKS], + # semantic segmentation result for single sample + # { + # "masks": [np.array # 2D array containing only 0, 255] + # } + Tasks.semantic_segmentation: [OutputKeys.MASKS], + # image matting result for single sample # { # "output_img": np.array with shape(h, w, 4) diff --git a/modelscope/pipelines/cv/image_salient_detection_pipeline.py b/modelscope/pipelines/cv/image_salient_detection_pipeline.py index 433275ba..3b145cf0 100644 --- 
a/modelscope/pipelines/cv/image_salient_detection_pipeline.py +++ b/modelscope/pipelines/cv/image_salient_detection_pipeline.py @@ -9,7 +9,7 @@ from modelscope.utils.constant import Tasks @PIPELINES.register_module( - Tasks.image_segmentation, module_name=Pipelines.salient_detection) + Tasks.semantic_segmentation, module_name=Pipelines.salient_detection) class ImageSalientDetectionPipeline(Pipeline): def __init__(self, model: str, **kwargs): @@ -39,9 +39,5 @@ class ImageSalientDetectionPipeline(Pipeline): def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: data = self.model.postprocess(inputs) - outputs = { - OutputKeys.SCORES: None, - OutputKeys.LABELS: None, - OutputKeys.MASKS: data - } + outputs = {OutputKeys.MASKS: data} return outputs diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 4c5d2f41..de3d933f 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -38,6 +38,7 @@ class CVTasks(object): image_object_detection = 'image-object-detection' image_segmentation = 'image-segmentation' + semantic_segmentation = 'semantic-segmentation' portrait_matting = 'portrait-matting' text_driven_segmentation = 'text-driven-segmentation' shop_segmentation = 'shop-segmentation' diff --git a/tests/pipelines/test_salient_detection.py b/tests/pipelines/test_salient_detection.py index e87e9388..bcb904e6 100644 --- a/tests/pipelines/test_salient_detection.py +++ b/tests/pipelines/test_salient_detection.py @@ -11,17 +11,16 @@ from modelscope.utils.test_utils import test_level class SalientDetectionTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: - self.task = Tasks.image_segmentation + self.task = Tasks.semantic_segmentation self.model_id = 'damo/cv_u2net_salient-detection' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_salient_detection(self): input_location = 'data/test/images/image_salient_detection.jpg' model_id = 'damo/cv_u2net_salient-detection' salient_detect = pipeline(Tasks.semantic_segmentation, model=model_id) result = salient_detect(input_location) import cv2 - # result[OutputKeys.MASKS] is salient map result,other keys are not used cv2.imwrite(input_location + '_salient.jpg', result[OutputKeys.MASKS]) @unittest.skip('demo compatibility test is only enabled on a needed-basis') From e90ff9e4795129eb8d64a2c4b67b3833217c7e1b Mon Sep 17 00:00:00 2001 From: "jiaqi.sjq" Date: Tue, 27 Sep 2022 22:09:30 +0800 Subject: [PATCH 11/23] [to #42322933] tts sambert am changes from tensorflow to PyTorch and add licenses * [to #41669377] docs and tools refinement and release 1. add build_doc linter script 2. add sphinx-docs support 3. add development doc and api doc 4. 
change version to 0.1.0 for the first internal release version Link: https://code.aone.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/8775307 --- .../models/audio/tts/models/__init__.py | 9 - .../models/audio/tts/models/am_models.py | 460 ------- modelscope/models/audio/tts/models/compat.py | 82 -- .../tts/{text => models/datasets}/__init__.py | 0 .../tts/models/datasets/kantts_data4fs.py | 238 ++++ .../audio/tts/models/datasets/samplers.py | 131 ++ .../tts/models/datasets/units/__init__.py | 3 + .../tts/models/datasets/units/cleaners.py | 88 ++ .../tts/models/datasets/units/ling_unit.py | 395 ++++++ .../datasets/units}/numbers.py | 3 + modelscope/models/audio/tts/models/fsmn.py | 273 ---- .../models/audio/tts/models/fsmn_encoder.py | 178 --- modelscope/models/audio/tts/models/helpers.py | 159 --- .../audio/tts/models/models/__init__.py | 0 .../tts/models/models/hifigan/__init__.py | 3 + .../tts/models/models/hifigan/hifigan.py | 238 ++++ .../tts/models/models/sambert/__init__.py | 3 + .../tts/models/models/sambert/adaptors.py | 131 ++ .../audio/tts/models/models/sambert/base.py | 369 ++++++ .../audio/tts/models/models/sambert/fsmn.py | 126 ++ .../models/models/sambert/kantts_sambert.py | 718 ++++++++++ .../tts/models/models/sambert/positions.py | 101 ++ .../models/audio/tts/models/position.py | 174 --- modelscope/models/audio/tts/models/reducer.py | 155 --- .../models/audio/tts/models/rnn_wrappers.py | 237 ---- .../models/audio/tts/models/robutrans.py | 760 ----------- .../tts/models/self_attention_decoder.py | 817 ------------ .../tts/models/self_attention_encoder.py | 182 --- .../models/audio/tts/models/transformer.py | 1157 ----------------- modelscope/models/audio/tts/models/utils.py | 59 - .../models/audio/tts/models/utils/__init__.py | 3 + .../models/audio/tts/models/utils/utils.py | 136 ++ .../models/audio/tts/models/vocoder_models.py | 516 -------- modelscope/models/audio/tts/sambert_hifi.py | 34 +- modelscope/models/audio/tts/text/cleaners.py | 89 -- modelscope/models/audio/tts/text/cmudict.py | 64 - modelscope/models/audio/tts/text/symbols.py | 105 -- .../models/audio/tts/text/symbols_dict.py | 200 --- modelscope/models/audio/tts/voice.py | 333 ++--- .../audio/text_to_speech_pipeline.py | 5 + modelscope/utils/audio/tts_exceptions.py | 3 +- requirements/audio.txt | 5 - tests/pipelines/test_text_to_speech.py | 5 +- 43 files changed, 2799 insertions(+), 5948 deletions(-) mode change 100755 => 100644 modelscope/models/audio/tts/models/__init__.py delete mode 100755 modelscope/models/audio/tts/models/am_models.py delete mode 100755 modelscope/models/audio/tts/models/compat.py rename modelscope/models/audio/tts/{text => models/datasets}/__init__.py (100%) mode change 100755 => 100644 create mode 100644 modelscope/models/audio/tts/models/datasets/kantts_data4fs.py create mode 100644 modelscope/models/audio/tts/models/datasets/samplers.py create mode 100644 modelscope/models/audio/tts/models/datasets/units/__init__.py create mode 100644 modelscope/models/audio/tts/models/datasets/units/cleaners.py create mode 100644 modelscope/models/audio/tts/models/datasets/units/ling_unit.py rename modelscope/models/audio/tts/{text => models/datasets/units}/numbers.py (94%) mode change 100755 => 100644 delete mode 100755 modelscope/models/audio/tts/models/fsmn.py delete mode 100755 modelscope/models/audio/tts/models/fsmn_encoder.py delete mode 100755 modelscope/models/audio/tts/models/helpers.py create mode 100644 modelscope/models/audio/tts/models/models/__init__.py create mode 100644 
modelscope/models/audio/tts/models/models/hifigan/__init__.py create mode 100755 modelscope/models/audio/tts/models/models/hifigan/hifigan.py create mode 100644 modelscope/models/audio/tts/models/models/sambert/__init__.py create mode 100644 modelscope/models/audio/tts/models/models/sambert/adaptors.py create mode 100644 modelscope/models/audio/tts/models/models/sambert/base.py create mode 100644 modelscope/models/audio/tts/models/models/sambert/fsmn.py create mode 100644 modelscope/models/audio/tts/models/models/sambert/kantts_sambert.py create mode 100644 modelscope/models/audio/tts/models/models/sambert/positions.py delete mode 100755 modelscope/models/audio/tts/models/position.py delete mode 100755 modelscope/models/audio/tts/models/reducer.py delete mode 100755 modelscope/models/audio/tts/models/rnn_wrappers.py delete mode 100755 modelscope/models/audio/tts/models/robutrans.py delete mode 100755 modelscope/models/audio/tts/models/self_attention_decoder.py delete mode 100755 modelscope/models/audio/tts/models/self_attention_encoder.py delete mode 100755 modelscope/models/audio/tts/models/transformer.py delete mode 100755 modelscope/models/audio/tts/models/utils.py create mode 100644 modelscope/models/audio/tts/models/utils/__init__.py create mode 100755 modelscope/models/audio/tts/models/utils/utils.py delete mode 100755 modelscope/models/audio/tts/models/vocoder_models.py delete mode 100755 modelscope/models/audio/tts/text/cleaners.py delete mode 100755 modelscope/models/audio/tts/text/cmudict.py delete mode 100644 modelscope/models/audio/tts/text/symbols.py delete mode 100644 modelscope/models/audio/tts/text/symbols_dict.py diff --git a/modelscope/models/audio/tts/models/__init__.py b/modelscope/models/audio/tts/models/__init__.py old mode 100755 new mode 100644 index c260d4fe..e69de29b --- a/modelscope/models/audio/tts/models/__init__.py +++ b/modelscope/models/audio/tts/models/__init__.py @@ -1,9 +0,0 @@ -from .robutrans import RobuTrans -from .vocoder_models import Generator - - -def create_am_model(name, hparams): - if name == 'robutrans': - return RobuTrans(hparams) - else: - raise Exception('Unknown model: ' + name) diff --git a/modelscope/models/audio/tts/models/am_models.py b/modelscope/models/audio/tts/models/am_models.py deleted file mode 100755 index cd43ff12..00000000 --- a/modelscope/models/audio/tts/models/am_models.py +++ /dev/null @@ -1,460 +0,0 @@ -import tensorflow as tf - - -def encoder_prenet(inputs, - n_conv_layers, - filters, - kernel_size, - dense_units, - is_training, - mask=None, - scope='encoder_prenet'): - x = inputs - with tf.variable_scope(scope): - for i in range(n_conv_layers): - x = conv1d( - x, - filters, - kernel_size, - is_training, - activation=tf.nn.relu, - dropout=True, - mask=mask, - scope='conv1d_{}'.format(i)) - x = tf.layers.dense( - x, units=dense_units, activation=None, name='dense') - return x - - -def decoder_prenet(inputs, - prenet_units, - dense_units, - is_training, - scope='decoder_prenet'): - x = inputs - with tf.variable_scope(scope): - for i, units in enumerate(prenet_units): - x = tf.layers.dense( - x, - units=units, - activation=tf.nn.relu, - name='dense_{}'.format(i)) - x = tf.layers.dropout( - x, rate=0.5, training=is_training, name='dropout_{}'.format(i)) - x = tf.layers.dense( - x, units=dense_units, activation=None, name='dense') - return x - - -def encoder(inputs, - input_lengths, - n_conv_layers, - filters, - kernel_size, - lstm_units, - is_training, - embedded_inputs_speaker, - mask=None, - scope='encoder'): - with 
tf.variable_scope(scope): - x = conv_and_lstm( - inputs, - input_lengths, - n_conv_layers, - filters, - kernel_size, - lstm_units, - is_training, - embedded_inputs_speaker, - mask=mask) - return x - - -def prenet(inputs, prenet_units, is_training, scope='prenet'): - x = inputs - with tf.variable_scope(scope): - for i, units in enumerate(prenet_units): - x = tf.layers.dense( - x, - units=units, - activation=tf.nn.relu, - name='dense_{}'.format(i)) - x = tf.layers.dropout( - x, rate=0.5, training=is_training, name='dropout_{}'.format(i)) - return x - - -def postnet_residual_ulstm(inputs, - n_conv_layers, - filters, - kernel_size, - lstm_units, - output_units, - is_training, - scope='postnet_residual_ulstm'): - with tf.variable_scope(scope): - x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size, - lstm_units, is_training) - x = conv1d( - x, - output_units, - kernel_size, - is_training, - activation=None, - dropout=False, - scope='conv1d_{}'.format(n_conv_layers - 1)) - return x - - -def postnet_residual_lstm(inputs, - n_conv_layers, - filters, - kernel_size, - lstm_units, - output_units, - is_training, - scope='postnet_residual_lstm'): - with tf.variable_scope(scope): - x = conv_and_lstm(inputs, None, n_conv_layers, filters, kernel_size, - lstm_units, is_training) - x = conv1d( - x, - output_units, - kernel_size, - is_training, - activation=None, - dropout=False, - scope='conv1d_{}'.format(n_conv_layers - 1)) - return x - - -def postnet_linear_ulstm(inputs, - n_conv_layers, - filters, - kernel_size, - lstm_units, - output_units, - is_training, - scope='postnet_linear'): - with tf.variable_scope(scope): - x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size, - lstm_units, is_training) - x = tf.layers.dense(x, units=output_units) - return x - - -def postnet_linear_lstm(inputs, - n_conv_layers, - filters, - kernel_size, - lstm_units, - output_units, - output_lengths, - is_training, - embedded_inputs_speaker2, - mask=None, - scope='postnet_linear'): - with tf.variable_scope(scope): - x = conv_and_lstm_dec( - inputs, - output_lengths, - n_conv_layers, - filters, - kernel_size, - lstm_units, - is_training, - embedded_inputs_speaker2, - mask=mask) - x = tf.layers.dense(x, units=output_units) - return x - - -def postnet_linear(inputs, - n_conv_layers, - filters, - kernel_size, - lstm_units, - output_units, - output_lengths, - is_training, - embedded_inputs_speaker2, - mask=None, - scope='postnet_linear'): - with tf.variable_scope(scope): - x = conv_dec( - inputs, - output_lengths, - n_conv_layers, - filters, - kernel_size, - lstm_units, - is_training, - embedded_inputs_speaker2, - mask=mask) - return x - - -def conv_and_lstm(inputs, - sequence_lengths, - n_conv_layers, - filters, - kernel_size, - lstm_units, - is_training, - embedded_inputs_speaker, - mask=None, - scope='conv_and_lstm'): - from tensorflow.contrib.rnn import LSTMBlockCell - x = inputs - with tf.variable_scope(scope): - for i in range(n_conv_layers): - x = conv1d( - x, - filters, - kernel_size, - is_training, - activation=tf.nn.relu, - dropout=True, - mask=mask, - scope='conv1d_{}'.format(i)) - - x = tf.concat([x, embedded_inputs_speaker], axis=2) - - outputs, states = tf.nn.bidirectional_dynamic_rnn( - LSTMBlockCell(lstm_units), - LSTMBlockCell(lstm_units), - x, - sequence_length=sequence_lengths, - dtype=tf.float32) - x = tf.concat(outputs, axis=-1) - - return x - - -def conv_and_lstm_dec(inputs, - sequence_lengths, - n_conv_layers, - filters, - kernel_size, - lstm_units, - is_training, - 
embedded_inputs_speaker2, - mask=None, - scope='conv_and_lstm'): - x = inputs - from tensorflow.contrib.rnn import LSTMBlockCell - with tf.variable_scope(scope): - for i in range(n_conv_layers): - x = conv1d( - x, - filters, - kernel_size, - is_training, - activation=tf.nn.relu, - dropout=True, - mask=mask, - scope='conv1d_{}'.format(i)) - - x = tf.concat([x, embedded_inputs_speaker2], axis=2) - - outputs, states = tf.nn.bidirectional_dynamic_rnn( - LSTMBlockCell(lstm_units), - LSTMBlockCell(lstm_units), - x, - sequence_length=sequence_lengths, - dtype=tf.float32) - x = tf.concat(outputs, axis=-1) - return x - - -def conv_dec(inputs, - sequence_lengths, - n_conv_layers, - filters, - kernel_size, - lstm_units, - is_training, - embedded_inputs_speaker2, - mask=None, - scope='conv_and_lstm'): - x = inputs - with tf.variable_scope(scope): - for i in range(n_conv_layers): - x = conv1d( - x, - filters, - kernel_size, - is_training, - activation=tf.nn.relu, - dropout=True, - mask=mask, - scope='conv1d_{}'.format(i)) - x = tf.concat([x, embedded_inputs_speaker2], axis=2) - return x - - -def conv_and_ulstm(inputs, - sequence_lengths, - n_conv_layers, - filters, - kernel_size, - lstm_units, - is_training, - scope='conv_and_ulstm'): - x = inputs - with tf.variable_scope(scope): - for i in range(n_conv_layers): - x = conv1d( - x, - filters, - kernel_size, - is_training, - activation=tf.nn.relu, - dropout=True, - scope='conv1d_{}'.format(i)) - - outputs, states = tf.nn.dynamic_rnn( - LSTMBlockCell(lstm_units), - x, - sequence_length=sequence_lengths, - dtype=tf.float32) - - return outputs - - -def conv1d(inputs, - filters, - kernel_size, - is_training, - activation=None, - dropout=False, - mask=None, - scope='conv1d'): - with tf.variable_scope(scope): - if mask is not None: - inputs = inputs * tf.expand_dims(mask, -1) - x = tf.layers.conv1d( - inputs, filters=filters, kernel_size=kernel_size, padding='same') - if mask is not None: - x = x * tf.expand_dims(mask, -1) - - x = tf.layers.batch_normalization(x, training=is_training) - if activation is not None: - x = activation(x) - if dropout: - x = tf.layers.dropout(x, rate=0.5, training=is_training) - return x - - -def conv1d_dp(inputs, - filters, - kernel_size, - is_training, - activation=None, - dropout=False, - dropoutrate=0.5, - mask=None, - scope='conv1d'): - with tf.variable_scope(scope): - if mask is not None: - inputs = inputs * tf.expand_dims(mask, -1) - x = tf.layers.conv1d( - inputs, filters=filters, kernel_size=kernel_size, padding='same') - if mask is not None: - x = x * tf.expand_dims(mask, -1) - - x = tf.contrib.layers.layer_norm(x) - if activation is not None: - x = activation(x) - if dropout: - x = tf.layers.dropout(x, rate=dropoutrate, training=is_training) - return x - - -def duration_predictor(inputs, - n_conv_layers, - filters, - kernel_size, - lstm_units, - input_lengths, - is_training, - embedded_inputs_speaker, - mask=None, - scope='duration_predictor'): - with tf.variable_scope(scope): - x = inputs - for i in range(n_conv_layers): - x = conv1d_dp( - x, - filters, - kernel_size, - is_training, - activation=tf.nn.relu, - dropout=True, - dropoutrate=0.1, - mask=mask, - scope='conv1d_{}'.format(i)) - - x = tf.concat([x, embedded_inputs_speaker], axis=2) - - outputs, states = tf.nn.bidirectional_dynamic_rnn( - LSTMBlockCell(lstm_units), - LSTMBlockCell(lstm_units), - x, - sequence_length=input_lengths, - dtype=tf.float32) - x = tf.concat(outputs, axis=-1) - - x = tf.layers.dense(x, units=1) - x = tf.nn.relu(x) - return x - - -def 
duration_predictor2(inputs, - n_conv_layers, - filters, - kernel_size, - input_lengths, - is_training, - mask=None, - scope='duration_predictor'): - with tf.variable_scope(scope): - x = inputs - for i in range(n_conv_layers): - x = conv1d_dp( - x, - filters, - kernel_size, - is_training, - activation=tf.nn.relu, - dropout=True, - dropoutrate=0.1, - mask=mask, - scope='conv1d_{}'.format(i)) - - x = tf.layers.dense(x, units=1) - x = tf.nn.relu(x) - return x - - -def conv_prenet(inputs, - n_conv_layers, - filters, - kernel_size, - is_training, - mask=None, - scope='conv_prenet'): - x = inputs - with tf.variable_scope(scope): - for i in range(n_conv_layers): - x = conv1d( - x, - filters, - kernel_size, - is_training, - activation=tf.nn.relu, - dropout=True, - mask=mask, - scope='conv1d_{}'.format(i)) - - return x diff --git a/modelscope/models/audio/tts/models/compat.py b/modelscope/models/audio/tts/models/compat.py deleted file mode 100755 index bb810841..00000000 --- a/modelscope/models/audio/tts/models/compat.py +++ /dev/null @@ -1,82 +0,0 @@ -"""Functions for compatibility with different TensorFlow versions.""" - -import tensorflow as tf - - -def is_tf2(): - """Returns ``True`` if running TensorFlow 2.0.""" - return tf.__version__.startswith('2') - - -def tf_supports(symbol): - """Returns ``True`` if TensorFlow defines :obj:`symbol`.""" - return _string_to_tf_symbol(symbol) is not None - - -def tf_any(*symbols): - """Returns the first supported symbol.""" - for symbol in symbols: - module = _string_to_tf_symbol(symbol) - if module is not None: - return module - return None - - -def tf_compat(v2=None, v1=None): # pylint: disable=invalid-name - """Returns the compatible symbol based on the current TensorFlow version. - - Args: - v2: The candidate v2 symbol name. - v1: The candidate v1 symbol name. - - Returns: - A TensorFlow symbol. - - Raises: - ValueError: if no symbol can be found. 
- """ - candidates = [] - if v2 is not None: - candidates.append(v2) - if v1 is not None: - candidates.append(v1) - candidates.append('compat.v1.%s' % v1) - symbol = tf_any(*candidates) - if symbol is None: - raise ValueError('Failure to resolve the TensorFlow symbol') - return symbol - - -def name_from_variable_scope(name=''): - """Creates a name prefixed by the current variable scope.""" - var_scope = tf_compat(v1='get_variable_scope')().name - compat_name = '' - if name: - compat_name = '%s/' % name - if var_scope: - compat_name = '%s/%s' % (var_scope, compat_name) - return compat_name - - -def reuse(): - """Returns ``True`` if the current variable scope is marked for reuse.""" - return tf_compat(v1='get_variable_scope')().reuse - - -def _string_to_tf_symbol(symbol): - modules = symbol.split('.') - namespace = tf - for module in modules: - namespace = getattr(namespace, module, None) - if namespace is None: - return None - return namespace - - -# pylint: disable=invalid-name -gfile_copy = tf_compat(v2='io.gfile.copy', v1='gfile.Copy') -gfile_exists = tf_compat(v2='io.gfile.exists', v1='gfile.Exists') -gfile_open = tf_compat(v2='io.gfile.GFile', v1='gfile.GFile') -is_tensor = tf_compat(v2='is_tensor', v1='contrib.framework.is_tensor') -logging = tf_compat(v1='logging') -nest = tf_compat(v2='nest', v1='contrib.framework.nest') diff --git a/modelscope/models/audio/tts/text/__init__.py b/modelscope/models/audio/tts/models/datasets/__init__.py old mode 100755 new mode 100644 similarity index 100% rename from modelscope/models/audio/tts/text/__init__.py rename to modelscope/models/audio/tts/models/datasets/__init__.py diff --git a/modelscope/models/audio/tts/models/datasets/kantts_data4fs.py b/modelscope/models/audio/tts/models/datasets/kantts_data4fs.py new file mode 100644 index 00000000..cc47d0c4 --- /dev/null +++ b/modelscope/models/audio/tts/models/datasets/kantts_data4fs.py @@ -0,0 +1,238 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import os + +import json +import numpy as np +import torch +from torch.utils.data import Dataset +from tqdm import tqdm + +from modelscope.utils.logger import get_logger +from .units import KanTtsLinguisticUnit + +logger = get_logger() + + +class KanTtsText2MelDataset(Dataset): + + def __init__(self, metadata_filename, config_filename, cache=False): + super(KanTtsText2MelDataset, self).__init__() + + self.cache = cache + + with open(config_filename) as f: + self._config = json.loads(f.read()) + + # Load metadata: + self._datadir = os.path.dirname(metadata_filename) + with open(metadata_filename, encoding='utf-8') as f: + self._metadata = [line.strip().split('|') for line in f] + self._length_lst = [int(x[2]) for x in self._metadata] + hours = sum( + self._length_lst) * self._config['audio']['frame_shift_ms'] / ( + 3600 * 1000) + + logger.info('Loaded metadata for %d examples (%.2f hours)' % + (len(self._metadata), hours)) + logger.info('Minimum length: %d, Maximum length: %d' % + (min(self._length_lst), max(self._length_lst))) + + self.ling_unit = KanTtsLinguisticUnit(config_filename) + self.pad_executor = KanTtsText2MelPad() + + self.r = self._config['am']['outputs_per_step'] + self.num_mels = self._config['am']['num_mels'] + + if 'adv' in self._config: + self.feat_window = self._config['adv']['random_window'] + else: + self.feat_window = None + logger.info(self.feat_window) + + self.data_cache = [ + self.cache_load(i) for i in tqdm(range(self.__len__())) + ] if self.cache else [] + + def get_frames_lst(self): + return self._length_lst + + def __getitem__(self, index): + if self.cache: + sample = self.data_cache[index] + return sample + + return self.cache_load(index) + + def cache_load(self, index): + sample = {} + + meta = self._metadata[index] + + sample['utt_id'] = meta[0] + + sample['mel_target'] = np.load(os.path.join( + self._datadir, meta[1]))[:, :self.num_mels] + sample['output_length'] = len(sample['mel_target']) + + lfeat_symbol = meta[3] + sample['ling'] = self.ling_unit.encode_symbol_sequence(lfeat_symbol) + + sample['duration'] = np.load(os.path.join(self._datadir, meta[4])) + + sample['pitch_contour'] = np.load(os.path.join(self._datadir, meta[5])) + + sample['energy_contour'] = np.load( + os.path.join(self._datadir, meta[6])) + + return sample + + def __len__(self): + return len(self._metadata) + + def collate_fn(self, batch): + data_dict = {} + + max_input_length = max((len(x['ling'][0]) for x in batch)) + + # pure linguistic info: sy|tone|syllable_flag|word_segment + + # sy + lfeat_type = self.ling_unit._lfeat_type_list[0] + inputs_sy = self.pad_executor._prepare_scalar_inputs( + [x['ling'][0] for x in batch], max_input_length, + self.ling_unit._sub_unit_pad[lfeat_type]).long() + # tone + lfeat_type = self.ling_unit._lfeat_type_list[1] + inputs_tone = self.pad_executor._prepare_scalar_inputs( + [x['ling'][1] for x in batch], max_input_length, + self.ling_unit._sub_unit_pad[lfeat_type]).long() + + # syllable_flag + lfeat_type = self.ling_unit._lfeat_type_list[2] + inputs_syllable_flag = self.pad_executor._prepare_scalar_inputs( + [x['ling'][2] for x in batch], max_input_length, + self.ling_unit._sub_unit_pad[lfeat_type]).long() + + # word_segment + lfeat_type = self.ling_unit._lfeat_type_list[3] + inputs_ws = self.pad_executor._prepare_scalar_inputs( + [x['ling'][3] for x in batch], max_input_length, + self.ling_unit._sub_unit_pad[lfeat_type]).long() + + # emotion category + lfeat_type = self.ling_unit._lfeat_type_list[4] + data_dict['input_emotions'] = 
self.pad_executor._prepare_scalar_inputs( + [x['ling'][4] for x in batch], max_input_length, + self.ling_unit._sub_unit_pad[lfeat_type]).long() + + # speaker category + lfeat_type = self.ling_unit._lfeat_type_list[5] + data_dict['input_speakers'] = self.pad_executor._prepare_scalar_inputs( + [x['ling'][5] for x in batch], max_input_length, + self.ling_unit._sub_unit_pad[lfeat_type]).long() + + data_dict['input_lings'] = torch.stack( + [inputs_sy, inputs_tone, inputs_syllable_flag, inputs_ws], dim=2) + + data_dict['valid_input_lengths'] = torch.as_tensor( + [len(x['ling'][0]) - 1 for x in batch], dtype=torch.long + ) # There is one '~' in the last of symbol sequence. We put length-1 for calculation. + + data_dict['valid_output_lengths'] = torch.as_tensor( + [x['output_length'] for x in batch], dtype=torch.long) + max_output_length = torch.max(data_dict['valid_output_lengths']).item() + max_output_round_length = self.pad_executor._round_up( + max_output_length, self.r) + + if self.feat_window is not None: + active_feat_len = np.minimum(max_output_round_length, + self.feat_window) + if active_feat_len < self.feat_window: + max_output_round_length = self.pad_executor._round_up( + self.feat_window, self.r) + active_feat_len = self.feat_window + + max_offsets = [x['output_length'] - active_feat_len for x in batch] + feat_offsets = [ + np.random.randint(0, np.maximum(1, offset)) + for offset in max_offsets + ] + feat_offsets = torch.from_numpy( + np.asarray(feat_offsets, dtype=np.int32)).long() + data_dict['feat_offsets'] = feat_offsets + + data_dict['mel_targets'] = self.pad_executor._prepare_targets( + [x['mel_target'] for x in batch], max_output_round_length, 0.0) + data_dict['durations'] = self.pad_executor._prepare_durations( + [x['duration'] for x in batch], max_input_length, + max_output_round_length) + + data_dict['pitch_contours'] = self.pad_executor._prepare_scalar_inputs( + [x['pitch_contour'] for x in batch], max_input_length, + 0.0).float() + data_dict[ + 'energy_contours'] = self.pad_executor._prepare_scalar_inputs( + [x['energy_contour'] for x in batch], max_input_length, + 0.0).float() + + data_dict['utt_ids'] = [x['utt_id'] for x in batch] + + return data_dict + + +class KanTtsText2MelPad(object): + + def __init__(self): + super(KanTtsText2MelPad, self).__init__() + pass + + def _pad1D(self, x, length, pad): + return np.pad( + x, (0, length - x.shape[0]), mode='constant', constant_values=pad) + + def _pad2D(self, x, length, pad): + return np.pad( + x, [(0, length - x.shape[0]), (0, 0)], + mode='constant', + constant_values=pad) + + def _pad_durations(self, duration, max_in_len, max_out_len): + framenum = np.sum(duration) + symbolnum = duration.shape[0] + if framenum < max_out_len: + padframenum = max_out_len - framenum + duration = np.insert( + duration, symbolnum, values=padframenum, axis=0) + duration = np.insert( + duration, + symbolnum + 1, + values=[0] * (max_in_len - symbolnum - 1), + axis=0) + else: + if symbolnum < max_in_len: + duration = np.insert( + duration, + symbolnum, + values=[0] * (max_in_len - symbolnum), + axis=0) + return duration + + def _round_up(self, x, multiple): + remainder = x % multiple + return x if remainder == 0 else x + multiple - remainder + + def _prepare_scalar_inputs(self, inputs, max_len, pad): + return torch.from_numpy( + np.stack([self._pad1D(x, max_len, pad) for x in inputs])) + + def _prepare_targets(self, targets, max_len, pad): + return torch.from_numpy( + np.stack([self._pad2D(t, max_len, pad) for t in targets])).float() + + def 
_prepare_durations(self, durations, max_in_len, max_out_len): + return torch.from_numpy( + np.stack([ + self._pad_durations(t, max_in_len, max_out_len) + for t in durations + ])).long() diff --git a/modelscope/models/audio/tts/models/datasets/samplers.py b/modelscope/models/audio/tts/models/datasets/samplers.py new file mode 100644 index 00000000..0657fa8a --- /dev/null +++ b/modelscope/models/audio/tts/models/datasets/samplers.py @@ -0,0 +1,131 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import math +import random + +import torch +from torch import distributed as dist +from torch.utils.data import Sampler + + +class LenSortGroupPoolSampler(Sampler): + + def __init__(self, data_source, length_lst, group_size): + super(LenSortGroupPoolSampler, self).__init__(data_source) + + self.data_source = data_source + self.length_lst = length_lst + self.group_size = group_size + + self.num = len(self.length_lst) + self.buckets = self.num // group_size + + def __iter__(self): + + def getkey(item): + return item[1] + + random_lst = torch.randperm(self.num).tolist() + random_len_lst = [(i, self.length_lst[i]) for i in random_lst] + + # Bucket examples based on similar output sequence length for efficiency: + groups = [ + random_len_lst[i:i + self.group_size] + for i in range(0, self.num, self.group_size) + ] + if (self.num % self.group_size): + groups.append(random_len_lst[self.buckets * self.group_size:-1]) + + indices = [] + + for group in groups: + group.sort(key=getkey, reverse=True) + for item in group: + indices.append(item[0]) + + return iter(indices) + + def __len__(self): + return len(self.data_source) + + +class DistributedLenSortGroupPoolSampler(Sampler): + + def __init__(self, + dataset, + length_lst, + group_size, + num_replicas=None, + rank=None, + shuffle=True): + super(DistributedLenSortGroupPoolSampler, self).__init__(dataset) + + if num_replicas is None: + if not dist.is_available(): + raise RuntimeError( + 'modelscope error: Requires distributed package to be available' + ) + num_replicas = dist.get_world_size() + if rank is None: + if not dist.is_available(): + raise RuntimeError( + 'modelscope error: Requires distributed package to be available' + ) + rank = dist.get_rank() + self.dataset = dataset + self.length_lst = length_lst + self.group_size = group_size + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.num_samples = int( + math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) + self.total_size = self.num_samples * self.num_replicas + self.buckets = self.num_samples // group_size + self.shuffle = shuffle + + def __iter__(self): + + def getkey(item): + return item[1] + + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch) + if self.shuffle: + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = list(range(len(self.dataset))) + + # add extra samples to make it evenly divisible + indices += indices[:(self.total_size - len(indices))] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank:self.total_size:self.num_replicas] + assert len(indices) == self.num_samples + + random_len_lst = [(i, self.length_lst[i]) for i in indices] + + # Bucket examples based on similar output sequence length for efficiency: + groups = [ + random_len_lst[i:i + self.group_size] + for i in range(0, self.num_samples, self.group_size) + ] + if (self.num_samples % self.group_size): + groups.append(random_len_lst[self.buckets * self.group_size:-1]) + + 
new_indices = [] + + for group in groups: + group.sort(key=getkey, reverse=True) + for item in group: + new_indices.append(item[0]) + + return iter(new_indices) + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + self.epoch = epoch diff --git a/modelscope/models/audio/tts/models/datasets/units/__init__.py b/modelscope/models/audio/tts/models/datasets/units/__init__.py new file mode 100644 index 00000000..4d03df04 --- /dev/null +++ b/modelscope/models/audio/tts/models/datasets/units/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from .ling_unit import * # noqa F403 diff --git a/modelscope/models/audio/tts/models/datasets/units/cleaners.py b/modelscope/models/audio/tts/models/datasets/units/cleaners.py new file mode 100644 index 00000000..07d4fbdb --- /dev/null +++ b/modelscope/models/audio/tts/models/datasets/units/cleaners.py @@ -0,0 +1,88 @@ +# from https://github.com/keithito/tacotron +# Cleaners are transformations that run over the input text at both training and eval time. +# +# Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" +# hyperparameter. Some cleaners are English-specific. You'll typically want to use: +# 1. "english_cleaners" for English text +# 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using +# the Unidecode library (https://pypi.python.org/pypi/Unidecode) +# 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update +# the symbols in symbols.py to match your data). + +import re + +from unidecode import unidecode + +from .numbers import normalize_numbers + +# Regular expression matching whitespace: +_whitespace_re = re.compile(r'\s+') + +# List of (regular expression, replacement) pairs for abbreviations: +_abbreviations = [ + (re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) + for x in [('mrs', 'misess'), + ('mr', 'mister'), + ('dr', 'doctor'), + ('st', 'saint'), + ('co', 'company'), + ('jr', 'junior'), + ('maj', 'major'), + ('gen', 'general'), + ('drs', 'doctors'), + ('rev', 'reverend'), + ('lt', 'lieutenant'), + ('hon', 'honorable'), + ('sgt', 'sergeant'), + ('capt', 'captain'), + ('esq', 'esquire'), + ('ltd', 'limited'), + ('col', 'colonel'), + ('ft', 'fort'), ]] # yapf:disable + + +def expand_abbreviations(text): + for regex, replacement in _abbreviations: + text = re.sub(regex, replacement, text) + return text + + +def expand_numbers(text): + return normalize_numbers(text) + + +def lowercase(text): + return text.lower() + + +def collapse_whitespace(text): + return re.sub(_whitespace_re, ' ', text) + + +def convert_to_ascii(text): + return unidecode(text) + + +def basic_cleaners(text): + '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' + text = lowercase(text) + text = collapse_whitespace(text) + return text + + +def transliteration_cleaners(text): + '''Pipeline for non-English text that transliterates to ASCII.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = collapse_whitespace(text) + return text + + +def english_cleaners(text): + '''Pipeline for English text, including number and abbreviation expansion.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = expand_numbers(text) + text = expand_abbreviations(text) + text = collapse_whitespace(text) + return text diff --git a/modelscope/models/audio/tts/models/datasets/units/ling_unit.py b/modelscope/models/audio/tts/models/datasets/units/ling_unit.py new file mode 100644 index 00000000..3c211cc7 --- /dev/null +++ b/modelscope/models/audio/tts/models/datasets/units/ling_unit.py @@ -0,0 +1,395 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import abc +import codecs +import os +import re +import shutil + +import json +import numpy as np + +from . 
import cleaners as cleaners + +# Regular expression matching text enclosed in curly braces: +_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') + + +def _clean_text(text, cleaner_names): + for name in cleaner_names: + cleaner = getattr(cleaners, name) + if not cleaner: + raise Exception( + 'modelscope error: configuration cleaner unknown: %s' % name) + text = cleaner(text) + return text + + +class LinguisticBaseUnit(abc.ABC): + + def set_config_params(self, config_params): + self.config_params = config_params + + def save(self, config, config_name, path): + t_path = os.path.join(path, config_name) + if config != t_path: + os.makedirs(path, exist_ok=True) + shutil.copyfile(config, os.path.join(path, config_name)) + + +class KanTtsLinguisticUnit(LinguisticBaseUnit): + + def __init__(self, config, path, has_mask=True): + super(KanTtsLinguisticUnit, self).__init__() + + # special symbol + self._pad = '_' + self._eos = '~' + self._mask = '@[MASK]' + self._has_mask = has_mask + self._unit_config = config + self._path = path + + self._cleaner_names = [ + x.strip() for x in self._unit_config['cleaners'].split(',') + ] + self._lfeat_type_list = self._unit_config['lfeat_type_list'].strip( + ).split(',') + + self.build() + + def get_unit_size(self): + ling_unit_size = {} + ling_unit_size['sy'] = len(self.sy) + ling_unit_size['tone'] = len(self.tone) + ling_unit_size['syllable_flag'] = len(self.syllable_flag) + ling_unit_size['word_segment'] = len(self.word_segment) + + if 'emo_category' in self._lfeat_type_list: + ling_unit_size['emotion'] = len(self.emo_category) + if 'speaker_category' in self._lfeat_type_list: + ling_unit_size['speaker'] = len(self.speaker) + + return ling_unit_size + + def build(self): + + self._sub_unit_dim = {} + self._sub_unit_pad = {} + # sy sub-unit + _characters = '' + + _ch_symbols = [] + + sy_path = os.path.join(self._path, self._unit_config['sy']) + f = codecs.open(sy_path, 'r') + for line in f: + line = line.strip('\r\n') + _ch_symbols.append(line) + + _arpabet = ['@' + s for s in _ch_symbols] + + # Export all symbols: + self.sy = list(_characters) + _arpabet + [self._pad, self._eos] + if self._has_mask: + self.sy.append(self._mask) + self._sy_to_id = {s: i for i, s in enumerate(self.sy)} + self._id_to_sy = {i: s for i, s in enumerate(self.sy)} + self._sub_unit_dim['sy'] = len(self.sy) + self._sub_unit_pad['sy'] = self._sy_to_id['_'] + + # tone sub-unit + _characters = '' + + _ch_tones = [] + + tone_path = os.path.join(self._path, self._unit_config['tone']) + f = codecs.open(tone_path, 'r') + for line in f: + line = line.strip('\r\n') + _ch_tones.append(line) + + # Export all tones: + self.tone = list(_characters) + _ch_tones + [self._pad, self._eos] + if self._has_mask: + self.tone.append(self._mask) + self._tone_to_id = {s: i for i, s in enumerate(self.tone)} + self._id_to_tone = {i: s for i, s in enumerate(self.tone)} + self._sub_unit_dim['tone'] = len(self.tone) + self._sub_unit_pad['tone'] = self._tone_to_id['_'] + + # syllable flag sub-unit + _characters = '' + + _ch_syllable_flags = [] + + sy_flag_path = os.path.join(self._path, + self._unit_config['syllable_flag']) + f = codecs.open(sy_flag_path, 'r') + for line in f: + line = line.strip('\r\n') + _ch_syllable_flags.append(line) + + # Export all syllable_flags: + self.syllable_flag = list(_characters) + _ch_syllable_flags + [ + self._pad, self._eos + ] + if self._has_mask: + self.syllable_flag.append(self._mask) + self._syllable_flag_to_id = { + s: i + for i, s in enumerate(self.syllable_flag) + } + 
self._id_to_syllable_flag = { + i: s + for i, s in enumerate(self.syllable_flag) + } + self._sub_unit_dim['syllable_flag'] = len(self.syllable_flag) + self._sub_unit_pad['syllable_flag'] = self._syllable_flag_to_id['_'] + + # word segment sub-unit + _characters = '' + + _ch_word_segments = [] + + ws_path = os.path.join(self._path, self._unit_config['word_segment']) + f = codecs.open(ws_path, 'r') + for line in f: + line = line.strip('\r\n') + _ch_word_segments.append(line) + + # Export all syllable_flags: + self.word_segment = list(_characters) + _ch_word_segments + [ + self._pad, self._eos + ] + if self._has_mask: + self.word_segment.append(self._mask) + self._word_segment_to_id = { + s: i + for i, s in enumerate(self.word_segment) + } + self._id_to_word_segment = { + i: s + for i, s in enumerate(self.word_segment) + } + self._sub_unit_dim['word_segment'] = len(self.word_segment) + self._sub_unit_pad['word_segment'] = self._word_segment_to_id['_'] + + if 'emo_category' in self._lfeat_type_list: + # emotion category sub-unit + _characters = '' + + _ch_emo_types = [] + + emo_path = os.path.join(self._path, + self._unit_config['emo_category']) + f = codecs.open(emo_path, 'r') + for line in f: + line = line.strip('\r\n') + _ch_emo_types.append(line) + + self.emo_category = list(_characters) + _ch_emo_types + [ + self._pad, self._eos + ] + if self._has_mask: + self.emo_category.append(self._mask) + self._emo_category_to_id = { + s: i + for i, s in enumerate(self.emo_category) + } + self._id_to_emo_category = { + i: s + for i, s in enumerate(self.emo_category) + } + self._sub_unit_dim['emo_category'] = len(self.emo_category) + self._sub_unit_pad['emo_category'] = self._emo_category_to_id['_'] + + if 'speaker_category' in self._lfeat_type_list: + # speaker category sub-unit + _characters = '' + + _ch_speakers = [] + + speaker_path = os.path.join(self._path, + self._unit_config['speaker_category']) + f = codecs.open(speaker_path, 'r') + for line in f: + line = line.strip('\r\n') + _ch_speakers.append(line) + + # Export all syllable_flags: + self.speaker = list(_characters) + _ch_speakers + [ + self._pad, self._eos + ] + if self._has_mask: + self.speaker.append(self._mask) + self._speaker_to_id = {s: i for i, s in enumerate(self.speaker)} + self._id_to_speaker = {i: s for i, s in enumerate(self.speaker)} + self._sub_unit_dim['speaker_category'] = len(self._speaker_to_id) + self._sub_unit_pad['speaker_category'] = self._speaker_to_id['_'] + + def encode_symbol_sequence(self, lfeat_symbol): + lfeat_symbol = lfeat_symbol.strip().split(' ') + + lfeat_symbol_separate = [''] * int(len(self._lfeat_type_list)) + for this_lfeat_symbol in lfeat_symbol: + this_lfeat_symbol = this_lfeat_symbol.strip('{').strip('}').split( + '$') + index = 0 + while index < len(lfeat_symbol_separate): + lfeat_symbol_separate[index] = lfeat_symbol_separate[ + index] + this_lfeat_symbol[index] + ' ' + index = index + 1 + + input_and_label_data = [] + index = 0 + while index < len(self._lfeat_type_list): + sequence = self.encode_sub_unit( + lfeat_symbol_separate[index].strip(), + self._lfeat_type_list[index]) + sequence_array = np.asarray(sequence, dtype=np.int32) + input_and_label_data.append(sequence_array) + index = index + 1 + + return input_and_label_data + + def decode_symbol_sequence(self, sequence): + result = [] + for i, lfeat_type in enumerate(self._lfeat_type_list): + s = '' + sequence_item = sequence[i].tolist() + if lfeat_type == 'sy': + s = self.decode_sy(sequence_item) + elif lfeat_type == 'tone': + s = 
self.decode_tone(sequence_item) + elif lfeat_type == 'syllable_flag': + s = self.decode_syllable_flag(sequence_item) + elif lfeat_type == 'word_segment': + s = self.decode_word_segment(sequence_item) + elif lfeat_type == 'emo_category': + s = self.decode_emo_category(sequence_item) + elif lfeat_type == 'speaker_category': + s = self.decode_speaker_category(sequence_item) + else: + raise Exception( + 'modelscope error: configuration lfeat type(%s) unknown.' + % lfeat_type) + result.append('%s:%s' % (lfeat_type, s)) + + return result + + def encode_sub_unit(self, this_lfeat_symbol, lfeat_type): + sequence = [] + if lfeat_type == 'sy': + this_lfeat_symbol = this_lfeat_symbol.strip().split(' ') + this_lfeat_symbol_format = '' + index = 0 + while index < len(this_lfeat_symbol): + this_lfeat_symbol_format = this_lfeat_symbol_format + '{' + this_lfeat_symbol[ + index] + '}' + ' ' + index = index + 1 + sequence = self.encode_text(this_lfeat_symbol_format, + self._cleaner_names) + elif lfeat_type == 'tone': + sequence = self.encode_tone(this_lfeat_symbol) + elif lfeat_type == 'syllable_flag': + sequence = self.encode_syllable_flag(this_lfeat_symbol) + elif lfeat_type == 'word_segment': + sequence = self.encode_word_segment(this_lfeat_symbol) + elif lfeat_type == 'emo_category': + sequence = self.encode_emo_category(this_lfeat_symbol) + elif lfeat_type == 'speaker_category': + sequence = self.encode_speaker_category(this_lfeat_symbol) + else: + raise Exception( + 'modelscope error: configuration lfeat type(%s) unknown.' + % lfeat_type) + + return sequence + + def encode_text(self, text, cleaner_names): + sequence = [] + + # Check for curly braces and treat their contents as ARPAbet: + while len(text): + m = _curly_re.match(text) + if not m: + sequence += self.encode_sy(_clean_text(text, cleaner_names)) + break + sequence += self.encode_sy(_clean_text(m.group(1), cleaner_names)) + sequence += self.encode_arpanet(m.group(2)) + text = m.group(3) + + # Append EOS token + sequence.append(self._sy_to_id['~']) + return sequence + + def encode_sy(self, sy): + return [self._sy_to_id[s] for s in sy if self.should_keep_sy(s)] + + def decode_sy(self, id): + s = self._id_to_sy[id] + if len(s) > 1 and s[0] == '@': + s = s[1:] + return s + + def should_keep_sy(self, s): + return s in self._sy_to_id and s != '_' and s != '~' + + def encode_arpanet(self, text): + return self.encode_sy(['@' + s for s in text.split()]) + + def encode_tone(self, tone): + tones = tone.strip().split(' ') + sequence = [] + for this_tone in tones: + sequence.append(self._tone_to_id[this_tone]) + sequence.append(self._tone_to_id['~']) + return sequence + + def decode_tone(self, id): + return self._id_to_tone[id] + + def encode_syllable_flag(self, syllable_flag): + syllable_flags = syllable_flag.strip().split(' ') + sequence = [] + for this_syllable_flag in syllable_flags: + sequence.append(self._syllable_flag_to_id[this_syllable_flag]) + sequence.append(self._syllable_flag_to_id['~']) + return sequence + + def decode_syllable_flag(self, id): + return self._id_to_syllable_flag[id] + + def encode_word_segment(self, word_segment): + word_segments = word_segment.strip().split(' ') + sequence = [] + for this_word_segment in word_segments: + sequence.append(self._word_segment_to_id[this_word_segment]) + sequence.append(self._word_segment_to_id['~']) + return sequence + + def decode_word_segment(self, id): + return self._id_to_word_segment[id] + + def encode_emo_category(self, emo_type): + emo_categories = emo_type.strip().split(' ') + sequence = 
[] + for this_category in emo_categories: + sequence.append(self._emo_category_to_id[this_category]) + sequence.append(self._emo_category_to_id['~']) + return sequence + + def decode_emo_category(self, id): + return self._id_to_emo_category[id] + + def encode_speaker_category(self, speaker): + speakers = speaker.strip().split(' ') + sequence = [] + for this_speaker in speakers: + sequence.append(self._speaker_to_id[this_speaker]) + sequence.append(self._speaker_to_id['~']) + return sequence + + def decode_speaker_category(self, id): + return self._id_to_speaker[id] diff --git a/modelscope/models/audio/tts/text/numbers.py b/modelscope/models/audio/tts/models/datasets/units/numbers.py old mode 100755 new mode 100644 similarity index 94% rename from modelscope/models/audio/tts/text/numbers.py rename to modelscope/models/audio/tts/models/datasets/units/numbers.py index d9453fee..d8835059 --- a/modelscope/models/audio/tts/text/numbers.py +++ b/modelscope/models/audio/tts/models/datasets/units/numbers.py @@ -1,3 +1,6 @@ +# The implementation is adopted from tacotron, +# made publicly available under the MIT License at https://github.com/keithito/tacotron + import re import inflect diff --git a/modelscope/models/audio/tts/models/fsmn.py b/modelscope/models/audio/tts/models/fsmn.py deleted file mode 100755 index 875c27f0..00000000 --- a/modelscope/models/audio/tts/models/fsmn.py +++ /dev/null @@ -1,273 +0,0 @@ -import tensorflow as tf - - -def build_sequence_mask(sequence_length, - maximum_length=None, - dtype=tf.float32): - """Builds the dot product mask. - - Args: - sequence_length: The sequence length. - maximum_length: Optional size of the returned time dimension. Otherwise - it is the maximum of :obj:`sequence_length`. - dtype: The type of the mask tensor. - - Returns: - A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape - ``[batch_size, max_length]``. - """ - mask = tf.sequence_mask( - sequence_length, maxlen=maximum_length, dtype=dtype) - - return mask - - -def norm(inputs): - """Layer normalizes :obj:`inputs`.""" - return tf.contrib.layers.layer_norm(inputs, begin_norm_axis=-1) - - -def pad_in_time(x, padding_shape): - """Helper function to pad a tensor in the time dimension and retain the static depth dimension. - - Agrs: - x: [Batch, Time, Frequency] - padding_length: padding size of constant value (0) before the time dimension - - return: - padded x - """ - - depth = x.get_shape().as_list()[-1] - x = tf.pad(x, [[0, 0], padding_shape, [0, 0]]) - x.set_shape((None, None, depth)) - - return x - - -def pad_in_time_right(x, padding_length): - """Helper function to pad a tensor in the time dimension and retain the static depth dimension. - - Agrs: - x: [Batch, Time, Frequency] - padding_length: padding size of constant value (0) before the time dimension - - return: - padded x - """ - depth = x.get_shape().as_list()[-1] - x = tf.pad(x, [[0, 0], [0, padding_length], [0, 0]]) - x.set_shape((None, None, depth)) - - return x - - -def feed_forward(x, ffn_dim, memory_units, mode, dropout=0.0): - """Implements the Transformer's "Feed Forward" layer. - - .. math:: - - ffn(x) = max(0, x*W_1 + b_1)*W_2 - - Args: - x: The input. - ffn_dim: The number of units of the nonlinear transformation. - memory_units: the number of units of linear transformation - mode: A ``tf.estimator.ModeKeys`` mode. - dropout: The probability to drop units from the inner transformation. - - Returns: - The transformed input. 
- """ - inner = tf.layers.conv1d(x, ffn_dim, 1, activation=tf.nn.relu) - inner = tf.layers.dropout( - inner, rate=dropout, training=mode == tf.estimator.ModeKeys.TRAIN) - outer = tf.layers.conv1d(inner, memory_units, 1, use_bias=False) - - return outer - - -def drop_and_add(inputs, outputs, mode, dropout=0.0): - """Drops units in the outputs and adds the previous values. - - Args: - inputs: The input of the previous layer. - outputs: The output of the previous layer. - mode: A ``tf.estimator.ModeKeys`` mode. - dropout: The probability to drop units in :obj:`outputs`. - - Returns: - The residual and normalized output. - """ - outputs = tf.layers.dropout(outputs, rate=dropout, training=mode) - - input_dim = inputs.get_shape().as_list()[-1] - output_dim = outputs.get_shape().as_list()[-1] - - if input_dim == output_dim: - outputs += inputs - - return outputs - - -def MemoryBlock( - inputs, - filter_size, - mode, - mask=None, - dropout=0.0, -): - """ - Define the bidirectional memory block in FSMN - - Agrs: - inputs: The output of the previous layer. [Batch, Time, Frequency] - filter_size: memory block filter size - mode: Training or Evaluation - mask: A ``tf.Tensor`` applied to the memory block output - - return: - output: 3-D tensor ([Batch, Time, Frequency]) - """ - static_shape = inputs.get_shape().as_list() - depth = static_shape[-1] - inputs = tf.expand_dims(inputs, axis=1) # [Batch, 1, Time, Frequency] - depthwise_filter = tf.get_variable( - 'depth_conv_w', - shape=[1, filter_size, depth, 1], - initializer=tf.glorot_uniform_initializer(), - dtype=tf.float32) - memory = tf.nn.depthwise_conv2d( - input=inputs, - filter=depthwise_filter, - strides=[1, 1, 1, 1], - padding='SAME', - rate=[1, 1], - data_format='NHWC') - memory = memory + inputs - output = tf.layers.dropout(memory, rate=dropout, training=mode) - output = tf.reshape( - output, - [tf.shape(output)[0], tf.shape(output)[2], depth]) - if mask is not None: - output = output * tf.expand_dims(mask, -1) - - return output - - -def MemoryBlockV2( - inputs, - filter_size, - mode, - shift=0, - mask=None, - dropout=0.0, -): - """ - Define the bidirectional memory block in FSMN - - Agrs: - inputs: The output of the previous layer. 
[Batch, Time, Frequency] - filter_size: memory block filter size - mode: Training or Evaluation - shift: left padding, to control delay - mask: A ``tf.Tensor`` applied to the memory block output - - return: - output: 3-D tensor ([Batch, Time, Frequency]) - """ - if mask is not None: - inputs = inputs * tf.expand_dims(mask, -1) - - static_shape = inputs.get_shape().as_list() - depth = static_shape[-1] - # padding - left_padding = int(round((filter_size - 1) / 2)) - right_padding = int((filter_size - 1) / 2) - if shift > 0: - left_padding = left_padding + shift - right_padding = right_padding - shift - pad_inputs = pad_in_time(inputs, [left_padding, right_padding]) - pad_inputs = tf.expand_dims( - pad_inputs, axis=1) # [Batch, 1, Time, Frequency] - depthwise_filter = tf.get_variable( - 'depth_conv_w', - shape=[1, filter_size, depth, 1], - initializer=tf.glorot_uniform_initializer(), - dtype=tf.float32) - memory = tf.nn.depthwise_conv2d( - input=pad_inputs, - filter=depthwise_filter, - strides=[1, 1, 1, 1], - padding='VALID', - rate=[1, 1], - data_format='NHWC') - memory = tf.reshape( - memory, - [tf.shape(memory)[0], tf.shape(memory)[2], depth]) - memory = memory + inputs - output = tf.layers.dropout(memory, rate=dropout, training=mode) - if mask is not None: - output = output * tf.expand_dims(mask, -1) - - return output - - -def UniMemoryBlock( - inputs, - filter_size, - mode, - cache=None, - mask=None, - dropout=0.0, -): - """ - Define the unidirectional memory block in FSMN - - Agrs: - inputs: The output of the previous layer. [Batch, Time, Frequency] - filter_size: memory block filter size - cache: for streaming inference - mode: Training or Evaluation - mask: A ``tf.Tensor`` applied to the memory block output - dropout: dorpout factor - return: - output: 3-D tensor ([Batch, Time, Frequency]) - """ - if cache is not None: - static_shape = cache['queries'].get_shape().as_list() - depth = static_shape[-1] - queries = tf.slice(cache['queries'], [0, 1, 0], [ - tf.shape(cache['queries'])[0], - tf.shape(cache['queries'])[1] - 1, depth - ]) - queries = tf.concat([queries, inputs], axis=1) - cache['queries'] = queries - else: - padding_length = filter_size - 1 - queries = pad_in_time(inputs, [padding_length, 0]) - - queries = tf.expand_dims(queries, axis=1) # [Batch, 1, Time, Frequency] - static_shape = queries.get_shape().as_list() - depth = static_shape[-1] - depthwise_filter = tf.get_variable( - 'depth_conv_w', - shape=[1, filter_size, depth, 1], - initializer=tf.glorot_uniform_initializer(), - dtype=tf.float32) - memory = tf.nn.depthwise_conv2d( - input=queries, - filter=depthwise_filter, - strides=[1, 1, 1, 1], - padding='VALID', - rate=[1, 1], - data_format='NHWC') - memory = tf.reshape( - memory, - [tf.shape(memory)[0], tf.shape(memory)[2], depth]) - memory = memory + inputs - output = tf.layers.dropout(memory, rate=dropout, training=mode) - if mask is not None: - output = output * tf.expand_dims(mask, -1) - - return output diff --git a/modelscope/models/audio/tts/models/fsmn_encoder.py b/modelscope/models/audio/tts/models/fsmn_encoder.py deleted file mode 100755 index 2c650624..00000000 --- a/modelscope/models/audio/tts/models/fsmn_encoder.py +++ /dev/null @@ -1,178 +0,0 @@ -import tensorflow as tf - -from . import fsmn - - -class FsmnEncoder(): - """Encoder using Fsmn - """ - - def __init__(self, - filter_size, - fsmn_num_layers, - dnn_num_layers, - num_memory_units=512, - ffn_inner_dim=2048, - dropout=0.0, - position_encoder=None): - """Initializes the parameters of the encoder. 
- - Args: - filter_size: the total order of memory block - fsmn_num_layers: The number of fsmn layers. - dnn_num_layers: The number of dnn layers - num_units: The number of memory units. - ffn_inner_dim: The number of units of the inner linear transformation - in the feed forward layer. - dropout: The probability to drop units from the outputs. - position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to - apply on inputs or ``None``. - """ - super(FsmnEncoder, self).__init__() - self.filter_size = filter_size - self.fsmn_num_layers = fsmn_num_layers - self.dnn_num_layers = dnn_num_layers - self.num_memory_units = num_memory_units - self.ffn_inner_dim = ffn_inner_dim - self.dropout = dropout - self.position_encoder = position_encoder - - def encode(self, inputs, sequence_length=None, mode=True): - if self.position_encoder is not None: - inputs = self.position_encoder(inputs) - - inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) - - mask = fsmn.build_sequence_mask( - sequence_length, maximum_length=tf.shape(inputs)[1]) - - state = () - - for layer in range(self.fsmn_num_layers): - with tf.variable_scope('fsmn_layer_{}'.format(layer)): - with tf.variable_scope('ffn'): - context = fsmn.feed_forward( - inputs, - self.ffn_inner_dim, - self.num_memory_units, - mode, - dropout=self.dropout) - - with tf.variable_scope('memory'): - memory = fsmn.MemoryBlock( - context, - self.filter_size, - mode, - mask=mask, - dropout=self.dropout) - - memory = fsmn.drop_and_add( - inputs, memory, mode, dropout=self.dropout) - - inputs = memory - state += (tf.reduce_mean(inputs, axis=1), ) - - for layer in range(self.dnn_num_layers): - with tf.variable_scope('dnn_layer_{}'.format(layer)): - transformed = fsmn.feed_forward( - inputs, - self.ffn_inner_dim, - self.num_memory_units, - mode, - dropout=self.dropout) - - inputs = transformed - state += (tf.reduce_mean(inputs, axis=1), ) - - outputs = inputs - return (outputs, state, sequence_length) - - -class FsmnEncoderV2(): - """Encoder using Fsmn - """ - - def __init__(self, - filter_size, - fsmn_num_layers, - dnn_num_layers, - num_memory_units=512, - ffn_inner_dim=2048, - dropout=0.0, - shift=0, - position_encoder=None): - """Initializes the parameters of the encoder. - - Args: - filter_size: the total order of memory block - fsmn_num_layers: The number of fsmn layers. - dnn_num_layers: The number of dnn layers - num_units: The number of memory units. - ffn_inner_dim: The number of units of the inner linear transformation - in the feed forward layer. - dropout: The probability to drop units from the outputs. - shift: left padding, to control delay - position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to - apply on inputs or ``None``. 
- """ - super(FsmnEncoderV2, self).__init__() - self.filter_size = filter_size - self.fsmn_num_layers = fsmn_num_layers - self.dnn_num_layers = dnn_num_layers - self.num_memory_units = num_memory_units - self.ffn_inner_dim = ffn_inner_dim - self.dropout = dropout - self.shift = shift - if not isinstance(shift, list): - self.shift = [shift for _ in range(self.fsmn_num_layers)] - self.position_encoder = position_encoder - - def encode(self, inputs, sequence_length=None, mode=True): - if self.position_encoder is not None: - inputs = self.position_encoder(inputs) - - inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) - - mask = fsmn.build_sequence_mask( - sequence_length, maximum_length=tf.shape(inputs)[1]) - - state = () - for layer in range(self.fsmn_num_layers): - with tf.variable_scope('fsmn_layer_{}'.format(layer)): - with tf.variable_scope('ffn'): - context = fsmn.feed_forward( - inputs, - self.ffn_inner_dim, - self.num_memory_units, - mode, - dropout=self.dropout) - - with tf.variable_scope('memory'): - memory = fsmn.MemoryBlockV2( - context, - self.filter_size, - mode, - shift=self.shift[layer], - mask=mask, - dropout=self.dropout) - - memory = fsmn.drop_and_add( - inputs, memory, mode, dropout=self.dropout) - - inputs = memory - state += (tf.reduce_mean(inputs, axis=1), ) - - for layer in range(self.dnn_num_layers): - with tf.variable_scope('dnn_layer_{}'.format(layer)): - transformed = fsmn.feed_forward( - inputs, - self.ffn_inner_dim, - self.num_memory_units, - mode, - dropout=self.dropout) - - inputs = transformed - state += (tf.reduce_mean(inputs, axis=1), ) - - outputs = inputs - return (outputs, state, sequence_length) diff --git a/modelscope/models/audio/tts/models/helpers.py b/modelscope/models/audio/tts/models/helpers.py deleted file mode 100755 index 371000a4..00000000 --- a/modelscope/models/audio/tts/models/helpers.py +++ /dev/null @@ -1,159 +0,0 @@ -import numpy as np -import tensorflow as tf - - -class VarTestHelper(tf.contrib.seq2seq.Helper): - - def __init__(self, batch_size, inputs, dim): - with tf.name_scope('VarTestHelper'): - self._batch_size = batch_size - self._inputs = inputs - self._dim = dim - - num_steps = tf.shape(self._inputs)[1] - self._lengths = tf.tile([num_steps], [self._batch_size]) - - self._inputs = tf.roll(inputs, shift=-1, axis=1) - self._init_inputs = inputs[:, 0, :] - - @property - def batch_size(self): - return self._batch_size - - @property - def sample_ids_shape(self): - return tf.TensorShape([]) - - @property - def sample_ids_dtype(self): - return np.int32 - - def initialize(self, name=None): - return (tf.tile([False], [self._batch_size]), - _go_frames(self._batch_size, self._dim, self._init_inputs)) - - def sample(self, time, outputs, state, name=None): - return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them - - def next_inputs(self, time, outputs, state, sample_ids, name=None): - with tf.name_scope('VarTestHelper'): - finished = (time + 1 >= self._lengths) - next_inputs = tf.concat([outputs, self._inputs[:, time, :]], - axis=-1) - return (finished, next_inputs, state) - - -class VarTrainingHelper(tf.contrib.seq2seq.Helper): - - def __init__(self, targets, inputs, dim): - with tf.name_scope('VarTrainingHelper'): - self._targets = targets # [N, T_in, 1] - self._batch_size = tf.shape(inputs)[0] # N - self._inputs = inputs - self._dim = dim - - num_steps = tf.shape(self._targets)[1] - self._lengths = tf.tile([num_steps], [self._batch_size]) - - self._inputs = tf.roll(inputs, shift=-1, axis=1) - 
self._init_inputs = inputs[:, 0, :] - - @property - def batch_size(self): - return self._batch_size - - @property - def sample_ids_shape(self): - return tf.TensorShape([]) - - @property - def sample_ids_dtype(self): - return np.int32 - - def initialize(self, name=None): - return (tf.tile([False], [self._batch_size]), - _go_frames(self._batch_size, self._dim, self._init_inputs)) - - def sample(self, time, outputs, state, name=None): - return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them - - def next_inputs(self, time, outputs, state, sample_ids, name=None): - with tf.name_scope(name or 'VarTrainingHelper'): - finished = (time + 1 >= self._lengths) - next_inputs = tf.concat( - [self._targets[:, time, :], self._inputs[:, time, :]], axis=-1) - return (finished, next_inputs, state) - - -class VarTrainingSSHelper(tf.contrib.seq2seq.Helper): - - def __init__(self, targets, inputs, dim, global_step, schedule_begin, - alpha, decay_steps): - with tf.name_scope('VarTrainingSSHelper'): - self._targets = targets # [N, T_in, 1] - self._batch_size = tf.shape(inputs)[0] # N - self._inputs = inputs - self._dim = dim - - num_steps = tf.shape(self._targets)[1] - self._lengths = tf.tile([num_steps], [self._batch_size]) - - self._inputs = tf.roll(inputs, shift=-1, axis=1) - self._init_inputs = inputs[:, 0, :] - - # for schedule sampling - self._global_step = global_step - self._schedule_begin = schedule_begin - self._alpha = alpha - self._decay_steps = decay_steps - - @property - def batch_size(self): - return self._batch_size - - @property - def sample_ids_shape(self): - return tf.TensorShape([]) - - @property - def sample_ids_dtype(self): - return np.int32 - - def initialize(self, name=None): - self._ratio = _tf_decay(self._global_step, self._schedule_begin, - self._alpha, self._decay_steps) - return (tf.tile([False], [self._batch_size]), - _go_frames(self._batch_size, self._dim, self._init_inputs)) - - def sample(self, time, outputs, state, name=None): - return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them - - def next_inputs(self, time, outputs, state, sample_ids, name=None): - with tf.name_scope(name or 'VarTrainingHelper'): - finished = (time + 1 >= self._lengths) - next_inputs_tmp = tf.cond( - tf.less( - tf.random_uniform([], minval=0, maxval=1, - dtype=tf.float32), self._ratio), - lambda: self._targets[:, time, :], lambda: outputs) - next_inputs = tf.concat( - [next_inputs_tmp, self._inputs[:, time, :]], axis=-1) - return (finished, next_inputs, state) - - -def _go_frames(batch_size, dim, init_inputs): - '''Returns all-zero frames for a given batch size and output dimension''' - return tf.concat([tf.tile([[0.0]], [batch_size, dim]), init_inputs], - axis=-1) - - -def _tf_decay(global_step, schedule_begin, alpha, decay_steps): - tfr = tf.train.exponential_decay( - 1.0, - global_step=global_step - schedule_begin, - decay_steps=decay_steps, - decay_rate=alpha, - name='tfr_decay') - final_tfr = tf.cond( - tf.less(global_step, schedule_begin), lambda: 1.0, lambda: tfr) - return final_tfr diff --git a/modelscope/models/audio/tts/models/models/__init__.py b/modelscope/models/audio/tts/models/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/audio/tts/models/models/hifigan/__init__.py b/modelscope/models/audio/tts/models/models/hifigan/__init__.py new file mode 100644 index 00000000..ae9d10ea --- /dev/null +++ b/modelscope/models/audio/tts/models/models/hifigan/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Alibaba, Inc. 
and its affiliates. + +from .hifigan import * # noqa F403 diff --git a/modelscope/models/audio/tts/models/models/hifigan/hifigan.py b/modelscope/models/audio/tts/models/models/hifigan/hifigan.py new file mode 100755 index 00000000..0f950539 --- /dev/null +++ b/modelscope/models/audio/tts/models/models/hifigan/hifigan.py @@ -0,0 +1,238 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Part of the implementation is borrowed from https://github.com/jik876/hifi-gan + +from distutils.version import LooseVersion + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d +from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm + +from modelscope.models.audio.tts.models.utils import get_padding, init_weights +from modelscope.utils.logger import get_logger + +logger = get_logger() +is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion('1.7') + + +def stft(x, fft_size, hop_size, win_length, window): + """Perform STFT and convert to magnitude spectrogram. + + Args: + x (Tensor): Input signal tensor (B, T). + fft_size (int): FFT size. + hop_size (int): Hop size. + win_length (int): Window length. + window (str): Window function type. + + Returns: + Tensor: Magnitude spectrogram (B). + + """ + if is_pytorch_17plus: + x_stft = torch.stft( + x, fft_size, hop_size, win_length, window, return_complex=False) + else: + x_stft = torch.stft(x, fft_size, hop_size, win_length, window) + real = x_stft[..., 0] + imag = x_stft[..., 1] + + # NOTE(kan-bayashi): clamp is needed to avoid nan or inf + return torch.sqrt(torch.clamp(real**2 + imag**2, min=1e-7)).transpose(2, 1) + + +LRELU_SLOPE = 0.1 + + +def get_padding_casual(kernel_size, dilation=1): + return int(kernel_size * dilation - dilation) + + +class Conv1dCasual(torch.nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + padding_mode='zeros'): + super(Conv1dCasual, self).__init__() + self.pad = padding + self.conv1d = weight_norm( + Conv1d( + in_channels, + out_channels, + kernel_size, + stride, + padding=0, + dilation=dilation, + groups=groups, + bias=bias, + padding_mode=padding_mode)) + self.conv1d.apply(init_weights) + + def forward(self, x): # bdt + # described starting from the last dimension and moving forward. + x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), 'constant') + x = self.conv1d(x) + return x + + def remove_weight_norm(self): + remove_weight_norm(self.conv1d) + + +class ConvTranspose1dCausal(torch.nn.Module): + """CausalConvTranspose1d module with customized initialization.""" + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding=0): + """Initialize CausalConvTranspose1d module.""" + super(ConvTranspose1dCausal, self).__init__() + self.deconv = weight_norm( + ConvTranspose1d(in_channels, out_channels, kernel_size, stride)) + self.stride = stride + self.deconv.apply(init_weights) + self.pad = kernel_size - stride + + def forward(self, x): + """Calculate forward propagation. + Args: + x (Tensor): Input tensor (B, in_channels, T_in). + Returns: + Tensor: Output tensor (B, out_channels, T_out). 
+ """ + # x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), "constant") + return self.deconv(x)[:, :, :-self.pad] + + def remove_weight_norm(self): + remove_weight_norm(self.deconv) + + +class ResBlock1(torch.nn.Module): + + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.h = h + self.convs1 = nn.ModuleList([ + Conv1dCasual( + channels, + channels, + kernel_size, + 1, + dilation=dilation[i], + padding=get_padding_casual(kernel_size, dilation[i])) + for i in range(len(dilation)) + ]) + + self.convs2 = nn.ModuleList([ + Conv1dCasual( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding_casual(kernel_size, 1)) + for i in range(len(dilation)) + ]) + + def forward(self, x): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + xt = c2(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for layer in self.convs1: + layer.remove_weight_norm() + for layer in self.convs2: + layer.remove_weight_norm() + + +class Generator(torch.nn.Module): + + def __init__(self, h): + super(Generator, self).__init__() + self.h = h + self.num_kernels = len(h.resblock_kernel_sizes) + self.num_upsamples = len(h.upsample_rates) + logger.info('num_kernels={}, num_upsamples={}'.format( + self.num_kernels, self.num_upsamples)) + self.conv_pre = Conv1dCasual( + 80, h.upsample_initial_channel, 7, 1, padding=7 - 1) + resblock = ResBlock1 if h.resblock == '1' else ResBlock2 + + self.ups = nn.ModuleList() + self.repeat_ups = nn.ModuleList() + for i, (u, k) in enumerate( + zip(h.upsample_rates, h.upsample_kernel_sizes)): + upsample = nn.Sequential( + nn.Upsample(mode='nearest', scale_factor=u), + nn.LeakyReLU(LRELU_SLOPE), + Conv1dCasual( + h.upsample_initial_channel // (2**i), + h.upsample_initial_channel // (2**(i + 1)), + kernel_size=7, + stride=1, + padding=7 - 1)) + self.repeat_ups.append(upsample) + self.ups.append( + ConvTranspose1dCausal( + h.upsample_initial_channel // (2**i), + h.upsample_initial_channel // (2**(i + 1)), + k, + u, + padding=(k - u) // 2)) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = h.upsample_initial_channel // (2**(i + 1)) + for j, (k, d) in enumerate( + zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): + self.resblocks.append(resblock(h, ch, k, d)) + + self.conv_post = Conv1dCasual(ch, 1, 7, 1, padding=7 - 1) + + def forward(self, x): + x = self.conv_pre(x) + for i in range(self.num_upsamples): + x = torch.sin(x) + x + # transconv + x1 = F.leaky_relu(x, LRELU_SLOPE) + x1 = self.ups[i](x1) + # repeat + x2 = self.repeat_ups[i](x) + x = x1 + x2 + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + return x + + def remove_weight_norm(self): + logger.info('Removing weight norm...') + for layer in self.ups: + layer.remove_weight_norm() + for layer in self.repeat_ups: + layer[-1].remove_weight_norm() + for layer in self.resblocks: + layer.remove_weight_norm() + self.conv_pre.remove_weight_norm() + self.conv_post.remove_weight_norm() diff --git a/modelscope/models/audio/tts/models/models/sambert/__init__.py b/modelscope/models/audio/tts/models/models/sambert/__init__.py new file mode 100644 index 00000000..f0bf5290 --- /dev/null +++ 
b/modelscope/models/audio/tts/models/models/sambert/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from .kantts_sambert import * # noqa F403 diff --git a/modelscope/models/audio/tts/models/models/sambert/adaptors.py b/modelscope/models/audio/tts/models/models/sambert/adaptors.py new file mode 100644 index 00000000..c171a1db --- /dev/null +++ b/modelscope/models/audio/tts/models/models/sambert/adaptors.py @@ -0,0 +1,131 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .base import Prenet +from .fsmn import FsmnEncoderV2 + + +class LengthRegulator(nn.Module): + + def __init__(self, r=1): + super(LengthRegulator, self).__init__() + + self.r = r + + def forward(self, inputs, durations, masks=None): + reps = (durations + 0.5).long() + output_lens = reps.sum(dim=1) + max_len = output_lens.max() + reps_cumsum = torch.cumsum( + F.pad(reps.float(), (1, 0, 0, 0), value=0.0), dim=1)[:, None, :] + range_ = torch.arange(max_len).to(inputs.device)[None, :, None] + mult = ((reps_cumsum[:, :, :-1] <= range_) + & (reps_cumsum[:, :, 1:] > range_)) # yapf:disable + mult = mult.float() + out = torch.matmul(mult, inputs) + + if masks is not None: + out = out.masked_fill(masks.unsqueeze(-1), 0.0) + + seq_len = out.size(1) + padding = self.r - int(seq_len) % self.r + if (padding < self.r): + out = F.pad( + out.transpose(1, 2), (0, padding, 0, 0, 0, 0), value=0.0) + out = out.transpose(1, 2) + + return out, output_lens + + +class VarRnnARPredictor(nn.Module): + + def __init__(self, cond_units, prenet_units, rnn_units): + super(VarRnnARPredictor, self).__init__() + + self.prenet = Prenet(1, prenet_units) + self.lstm = nn.LSTM( + prenet_units[-1] + cond_units, + rnn_units, + num_layers=2, + batch_first=True, + bidirectional=False) + self.fc = nn.Linear(rnn_units, 1) + + def forward(self, inputs, cond, h=None, masks=None): + x = torch.cat([self.prenet(inputs), cond], dim=-1) + # The input can also be a packed variable length sequence, + # here we just omit it for simplicity due to the mask and uni-directional lstm. 
+ x, h_new = self.lstm(x, h) + + x = self.fc(x).squeeze(-1) + x = F.relu(x) + + if masks is not None: + x = x.masked_fill(masks, 0.0) + + return x, h_new + + def infer(self, cond, masks=None): + batch_size, length = cond.size(0), cond.size(1) + + output = [] + x = torch.zeros((batch_size, 1)).to(cond.device) + h = None + + for i in range(length): + x, h = self.forward(x.unsqueeze(1), cond[:, i:i + 1, :], h=h) + output.append(x) + + output = torch.cat(output, dim=-1) + + if masks is not None: + output = output.masked_fill(masks, 0.0) + + return output + + +class VarFsmnRnnNARPredictor(nn.Module): + + def __init__(self, in_dim, filter_size, fsmn_num_layers, num_memory_units, + ffn_inner_dim, dropout, shift, lstm_units): + super(VarFsmnRnnNARPredictor, self).__init__() + + self.fsmn = FsmnEncoderV2(filter_size, fsmn_num_layers, in_dim, + num_memory_units, ffn_inner_dim, dropout, + shift) + self.blstm = nn.LSTM( + num_memory_units, + lstm_units, + num_layers=1, + batch_first=True, + bidirectional=True) + self.fc = nn.Linear(2 * lstm_units, 1) + + def forward(self, inputs, masks=None): + input_lengths = None + if masks is not None: + input_lengths = torch.sum((~masks).float(), dim=1).long() + + x = self.fsmn(inputs, masks) + + if input_lengths is not None: + x = nn.utils.rnn.pack_padded_sequence( + x, + input_lengths.tolist(), + batch_first=True, + enforce_sorted=False) + x, _ = self.blstm(x) + x, _ = nn.utils.rnn.pad_packed_sequence( + x, batch_first=True, total_length=inputs.size(1)) + else: + x, _ = self.blstm(x) + + x = self.fc(x).squeeze(-1) + + if masks is not None: + x = x.masked_fill(masks, 0.0) + + return x diff --git a/modelscope/models/audio/tts/models/models/sambert/base.py b/modelscope/models/audio/tts/models/models/sambert/base.py new file mode 100644 index 00000000..873aecbf --- /dev/null +++ b/modelscope/models/audio/tts/models/models/sambert/base.py @@ -0,0 +1,369 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ScaledDotProductAttention(nn.Module): + """ Scaled Dot-Product Attention """ + + def __init__(self, temperature, dropatt=0.0): + super().__init__() + self.temperature = temperature + self.softmax = nn.Softmax(dim=2) + self.dropatt = nn.Dropout(dropatt) + + def forward(self, q, k, v, mask=None): + + attn = torch.bmm(q, k.transpose(1, 2)) + attn = attn / self.temperature + + if mask is not None: + attn = attn.masked_fill(mask, -np.inf) + + attn = self.softmax(attn) + attn = self.dropatt(attn) + output = torch.bmm(attn, v) + + return output, attn + + +class Prenet(nn.Module): + + def __init__(self, in_units, prenet_units, out_units=0): + super(Prenet, self).__init__() + + self.fcs = nn.ModuleList() + for in_dim, out_dim in zip([in_units] + prenet_units[:-1], + prenet_units): + self.fcs.append(nn.Linear(in_dim, out_dim)) + self.fcs.append(nn.ReLU()) + self.fcs.append(nn.Dropout(0.5)) + + if (out_units): + self.fcs.append(nn.Linear(prenet_units[-1], out_units)) + + def forward(self, input): + output = input + for layer in self.fcs: + output = layer(output) + return output + + +class MultiHeadSelfAttention(nn.Module): + """ Multi-Head SelfAttention module """ + + def __init__(self, n_head, d_in, d_model, d_head, dropout, dropatt=0.0): + super().__init__() + + self.n_head = n_head + self.d_head = d_head + self.d_in = d_in + self.d_model = d_model + + self.layer_norm = nn.LayerNorm(d_in, eps=1e-6) + self.w_qkv = nn.Linear(d_in, 3 * n_head * d_head) + + self.attention = ScaledDotProductAttention( + temperature=np.power(d_head, 0.5), dropatt=dropatt) + + self.fc = nn.Linear(n_head * d_head, d_model) + + self.dropout = nn.Dropout(dropout) + + def forward(self, input, mask=None): + d_head, n_head = self.d_head, self.n_head + + sz_b, len_in, _ = input.size() + + residual = input + + x = self.layer_norm(input) + qkv = self.w_qkv(x) + q, k, v = qkv.chunk(3, -1) + + q = q.view(sz_b, len_in, n_head, d_head) + k = k.view(sz_b, len_in, n_head, d_head) + v = v.view(sz_b, len_in, n_head, d_head) + + q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_in, + d_head) # (n*b) x l x d + k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_in, + d_head) # (n*b) x l x d + v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_in, + d_head) # (n*b) x l x d + + if mask is not None: + mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x .. 
+ output, attn = self.attention(q, k, v, mask=mask) + + output = output.view(n_head, sz_b, len_in, d_head) + output = (output.permute(1, 2, 0, + 3).contiguous().view(sz_b, len_in, + -1)) # b x l x (n*d) + + output = self.dropout(self.fc(output)) + if (output.size(-1) == residual.size(-1)): + output = output + residual + + return output, attn + + +class PositionwiseConvFeedForward(nn.Module): + """ A two-feed-forward-layer module """ + + def __init__(self, + d_in, + d_hid, + kernel_size=(3, 1), + dropout_inner=0.1, + dropout=0.1): + super().__init__() + # Use Conv1D + # position-wise + self.w_1 = nn.Conv1d( + d_in, + d_hid, + kernel_size=kernel_size[0], + padding=(kernel_size[0] - 1) // 2, + ) + # position-wise + self.w_2 = nn.Conv1d( + d_hid, + d_in, + kernel_size=kernel_size[1], + padding=(kernel_size[1] - 1) // 2, + ) + + self.layer_norm = nn.LayerNorm(d_in, eps=1e-6) + self.dropout_inner = nn.Dropout(dropout_inner) + self.dropout = nn.Dropout(dropout) + + def forward(self, x, mask=None): + residual = x + x = self.layer_norm(x) + + output = x.transpose(1, 2) + output = F.relu(self.w_1(output)) + if mask is not None: + output = output.masked_fill(mask.unsqueeze(1), 0) + output = self.dropout_inner(output) + output = self.w_2(output) + output = output.transpose(1, 2) + output = self.dropout(output) + + output = output + residual + + return output + + +class FFTBlock(nn.Module): + """FFT Block""" + + def __init__(self, + d_in, + d_model, + n_head, + d_head, + d_inner, + kernel_size, + dropout, + dropout_attn=0.0, + dropout_relu=0.0): + super(FFTBlock, self).__init__() + self.slf_attn = MultiHeadSelfAttention( + n_head, + d_in, + d_model, + d_head, + dropout=dropout, + dropatt=dropout_attn) + self.pos_ffn = PositionwiseConvFeedForward( + d_model, + d_inner, + kernel_size, + dropout_inner=dropout_relu, + dropout=dropout) + + def forward(self, input, mask=None, slf_attn_mask=None): + output, slf_attn = self.slf_attn(input, mask=slf_attn_mask) + if mask is not None: + output = output.masked_fill(mask.unsqueeze(-1), 0) + + output = self.pos_ffn(output, mask=mask) + if mask is not None: + output = output.masked_fill(mask.unsqueeze(-1), 0) + + return output, slf_attn + + +class MultiHeadPNCAAttention(nn.Module): + """ Multi-Head Attention PNCA module """ + + def __init__(self, n_head, d_model, d_mem, d_head, dropout, dropatt=0.0): + super().__init__() + + self.n_head = n_head + self.d_head = d_head + self.d_model = d_model + self.d_mem = d_mem + + self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) + + self.w_x_qkv = nn.Linear(d_model, 3 * n_head * d_head) + self.fc_x = nn.Linear(n_head * d_head, d_model) + + self.w_h_kv = nn.Linear(d_mem, 2 * n_head * d_head) + self.fc_h = nn.Linear(n_head * d_head, d_model) + + self.attention = ScaledDotProductAttention( + temperature=np.power(d_head, 0.5), dropatt=dropatt) + + self.dropout = nn.Dropout(dropout) + + def update_x_state(self, x): + d_head, n_head = self.d_head, self.n_head + + sz_b, len_x, _ = x.size() + + x_qkv = self.w_x_qkv(x) + x_q, x_k, x_v = x_qkv.chunk(3, -1) + + x_q = x_q.view(sz_b, len_x, n_head, d_head) + x_k = x_k.view(sz_b, len_x, n_head, d_head) + x_v = x_v.view(sz_b, len_x, n_head, d_head) + + x_q = x_q.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_head) + x_k = x_k.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_head) + x_v = x_v.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_head) + + if (self.x_state_size): + self.x_k = torch.cat([self.x_k, x_k], dim=1) + self.x_v = torch.cat([self.x_v, x_v], dim=1) + else: + self.x_k = 
x_k + self.x_v = x_v + + self.x_state_size += len_x + + return x_q, x_k, x_v + + def update_h_state(self, h): + if (self.h_state_size == h.size(1)): + return None, None + + d_head, n_head = self.d_head, self.n_head + + # H + sz_b, len_h, _ = h.size() + + h_kv = self.w_h_kv(h) + h_k, h_v = h_kv.chunk(2, -1) + + h_k = h_k.view(sz_b, len_h, n_head, d_head) + h_v = h_v.view(sz_b, len_h, n_head, d_head) + + self.h_k = h_k.permute(2, 0, 1, 3).contiguous().view(-1, len_h, d_head) + self.h_v = h_v.permute(2, 0, 1, 3).contiguous().view(-1, len_h, d_head) + + self.h_state_size += len_h + + return h_k, h_v + + def reset_state(self): + self.h_k = None + self.h_v = None + self.h_state_size = 0 + self.x_k = None + self.x_v = None + self.x_state_size = 0 + + def forward(self, x, h, mask_x=None, mask_h=None): + residual = x + self.update_h_state(h) + x_q, x_k, x_v = self.update_x_state(self.layer_norm(x)) + + d_head, n_head = self.d_head, self.n_head + + sz_b, len_in, _ = x.size() + + # X + if mask_x is not None: + mask_x = mask_x.repeat(n_head, 1, 1) # (n*b) x .. x .. + output_x, attn_x = self.attention(x_q, self.x_k, self.x_v, mask=mask_x) + + output_x = output_x.view(n_head, sz_b, len_in, d_head) + output_x = (output_x.permute(1, 2, 0, + 3).contiguous().view(sz_b, len_in, + -1)) # b x l x (n*d) + output_x = self.fc_x(output_x) + + # H + if mask_h is not None: + mask_h = mask_h.repeat(n_head, 1, 1) + output_h, attn_h = self.attention(x_q, self.h_k, self.h_v, mask=mask_h) + + output_h = output_h.view(n_head, sz_b, len_in, d_head) + output_h = (output_h.permute(1, 2, 0, + 3).contiguous().view(sz_b, len_in, + -1)) # b x l x (n*d) + output_h = self.fc_h(output_h) + + output = output_x + output_h + + output = self.dropout(output) + + output = output + residual + + return output, attn_x, attn_h + + +class PNCABlock(nn.Module): + """PNCA Block""" + + def __init__(self, + d_model, + d_mem, + n_head, + d_head, + d_inner, + kernel_size, + dropout, + dropout_attn=0.0, + dropout_relu=0.0): + super(PNCABlock, self).__init__() + self.pnca_attn = MultiHeadPNCAAttention( + n_head, + d_model, + d_mem, + d_head, + dropout=dropout, + dropatt=dropout_attn) + self.pos_ffn = PositionwiseConvFeedForward( + d_model, + d_inner, + kernel_size, + dropout_inner=dropout_relu, + dropout=dropout) + + def forward(self, + input, + memory, + mask=None, + pnca_x_attn_mask=None, + pnca_h_attn_mask=None): + output, pnca_attn_x, pnca_attn_h = self.pnca_attn( + input, memory, pnca_x_attn_mask, pnca_h_attn_mask) + if mask is not None: + output = output.masked_fill(mask.unsqueeze(-1), 0) + + output = self.pos_ffn(output, mask=mask) + if mask is not None: + output = output.masked_fill(mask.unsqueeze(-1), 0) + + return output, pnca_attn_x, pnca_attn_h + + def reset_state(self): + self.pnca_attn.reset_state() diff --git a/modelscope/models/audio/tts/models/models/sambert/fsmn.py b/modelscope/models/audio/tts/models/models/sambert/fsmn.py new file mode 100644 index 00000000..c070ef35 --- /dev/null +++ b/modelscope/models/audio/tts/models/models/sambert/fsmn.py @@ -0,0 +1,126 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+""" +FSMN Pytorch Version +""" +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class FeedForwardNet(nn.Module): + """ A two-feed-forward-layer module """ + + def __init__(self, d_in, d_hid, d_out, kernel_size=[1, 1], dropout=0.1): + super().__init__() + + # Use Conv1D + # position-wise + self.w_1 = nn.Conv1d( + d_in, + d_hid, + kernel_size=kernel_size[0], + padding=(kernel_size[0] - 1) // 2, + ) + # position-wise + self.w_2 = nn.Conv1d( + d_hid, + d_out, + kernel_size=kernel_size[1], + padding=(kernel_size[1] - 1) // 2, + bias=False) + + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + output = x.transpose(1, 2) + output = F.relu(self.w_1(output)) + output = self.dropout(output) + output = self.w_2(output) + output = output.transpose(1, 2) + + return output + + +class MemoryBlockV2(nn.Module): + + def __init__(self, d, filter_size, shift, dropout=0.0): + super(MemoryBlockV2, self).__init__() + + left_padding = int(round((filter_size - 1) / 2)) + right_padding = int((filter_size - 1) / 2) + if shift > 0: + left_padding += shift + right_padding -= shift + + self.lp, self.rp = left_padding, right_padding + + self.conv_dw = nn.Conv1d(d, d, filter_size, 1, 0, groups=d, bias=False) + self.dropout = nn.Dropout(dropout) + + def forward(self, input, mask=None): + if mask is not None: + input = input.masked_fill(mask.unsqueeze(-1), 0) + + x = F.pad( + input, (0, 0, self.lp, self.rp, 0, 0), mode='constant', value=0.0) + output = self.conv_dw(x.contiguous().transpose( + 1, 2)).contiguous().transpose(1, 2) + output += input + output = self.dropout(output) + + if mask is not None: + output = output.masked_fill(mask.unsqueeze(-1), 0) + + return output + + +class FsmnEncoderV2(nn.Module): + + def __init__(self, + filter_size, + fsmn_num_layers, + input_dim, + num_memory_units, + ffn_inner_dim, + dropout=0.0, + shift=0): + super(FsmnEncoderV2, self).__init__() + + self.filter_size = filter_size + self.fsmn_num_layers = fsmn_num_layers + self.num_memory_units = num_memory_units + self.ffn_inner_dim = ffn_inner_dim + self.dropout = dropout + self.shift = shift + if not isinstance(shift, list): + self.shift = [shift for _ in range(self.fsmn_num_layers)] + + self.ffn_lst = nn.ModuleList() + self.ffn_lst.append( + FeedForwardNet( + input_dim, ffn_inner_dim, num_memory_units, dropout=dropout)) + for i in range(1, fsmn_num_layers): + self.ffn_lst.append( + FeedForwardNet( + num_memory_units, + ffn_inner_dim, + num_memory_units, + dropout=dropout)) + + self.memory_block_lst = nn.ModuleList() + for i in range(fsmn_num_layers): + self.memory_block_lst.append( + MemoryBlockV2(num_memory_units, filter_size, self.shift[i], + dropout)) + + def forward(self, input, mask=None): + x = F.dropout(input, self.dropout, self.training) + for (ffn, memory_block) in zip(self.ffn_lst, self.memory_block_lst): + context = ffn(x) + memory = memory_block(context, mask) + memory = F.dropout(memory, self.dropout, self.training) + if (memory.size(-1) == x.size(-1)): + memory += x + x = memory + + return x diff --git a/modelscope/models/audio/tts/models/models/sambert/kantts_sambert.py b/modelscope/models/audio/tts/models/models/sambert/kantts_sambert.py new file mode 100644 index 00000000..3837a2e8 --- /dev/null +++ b/modelscope/models/audio/tts/models/models/sambert/kantts_sambert.py @@ -0,0 +1,718 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
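+# KAN-TTS SAMBERT acoustic model: a self-attention (FFT-block) text encoder,
+# a variance adaptor (duration/pitch/energy prediction with length regulation),
+# a PNCA-attention mel decoder and an FSMN+LSTM postnet, assembled in
+# KanTtsSAMBERT below.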
+ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.models.audio.tts.models.utils import get_mask_from_lengths +from .adaptors import (LengthRegulator, VarFsmnRnnNARPredictor, + VarRnnARPredictor) +from .base import FFTBlock, PNCABlock, Prenet +from .fsmn import FsmnEncoderV2 +from .positions import DurSinusoidalPositionEncoder, SinusoidalPositionEncoder + + +class SelfAttentionEncoder(nn.Module): + + def __init__(self, n_layer, d_in, d_model, n_head, d_head, d_inner, + dropout, dropout_att, dropout_relu, position_encoder): + super(SelfAttentionEncoder, self).__init__() + + self.d_in = d_in + self.d_model = d_model + self.dropout = dropout + d_in_lst = [d_in] + [d_model] * (n_layer - 1) + self.fft = nn.ModuleList([ + FFTBlock(d, d_model, n_head, d_head, d_inner, (3, 1), dropout, + dropout_att, dropout_relu) for d in d_in_lst + ]) + self.ln = nn.LayerNorm(d_model, eps=1e-6) + self.position_enc = position_encoder + + def forward(self, input, mask=None, return_attns=False): + input *= self.d_model**0.5 + if (isinstance(self.position_enc, SinusoidalPositionEncoder)): + input = self.position_enc(input) + else: + raise NotImplementedError('modelscope error: position_enc invalid') + + input = F.dropout(input, p=self.dropout, training=self.training) + + enc_slf_attn_list = [] + max_len = input.size(1) + if mask is not None: + slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1) + else: + slf_attn_mask = None + + enc_output = input + for id, layer in enumerate(self.fft): + enc_output, enc_slf_attn = layer( + enc_output, mask=mask, slf_attn_mask=slf_attn_mask) + if return_attns: + enc_slf_attn_list += [enc_slf_attn] + + enc_output = self.ln(enc_output) + + return enc_output, enc_slf_attn_list + + +class HybridAttentionDecoder(nn.Module): + + def __init__(self, d_in, prenet_units, n_layer, d_model, d_mem, n_head, + d_head, d_inner, dropout, dropout_att, dropout_relu, d_out): + super(HybridAttentionDecoder, self).__init__() + + self.d_model = d_model + self.dropout = dropout + self.prenet = Prenet(d_in, prenet_units, d_model) + self.dec_in_proj = nn.Linear(d_model + d_mem, d_model) + self.pnca = nn.ModuleList([ + PNCABlock(d_model, d_mem, n_head, d_head, d_inner, (1, 1), dropout, + dropout_att, dropout_relu) for _ in range(n_layer) + ]) + self.ln = nn.LayerNorm(d_model, eps=1e-6) + self.dec_out_proj = nn.Linear(d_model, d_out) + + def reset_state(self): + for layer in self.pnca: + layer.reset_state() + + def get_pnca_attn_mask(self, + device, + max_len, + x_band_width, + h_band_width, + mask=None): + if mask is not None: + pnca_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1) + else: + pnca_attn_mask = None + + range_ = torch.arange(max_len).to(device) + x_start = torch.clamp_min(range_ - x_band_width, 0)[None, None, :] + x_end = (range_ + 1)[None, None, :] + h_start = range_[None, None, :] + h_end = torch.clamp_max(range_ + h_band_width + 1, + max_len + 1)[None, None, :] + + pnca_x_attn_mask = ~((x_start <= range_[None, :, None]) + & (x_end > range_[None, :, None])).transpose(1, 2) # yapf:disable + pnca_h_attn_mask = ~((h_start <= range_[None, :, None]) + & (h_end > range_[None, :, None])).transpose(1, 2) # yapf:disable + + if pnca_attn_mask is not None: + pnca_x_attn_mask = (pnca_x_attn_mask | pnca_attn_mask) + pnca_h_attn_mask = (pnca_h_attn_mask | pnca_attn_mask) + pnca_x_attn_mask = pnca_x_attn_mask.masked_fill( + pnca_attn_mask.transpose(1, 2), False) + pnca_h_attn_mask = pnca_h_attn_mask.masked_fill( + 
pnca_attn_mask.transpose(1, 2), False) + + return pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask + + # must call reset_state before + def forward(self, + input, + memory, + x_band_width, + h_band_width, + mask=None, + return_attns=False): + input = self.prenet(input) + input = torch.cat([memory, input], dim=-1) + input = self.dec_in_proj(input) + + if mask is not None: + input = input.masked_fill(mask.unsqueeze(-1), 0) + + input *= self.d_model**0.5 + input = F.dropout(input, p=self.dropout, training=self.training) + + max_len = input.size(1) + pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask = self.get_pnca_attn_mask( + input.device, max_len, x_band_width, h_band_width, mask) + + dec_pnca_attn_x_list = [] + dec_pnca_attn_h_list = [] + dec_output = input + for id, layer in enumerate(self.pnca): + dec_output, dec_pnca_attn_x, dec_pnca_attn_h = layer( + dec_output, + memory, + mask=mask, + pnca_x_attn_mask=pnca_x_attn_mask, + pnca_h_attn_mask=pnca_h_attn_mask) + if return_attns: + dec_pnca_attn_x_list += [dec_pnca_attn_x] + dec_pnca_attn_h_list += [dec_pnca_attn_h] + + dec_output = self.ln(dec_output) + dec_output = self.dec_out_proj(dec_output) + + return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list + + # must call reset_state before when step == 0 + def infer(self, + step, + input, + memory, + x_band_width, + h_band_width, + mask=None, + return_attns=False): + max_len = memory.size(1) + + input = self.prenet(input) + input = torch.cat([memory[:, step:step + 1, :], input], dim=-1) + input = self.dec_in_proj(input) + + input *= self.d_model**0.5 + input = F.dropout(input, p=self.dropout, training=self.training) + + pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask = self.get_pnca_attn_mask( + input.device, max_len, x_band_width, h_band_width, mask) + + dec_pnca_attn_x_list = [] + dec_pnca_attn_h_list = [] + dec_output = input + for id, layer in enumerate(self.pnca): + if mask is not None: + mask_step = mask[:, step:step + 1] + else: + mask_step = None + dec_output, dec_pnca_attn_x, dec_pnca_attn_h = layer( + dec_output, + memory, + mask=mask_step, + pnca_x_attn_mask=pnca_x_attn_mask[:, + step:step + 1, :(step + 1)], + pnca_h_attn_mask=pnca_h_attn_mask[:, step:step + 1, :]) + if return_attns: + dec_pnca_attn_x_list += [dec_pnca_attn_x] + dec_pnca_attn_h_list += [dec_pnca_attn_h] + + dec_output = self.ln(dec_output) + dec_output = self.dec_out_proj(dec_output) + + return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list + + +class TextFftEncoder(nn.Module): + + def __init__(self, config, ling_unit_size): + super(TextFftEncoder, self).__init__() + + # linguistic unit lookup table + nb_ling_sy = ling_unit_size['sy'] + nb_ling_tone = ling_unit_size['tone'] + nb_ling_syllable_flag = ling_unit_size['syllable_flag'] + nb_ling_ws = ling_unit_size['word_segment'] + + max_len = config['am']['max_len'] + + d_emb = config['am']['embedding_dim'] + nb_layers = config['am']['encoder_num_layers'] + nb_heads = config['am']['encoder_num_heads'] + d_model = config['am']['encoder_num_units'] + d_head = d_model // nb_heads + d_inner = config['am']['encoder_ffn_inner_dim'] + dropout = config['am']['encoder_dropout'] + dropout_attn = config['am']['encoder_attention_dropout'] + dropout_relu = config['am']['encoder_relu_dropout'] + d_proj = config['am']['encoder_projection_units'] + + self.d_model = d_model + + self.sy_emb = nn.Embedding(nb_ling_sy, d_emb) + self.tone_emb = nn.Embedding(nb_ling_tone, d_emb) + self.syllable_flag_emb = nn.Embedding(nb_ling_syllable_flag, d_emb) + self.ws_emb = 
nn.Embedding(nb_ling_ws, d_emb) + + position_enc = SinusoidalPositionEncoder(max_len, d_emb) + + self.ling_enc = SelfAttentionEncoder(nb_layers, d_emb, d_model, + nb_heads, d_head, d_inner, + dropout, dropout_attn, + dropout_relu, position_enc) + + self.ling_proj = nn.Linear(d_model, d_proj, bias=False) + + def forward(self, inputs_ling, masks=None, return_attns=False): + # Parse inputs_ling_seq + inputs_sy = inputs_ling[:, :, 0] + inputs_tone = inputs_ling[:, :, 1] + inputs_syllable_flag = inputs_ling[:, :, 2] + inputs_ws = inputs_ling[:, :, 3] + + # Lookup table + sy_embedding = self.sy_emb(inputs_sy) + tone_embedding = self.tone_emb(inputs_tone) + syllable_flag_embedding = self.syllable_flag_emb(inputs_syllable_flag) + ws_embedding = self.ws_emb(inputs_ws) + + ling_embedding = sy_embedding + tone_embedding + syllable_flag_embedding + ws_embedding + + enc_output, enc_slf_attn_list = self.ling_enc(ling_embedding, masks, + return_attns) + + enc_output = self.ling_proj(enc_output) + + return enc_output, enc_slf_attn_list + + +class VarianceAdaptor(nn.Module): + + def __init__(self, config): + super(VarianceAdaptor, self).__init__() + + input_dim = config['am']['encoder_projection_units'] + config['am'][ + 'emotion_units'] + config['am']['speaker_units'] + filter_size = config['am']['predictor_filter_size'] + fsmn_num_layers = config['am']['predictor_fsmn_num_layers'] + num_memory_units = config['am']['predictor_num_memory_units'] + ffn_inner_dim = config['am']['predictor_ffn_inner_dim'] + dropout = config['am']['predictor_dropout'] + shift = config['am']['predictor_shift'] + lstm_units = config['am']['predictor_lstm_units'] + + dur_pred_prenet_units = config['am']['dur_pred_prenet_units'] + dur_pred_lstm_units = config['am']['dur_pred_lstm_units'] + + self.pitch_predictor = VarFsmnRnnNARPredictor(input_dim, filter_size, + fsmn_num_layers, + num_memory_units, + ffn_inner_dim, dropout, + shift, lstm_units) + self.energy_predictor = VarFsmnRnnNARPredictor(input_dim, filter_size, + fsmn_num_layers, + num_memory_units, + ffn_inner_dim, dropout, + shift, lstm_units) + self.duration_predictor = VarRnnARPredictor(input_dim, + dur_pred_prenet_units, + dur_pred_lstm_units) + + self.length_regulator = LengthRegulator( + config['am']['outputs_per_step']) + self.dur_position_encoder = DurSinusoidalPositionEncoder( + config['am']['encoder_projection_units'], + config['am']['outputs_per_step']) + + self.pitch_emb = nn.Conv1d( + 1, + config['am']['encoder_projection_units'], + kernel_size=9, + padding=4) + self.energy_emb = nn.Conv1d( + 1, + config['am']['encoder_projection_units'], + kernel_size=9, + padding=4) + + def forward(self, + inputs_text_embedding, + inputs_emo_embedding, + inputs_spk_embedding, + masks=None, + output_masks=None, + duration_targets=None, + pitch_targets=None, + energy_targets=None): + + batch_size = inputs_text_embedding.size(0) + + variance_predictor_inputs = torch.cat([ + inputs_text_embedding, inputs_spk_embedding, inputs_emo_embedding + ], dim=-1) # yapf:disable + + pitch_predictions = self.pitch_predictor(variance_predictor_inputs, + masks) + energy_predictions = self.energy_predictor(variance_predictor_inputs, + masks) + + if pitch_targets is not None: + pitch_embeddings = self.pitch_emb( + pitch_targets.unsqueeze(1)).transpose(1, 2) + else: + pitch_embeddings = self.pitch_emb( + pitch_predictions.unsqueeze(1)).transpose(1, 2) + + if energy_targets is not None: + energy_embeddings = self.energy_emb( + energy_targets.unsqueeze(1)).transpose(1, 2) + else: + energy_embeddings 
= self.energy_emb( + energy_predictions.unsqueeze(1)).transpose(1, 2) + + inputs_text_embedding_aug = inputs_text_embedding + pitch_embeddings + energy_embeddings + duration_predictor_cond = torch.cat([ + inputs_text_embedding_aug, inputs_spk_embedding, + inputs_emo_embedding + ], dim=-1) # yapf:disable + if duration_targets is not None: + duration_predictor_go_frame = torch.zeros(batch_size, 1).to( + inputs_text_embedding.device) + duration_predictor_input = torch.cat([ + duration_predictor_go_frame, duration_targets[:, :-1].float() + ], dim=-1) # yapf:disable + duration_predictor_input = torch.log(duration_predictor_input + 1) + log_duration_predictions, _ = self.duration_predictor( + duration_predictor_input.unsqueeze(-1), + duration_predictor_cond, + masks=masks) + duration_predictions = torch.exp(log_duration_predictions) - 1 + else: + log_duration_predictions = self.duration_predictor.infer( + duration_predictor_cond, masks=masks) + duration_predictions = torch.exp(log_duration_predictions) - 1 + + if duration_targets is not None: + LR_text_outputs, LR_length_rounded = self.length_regulator( + inputs_text_embedding_aug, + duration_targets, + masks=output_masks) + LR_position_embeddings = self.dur_position_encoder( + duration_targets, masks=output_masks) + LR_emo_outputs, _ = self.length_regulator( + inputs_emo_embedding, duration_targets, masks=output_masks) + LR_spk_outputs, _ = self.length_regulator( + inputs_spk_embedding, duration_targets, masks=output_masks) + + else: + LR_text_outputs, LR_length_rounded = self.length_regulator( + inputs_text_embedding_aug, + duration_predictions, + masks=output_masks) + LR_position_embeddings = self.dur_position_encoder( + duration_predictions, masks=output_masks) + LR_emo_outputs, _ = self.length_regulator( + inputs_emo_embedding, duration_predictions, masks=output_masks) + LR_spk_outputs, _ = self.length_regulator( + inputs_spk_embedding, duration_predictions, masks=output_masks) + + LR_text_outputs = LR_text_outputs + LR_position_embeddings + + return (LR_text_outputs, LR_emo_outputs, LR_spk_outputs, + LR_length_rounded, log_duration_predictions, pitch_predictions, + energy_predictions) + + +class MelPNCADecoder(nn.Module): + + def __init__(self, config): + super(MelPNCADecoder, self).__init__() + + prenet_units = config['am']['decoder_prenet_units'] + nb_layers = config['am']['decoder_num_layers'] + nb_heads = config['am']['decoder_num_heads'] + d_model = config['am']['decoder_num_units'] + d_head = d_model // nb_heads + d_inner = config['am']['decoder_ffn_inner_dim'] + dropout = config['am']['decoder_dropout'] + dropout_attn = config['am']['decoder_attention_dropout'] + dropout_relu = config['am']['decoder_relu_dropout'] + outputs_per_step = config['am']['outputs_per_step'] + + d_mem = config['am'][ + 'encoder_projection_units'] * outputs_per_step + config['am'][ + 'emotion_units'] + config['am']['speaker_units'] + d_mel = config['am']['num_mels'] + + self.d_mel = d_mel + self.r = outputs_per_step + self.nb_layers = nb_layers + + self.mel_dec = HybridAttentionDecoder(d_mel, prenet_units, nb_layers, + d_model, d_mem, nb_heads, d_head, + d_inner, dropout, dropout_attn, + dropout_relu, + d_mel * outputs_per_step) + + def forward(self, + memory, + x_band_width, + h_band_width, + target=None, + mask=None, + return_attns=False): + batch_size = memory.size(0) + go_frame = torch.zeros((batch_size, 1, self.d_mel)).to(memory.device) + + if target is not None: + self.mel_dec.reset_state() + input = target[:, self.r - 1::self.r, :] + input = 
torch.cat([go_frame, input], dim=1)[:, :-1, :] + dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list = self.mel_dec( + input, + memory, + x_band_width, + h_band_width, + mask=mask, + return_attns=return_attns) + + else: + dec_output = [] + dec_pnca_attn_x_list = [[] for _ in range(self.nb_layers)] + dec_pnca_attn_h_list = [[] for _ in range(self.nb_layers)] + self.mel_dec.reset_state() + input = go_frame + for step in range(memory.size(1)): + dec_output_step, dec_pnca_attn_x_step, dec_pnca_attn_h_step = self.mel_dec.infer( + step, + input, + memory, + x_band_width, + h_band_width, + mask=mask, + return_attns=return_attns) + input = dec_output_step[:, :, -self.d_mel:] + + dec_output.append(dec_output_step) + for layer_id, (pnca_x_attn, pnca_h_attn) in enumerate( + zip(dec_pnca_attn_x_step, dec_pnca_attn_h_step)): + left = memory.size(1) - pnca_x_attn.size(-1) + if (left > 0): + padding = torch.zeros( + (pnca_x_attn.size(0), 1, left)).to(pnca_x_attn) + pnca_x_attn = torch.cat([pnca_x_attn, padding], dim=-1) + dec_pnca_attn_x_list[layer_id].append(pnca_x_attn) + dec_pnca_attn_h_list[layer_id].append(pnca_h_attn) + + dec_output = torch.cat(dec_output, dim=1) + for layer_id in range(self.nb_layers): + dec_pnca_attn_x_list[layer_id] = torch.cat( + dec_pnca_attn_x_list[layer_id], dim=1) + dec_pnca_attn_h_list[layer_id] = torch.cat( + dec_pnca_attn_h_list[layer_id], dim=1) + + return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list + + +class PostNet(nn.Module): + + def __init__(self, config): + super(PostNet, self).__init__() + + self.filter_size = config['am']['postnet_filter_size'] + self.fsmn_num_layers = config['am']['postnet_fsmn_num_layers'] + self.num_memory_units = config['am']['postnet_num_memory_units'] + self.ffn_inner_dim = config['am']['postnet_ffn_inner_dim'] + self.dropout = config['am']['postnet_dropout'] + self.shift = config['am']['postnet_shift'] + self.lstm_units = config['am']['postnet_lstm_units'] + self.num_mels = config['am']['num_mels'] + + self.fsmn = FsmnEncoderV2(self.filter_size, self.fsmn_num_layers, + self.num_mels, self.num_memory_units, + self.ffn_inner_dim, self.dropout, self.shift) + self.lstm = nn.LSTM( + self.num_memory_units, + self.lstm_units, + num_layers=1, + batch_first=True) + self.fc = nn.Linear(self.lstm_units, self.num_mels) + + def forward(self, x, mask=None): + postnet_fsmn_output = self.fsmn(x, mask) + # The input can also be a packed variable length sequence, + # here we just omit it for simpliciy due to the mask and uni-directional lstm. 
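+        # Padding sits at the tail of each sequence and the LSTM is uni-directional,
+        # so padded frames only affect outputs that the caller masks out afterwards.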
+ postnet_lstm_output, _ = self.lstm(postnet_fsmn_output) + mel_residual_output = self.fc(postnet_lstm_output) + + return mel_residual_output + + +def mel_recon_loss_fn(output_lengths, + mel_targets, + dec_outputs, + postnet_outputs=None): + mae_loss = nn.L1Loss(reduction='none') + + output_masks = get_mask_from_lengths( + output_lengths, max_len=mel_targets.size(1)) + output_masks = ~output_masks + valid_outputs = output_masks.sum() + + mel_loss_ = torch.sum( + mae_loss(mel_targets, dec_outputs) * output_masks.unsqueeze(-1)) / ( + valid_outputs * mel_targets.size(-1)) + + if postnet_outputs is not None: + mel_loss = torch.sum( + mae_loss(mel_targets, postnet_outputs) + * output_masks.unsqueeze(-1)) / ( + valid_outputs * mel_targets.size(-1)) + else: + mel_loss = 0.0 + + return mel_loss_, mel_loss + + +def prosody_recon_loss_fn(input_lengths, duration_targets, pitch_targets, + energy_targets, log_duration_predictions, + pitch_predictions, energy_predictions): + mae_loss = nn.L1Loss(reduction='none') + + input_masks = get_mask_from_lengths( + input_lengths, max_len=duration_targets.size(1)) + input_masks = ~input_masks + valid_inputs = input_masks.sum() + + dur_loss = torch.sum( + mae_loss( + torch.log(duration_targets.float() + 1), log_duration_predictions) + * input_masks) / valid_inputs + pitch_loss = torch.sum( + mae_loss(pitch_targets, pitch_predictions) + * input_masks) / valid_inputs + energy_loss = torch.sum( + mae_loss(energy_targets, energy_predictions) + * input_masks) / valid_inputs + + return dur_loss, pitch_loss, energy_loss + + +class KanTtsSAMBERT(nn.Module): + + def __init__(self, config, ling_unit_size): + super(KanTtsSAMBERT, self).__init__() + + self.text_encoder = TextFftEncoder(config, ling_unit_size) + self.spk_tokenizer = nn.Embedding(ling_unit_size['speaker'], + config['am']['speaker_units']) + self.emo_tokenizer = nn.Embedding(ling_unit_size['emotion'], + config['am']['emotion_units']) + self.variance_adaptor = VarianceAdaptor(config) + self.mel_decoder = MelPNCADecoder(config) + self.mel_postnet = PostNet(config) + + def get_lfr_mask_from_lengths(self, lengths, max_len): + batch_size = lengths.size(0) + # padding according to the outputs_per_step + padded_lr_lengths = torch.zeros_like(lengths) + for i in range(batch_size): + len_item = int(lengths[i].item()) + padding = self.mel_decoder.r - len_item % self.mel_decoder.r + if (padding < self.mel_decoder.r): + padded_lr_lengths[i] = (len_item + + padding) // self.mel_decoder.r + else: + padded_lr_lengths[i] = len_item // self.mel_decoder.r + + return get_mask_from_lengths( + padded_lr_lengths, max_len=max_len // self.mel_decoder.r) + + def forward(self, + inputs_ling, + inputs_emotion, + inputs_speaker, + input_lengths, + output_lengths=None, + mel_targets=None, + duration_targets=None, + pitch_targets=None, + energy_targets=None): + + batch_size = inputs_ling.size(0) + + input_masks = get_mask_from_lengths( + input_lengths, max_len=inputs_ling.size(1)) + + text_hid, enc_sla_attn_lst = self.text_encoder( + inputs_ling, input_masks, return_attns=True) + + emo_hid = self.emo_tokenizer(inputs_emotion) + spk_hid = self.spk_tokenizer(inputs_speaker) + + if output_lengths is not None: + output_masks = get_mask_from_lengths( + output_lengths, max_len=mel_targets.size(1)) + else: + output_masks = None + + (LR_text_outputs, LR_emo_outputs, LR_spk_outputs, LR_length_rounded, + log_duration_predictions, pitch_predictions, + energy_predictions) = self.variance_adaptor( + text_hid, + emo_hid, + spk_hid, + masks=input_masks, + 
output_masks=output_masks, + duration_targets=duration_targets, + pitch_targets=pitch_targets, + energy_targets=energy_targets) + + if output_lengths is not None: + lfr_masks = self.get_lfr_mask_from_lengths( + output_lengths, max_len=LR_text_outputs.size(1)) + else: + output_masks = get_mask_from_lengths( + LR_length_rounded, max_len=LR_text_outputs.size(1)) + lfr_masks = None + + # LFR with the factor of outputs_per_step + LFR_text_inputs = LR_text_outputs.contiguous().view( + batch_size, -1, self.mel_decoder.r * text_hid.shape[-1]) + LFR_emo_inputs = LR_emo_outputs.contiguous().view( + batch_size, -1, + self.mel_decoder.r * emo_hid.shape[-1])[:, :, :emo_hid.shape[-1]] + LFR_spk_inputs = LR_spk_outputs.contiguous().view( + batch_size, -1, + self.mel_decoder.r * spk_hid.shape[-1])[:, :, :spk_hid.shape[-1]] + + memory = torch.cat([LFR_text_inputs, LFR_spk_inputs, LFR_emo_inputs], + dim=-1) + + if duration_targets is not None: + x_band_width = int( + duration_targets.float().masked_fill(input_masks, 0).max() + / self.mel_decoder.r + 0.5) + h_band_width = x_band_width + else: + x_band_width = int((torch.exp(log_duration_predictions) - 1).max() + / self.mel_decoder.r + 0.5) + h_band_width = x_band_width + + dec_outputs, pnca_x_attn_lst, pnca_h_attn_lst = self.mel_decoder( + memory, + x_band_width, + h_band_width, + target=mel_targets, + mask=lfr_masks, + return_attns=True) + + # De-LFR with the factor of outputs_per_step + dec_outputs = dec_outputs.contiguous().view(batch_size, -1, + self.mel_decoder.d_mel) + + if output_masks is not None: + dec_outputs = dec_outputs.masked_fill( + output_masks.unsqueeze(-1), 0) + + postnet_outputs = self.mel_postnet(dec_outputs, + output_masks) + dec_outputs + if output_masks is not None: + postnet_outputs = postnet_outputs.masked_fill( + output_masks.unsqueeze(-1), 0) + + res = { + 'x_band_width': x_band_width, + 'h_band_width': h_band_width, + 'enc_slf_attn_lst': enc_sla_attn_lst, + 'pnca_x_attn_lst': pnca_x_attn_lst, + 'pnca_h_attn_lst': pnca_h_attn_lst, + 'dec_outputs': dec_outputs, + 'postnet_outputs': postnet_outputs, + 'LR_length_rounded': LR_length_rounded, + 'log_duration_predictions': log_duration_predictions, + 'pitch_predictions': pitch_predictions, + 'energy_predictions': energy_predictions + } + + res['LR_text_outputs'] = LR_text_outputs + res['LR_emo_outputs'] = LR_emo_outputs + res['LR_spk_outputs'] = LR_spk_outputs + + return res diff --git a/modelscope/models/audio/tts/models/models/sambert/positions.py b/modelscope/models/audio/tts/models/models/sambert/positions.py new file mode 100644 index 00000000..9d1e375d --- /dev/null +++ b/modelscope/models/audio/tts/models/models/sambert/positions.py @@ -0,0 +1,101 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
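+# Sinusoidal position encodings: SinusoidalPositionEncoder adds fixed sin/cos
+# embeddings to encoder inputs, while DurSinusoidalPositionEncoder derives
+# frame-level positions from durations for the length-regulated decoder inputs.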
+ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class SinusoidalPositionEncoder(nn.Module): + + def __init__(self, max_len, depth): + super(SinusoidalPositionEncoder, self).__init__() + + self.max_len = max_len + self.depth = depth + self.position_enc = nn.Parameter( + self.get_sinusoid_encoding_table(max_len, depth).unsqueeze(0), + requires_grad=False) + + def forward(self, input): + bz_in, len_in, _ = input.size() + if len_in > self.max_len: + self.max_len = len_in + self.position_enc.data = self.get_sinusoid_encoding_table( + self.max_len, self.depth).unsqueeze(0).to(input.device) + + output = input + self.position_enc[:, :len_in, :].expand(bz_in, -1, -1) + + return output + + @staticmethod + def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): + """ Sinusoid position encoding table """ + + def cal_angle(position, hid_idx): + return position / np.power(10000, hid_idx / float(d_hid / 2 - 1)) + + def get_posi_angle_vec(position): + return [cal_angle(position, hid_j) for hid_j in range(d_hid // 2)] + + scaled_time_table = np.array( + [get_posi_angle_vec(pos_i + 1) for pos_i in range(n_position)]) + + sinusoid_table = np.zeros((n_position, d_hid)) + sinusoid_table[:, :d_hid // 2] = np.sin(scaled_time_table) + sinusoid_table[:, d_hid // 2:] = np.cos(scaled_time_table) + + if padding_idx is not None: + # zero vector for padding dimension + sinusoid_table[padding_idx] = 0.0 + + return torch.FloatTensor(sinusoid_table) + + +class DurSinusoidalPositionEncoder(nn.Module): + + def __init__(self, depth, outputs_per_step): + super(DurSinusoidalPositionEncoder, self).__init__() + + self.depth = depth + self.outputs_per_step = outputs_per_step + + inv_timescales = [ + np.power(10000, 2 * (hid_idx // 2) / depth) + for hid_idx in range(depth) + ] + self.inv_timescales = nn.Parameter( + torch.FloatTensor(inv_timescales), requires_grad=False) + + def forward(self, durations, masks=None): + reps = (durations + 0.5).long() + output_lens = reps.sum(dim=1) + max_len = output_lens.max() + reps_cumsum = torch.cumsum( + F.pad(reps.float(), (1, 0, 0, 0), value=0.0), dim=1)[:, None, :] + range_ = torch.arange(max_len).to(durations.device)[None, :, None] + mult = ((reps_cumsum[:, :, :-1] <= range_) + & (reps_cumsum[:, :, 1:] > range_)) # yapf:disable + mult = mult.float() + offsets = torch.matmul(mult, + reps_cumsum[:, + 0, :-1].unsqueeze(-1)).squeeze(-1) + dur_pos = range_[:, :, 0] - offsets + 1 + + if masks is not None: + assert masks.size(1) == dur_pos.size(1) + dur_pos = dur_pos.masked_fill(masks, 0.0) + + seq_len = dur_pos.size(1) + padding = self.outputs_per_step - int(seq_len) % self.outputs_per_step + if (padding < self.outputs_per_step): + dur_pos = F.pad(dur_pos, (0, padding, 0, 0), value=0.0) + + position_embedding = dur_pos[:, :, None] / self.inv_timescales[None, + None, :] + position_embedding[:, :, 0::2] = torch.sin(position_embedding[:, :, + 0::2]) + position_embedding[:, :, 1::2] = torch.cos(position_embedding[:, :, + 1::2]) + + return position_embedding diff --git a/modelscope/models/audio/tts/models/position.py b/modelscope/models/audio/tts/models/position.py deleted file mode 100755 index bca658dd..00000000 --- a/modelscope/models/audio/tts/models/position.py +++ /dev/null @@ -1,174 +0,0 @@ -"""Define position encoder classes.""" - -import abc -import math - -import tensorflow as tf - -from .reducer import SumReducer - - -class PositionEncoder(tf.keras.layers.Layer): - """Base class for position encoders.""" - - def __init__(self, 
reducer=None, **kwargs): - """Initializes the position encoder. - Args: - reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position - encodings. Defaults to :class:`opennmt.layers.SumReducer`. - **kwargs: Additional layer keyword arguments. - """ - super(PositionEncoder, self).__init__(**kwargs) - if reducer is None: - reducer = SumReducer(dtype=kwargs.get('dtype')) - self.reducer = reducer - - def call(self, inputs, position=None): # pylint: disable=arguments-differ - """Add position encodings to :obj:`inputs`. - Args: - inputs: The inputs to encode. - position: The single position to encode, to use when this layer is called - step by step. - Returns: - A ``tf.Tensor`` whose shape depends on the configured ``reducer``. - """ - batch_size = tf.shape(inputs)[0] - timesteps = tf.shape(inputs)[1] - input_dim = inputs.shape[-1].value - positions = tf.range(timesteps) + 1 if position is None else [position] - position_encoding = self._encode([positions], input_dim) - position_encoding = tf.tile(position_encoding, [batch_size, 1, 1]) - return self.reducer([inputs, position_encoding]) - - @abc.abstractmethod - def _encode(self, positions, depth): - """Creates position encodings. - Args: - positions: The positions to encode of shape :math:`[B, ...]`. - depth: The encoding depth :math:`D`. - Returns: - A ``tf.Tensor`` of shape :math:`[B, ..., D]`. - """ - raise NotImplementedError() - - -class PositionEmbedder(PositionEncoder): - """Encodes position with a lookup table.""" - - def __init__(self, maximum_position=128, reducer=None, **kwargs): - """Initializes the position encoder. - Args: - maximum_position: The maximum position to embed. Positions greater - than this value will be set to :obj:`maximum_position`. - reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position - encodings. Defaults to :class:`opennmt.layers.SumReducer`. - **kwargs: Additional layer keyword arguments. - """ - super(PositionEmbedder, self).__init__(reducer=reducer, **kwargs) - self.maximum_position = maximum_position - self.embedding = None - - def build(self, input_shape): - shape = [self.maximum_position + 1, input_shape[-1]] - self.embedding = self.add_weight('position_embedding', shape) - super(PositionEmbedder, self).build(input_shape) - - def _encode(self, positions, depth): - positions = tf.minimum(positions, self.maximum_position) - return tf.nn.embedding_lookup(self.embedding, positions) - - -class SinusoidalPositionEncoder(PositionEncoder): - """Encodes positions with sine waves as described in - https://arxiv.org/abs/1706.03762. 
- """ - - def _encode(self, positions, depth): - if depth % 2 != 0: - raise ValueError( - 'SinusoidalPositionEncoder expects the depth to be divisble ' - 'by 2 but got %d' % depth) - - batch_size = tf.shape(positions)[0] - positions = tf.cast(positions, tf.float32) - - log_timescale_increment = math.log(10000) / (depth / 2 - 1) - inv_timescales = tf.exp( - tf.range(depth / 2, dtype=tf.float32) * -log_timescale_increment) - inv_timescales = tf.reshape( - tf.tile(inv_timescales, [batch_size]), [batch_size, depth // 2]) - scaled_time = tf.expand_dims(positions, -1) * tf.expand_dims( - inv_timescales, 1) - encoding = tf.concat( - [tf.sin(scaled_time), tf.cos(scaled_time)], axis=2) - return tf.cast(encoding, self.dtype) - - -class SinusodalPositionalEncoding(tf.keras.layers.Layer): - - def __init__(self, name='SinusodalPositionalEncoding'): - super(SinusodalPositionalEncoding, self).__init__(name=name) - - @staticmethod - def positional_encoding(len, dim, step=1.): - """ - :param len: int scalar - :param dim: int scalar - :param step: - :return: position embedding - """ - pos_mat = tf.tile( - tf.expand_dims( - tf.range(0, tf.cast(len, dtype=tf.float32), dtype=tf.float32) - * step, - axis=-1), [1, dim]) - dim_mat = tf.tile( - tf.expand_dims( - tf.range(0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32), - axis=0), [len, 1]) - dim_mat_int = tf.cast(dim_mat, dtype=tf.int32) - pos_encoding = tf.where( # [time, dims] - tf.math.equal(tf.math.mod(dim_mat_int, 2), 0), - x=tf.math.sin( - pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))), - y=tf.math.cos(pos_mat - / tf.pow(10000., - (dim_mat - 1) / tf.cast(dim, tf.float32)))) - return pos_encoding - - -class BatchSinusodalPositionalEncoding(tf.keras.layers.Layer): - - def __init__(self, name='BatchSinusodalPositionalEncoding'): - super(BatchSinusodalPositionalEncoding, self).__init__(name=name) - - @staticmethod - def positional_encoding(batch_size, len, dim, pos_mat, step=1.): - """ - :param len: int scalar - :param dim: int scalar - :param step: - :param pos_mat: [B, len] = [len, 1] * dim - :return: position embedding - """ - pos_mat = tf.tile( - tf.expand_dims(tf.cast(pos_mat, dtype=tf.float32) * step, axis=-1), - [1, 1, dim]) # [B, len, dim] - - dim_mat = tf.tile( - tf.expand_dims( - tf.expand_dims( - tf.range( - 0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32), - axis=0), - axis=0), [batch_size, len, 1]) # [B, len, dim] - - dim_mat_int = tf.cast(dim_mat, dtype=tf.int32) - pos_encoding = tf.where( # [B, time, dims] - tf.math.equal(tf.mod(dim_mat_int, 2), 0), - x=tf.math.sin( - pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))), - y=tf.math.cos(pos_mat - / tf.pow(10000., - (dim_mat - 1) / tf.cast(dim, tf.float32)))) - return pos_encoding diff --git a/modelscope/models/audio/tts/models/reducer.py b/modelscope/models/audio/tts/models/reducer.py deleted file mode 100755 index a4c9ae17..00000000 --- a/modelscope/models/audio/tts/models/reducer.py +++ /dev/null @@ -1,155 +0,0 @@ -"""Define reducers: objects that merge inputs.""" - -import abc -import functools - -import tensorflow as tf - - -def pad_in_time(x, padding_length): - """Helper function to pad a tensor in the time dimension and retain the static depth dimension.""" - return tf.pad(x, [[0, 0], [0, padding_length], [0, 0]]) - - -def align_in_time(x, length): - """Aligns the time dimension of :obj:`x` with :obj:`length`.""" - time_dim = tf.shape(x)[1] - return tf.cond( - tf.less(time_dim, length), - true_fn=lambda: pad_in_time(x, length - time_dim), - false_fn=lambda: 
x[:, :length]) - - -def pad_with_identity(x, - sequence_length, - max_sequence_length, - identity_values=0, - maxlen=None): - """Pads a tensor with identity values up to :obj:`max_sequence_length`. - Args: - x: A ``tf.Tensor`` of shape ``[batch_size, time, depth]``. - sequence_length: The true sequence length of :obj:`x`. - max_sequence_length: The sequence length up to which the tensor must contain - :obj:`identity values`. - identity_values: The identity value. - maxlen: Size of the output time dimension. Default is the maximum value in - obj:`max_sequence_length`. - Returns: - A ``tf.Tensor`` of shape ``[batch_size, maxlen, depth]``. - """ - if maxlen is None: - maxlen = tf.reduce_max(max_sequence_length) - - mask = tf.sequence_mask(sequence_length, maxlen=maxlen, dtype=x.dtype) - mask = tf.expand_dims(mask, axis=-1) - mask_combined = tf.sequence_mask( - max_sequence_length, maxlen=maxlen, dtype=x.dtype) - mask_combined = tf.expand_dims(mask_combined, axis=-1) - - identity_mask = mask_combined * (1.0 - mask) - - x = pad_in_time(x, maxlen - tf.shape(x)[1]) - x = x * mask + (identity_mask * identity_values) - - return x - - -def pad_n_with_identity(inputs, sequence_lengths, identity_values=0): - """Pads each input tensors with identity values up to - ``max(sequence_lengths)`` for each batch. - Args: - inputs: A list of ``tf.Tensor``. - sequence_lengths: A list of sequence length. - identity_values: The identity value. - Returns: - A tuple ``(padded, max_sequence_length)`` which are respectively a list of - ``tf.Tensor`` where each tensor are padded with identity and the combined - sequence length. - """ - max_sequence_length = tf.reduce_max(sequence_lengths, axis=0) - maxlen = tf.reduce_max([tf.shape(x)[1] for x in inputs]) - padded = [ - pad_with_identity( - x, - length, - max_sequence_length, - identity_values=identity_values, - maxlen=maxlen) for x, length in zip(inputs, sequence_lengths) - ] - return padded, max_sequence_length - - -class Reducer(tf.keras.layers.Layer): - """Base class for reducers.""" - - def zip_and_reduce(self, x, y): - """Zips the :obj:`x` with :obj:`y` structures together and reduces all - elements. If the structures are nested, they will be flattened first. - Args: - x: The first structure. - y: The second structure. - Returns: - The same structure as :obj:`x` and :obj:`y` where each element from - :obj:`x` is reduced with the correspond element from :obj:`y`. - Raises: - ValueError: if the two structures are not the same. - """ - tf.nest.assert_same_structure(x, y) - x_flat = tf.nest.flatten(x) - y_flat = tf.nest.flatten(y) - reduced = list(map(self, zip(x_flat, y_flat))) - return tf.nest.pack_sequence_as(x, reduced) - - def call(self, inputs, sequence_length=None): # pylint: disable=arguments-differ - """Reduces all input elements. - Args: - inputs: A list of ``tf.Tensor``. - sequence_length: The length of each input, if reducing sequences. - Returns: - If :obj:`sequence_length` is set, a tuple - ``(reduced_input, reduced_length)``, otherwise a reduced ``tf.Tensor`` - only. 
- """ - if sequence_length is None: - return self.reduce(inputs) - else: - return self.reduce_sequence( - inputs, sequence_lengths=sequence_length) - - @abc.abstractmethod - def reduce(self, inputs): - """See :meth:`opennmt.layers.Reducer.__call__`.""" - raise NotImplementedError() - - @abc.abstractmethod - def reduce_sequence(self, inputs, sequence_lengths): - """See :meth:`opennmt.layers.Reducer.__call__`.""" - raise NotImplementedError() - - -class SumReducer(Reducer): - """A reducer that sums the inputs.""" - - def reduce(self, inputs): - if len(inputs) == 1: - return inputs[0] - if len(inputs) == 2: - return inputs[0] + inputs[1] - return tf.add_n(inputs) - - def reduce_sequence(self, inputs, sequence_lengths): - padded, combined_length = pad_n_with_identity( - inputs, sequence_lengths, identity_values=0) - return self.reduce(padded), combined_length - - -class MultiplyReducer(Reducer): - """A reducer that multiplies the inputs.""" - - def reduce(self, inputs): - return functools.reduce(lambda a, x: a * x, inputs) - - def reduce_sequence(self, inputs, sequence_lengths): - padded, combined_length = pad_n_with_identity( - inputs, sequence_lengths, identity_values=1) - return self.reduce(padded), combined_length diff --git a/modelscope/models/audio/tts/models/rnn_wrappers.py b/modelscope/models/audio/tts/models/rnn_wrappers.py deleted file mode 100755 index 6c487bab..00000000 --- a/modelscope/models/audio/tts/models/rnn_wrappers.py +++ /dev/null @@ -1,237 +0,0 @@ -import tensorflow as tf -from tensorflow.python.ops import rnn_cell_impl - -from .am_models import prenet - - -class VarPredictorCell(tf.contrib.rnn.RNNCell): - """Wrapper wrapper knock knock.""" - - def __init__(self, var_predictor_cell, is_training, dim, prenet_units): - super(VarPredictorCell, self).__init__() - self._var_predictor_cell = var_predictor_cell - self._is_training = is_training - self._dim = dim - self._prenet_units = prenet_units - - @property - def state_size(self): - return tuple([self.output_size, self._var_predictor_cell.state_size]) - - @property - def output_size(self): - return self._dim - - def zero_state(self, batch_size, dtype): - return tuple([ - rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, - dtype), - self._var_predictor_cell.zero_state(batch_size, dtype) - ]) - - def call(self, inputs, state): - """Run the Tacotron2 super decoder cell.""" - super_cell_out, decoder_state = state - - # split - prenet_input = inputs[:, 0:self._dim] - encoder_output = inputs[:, self._dim:] - - # prenet and concat - prenet_output = prenet( - prenet_input, - self._prenet_units, - self._is_training, - scope='var_prenet') - decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) - - # decoder LSTM/GRU - new_super_cell_out, new_decoder_state = self._var_predictor_cell( - decoder_input, decoder_state) - - # projection - new_super_cell_out = tf.layers.dense( - new_super_cell_out, units=self._dim) - - new_states = tuple([new_super_cell_out, new_decoder_state]) - - return new_super_cell_out, new_states - - -class DurPredictorCell(tf.contrib.rnn.RNNCell): - """Wrapper wrapper knock knock.""" - - def __init__(self, var_predictor_cell, is_training, dim, prenet_units): - super(DurPredictorCell, self).__init__() - self._var_predictor_cell = var_predictor_cell - self._is_training = is_training - self._dim = dim - self._prenet_units = prenet_units - - @property - def state_size(self): - return tuple([self.output_size, self._var_predictor_cell.state_size]) - - @property - def output_size(self): - return 
self._dim - - def zero_state(self, batch_size, dtype): - return tuple([ - rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, - dtype), - self._var_predictor_cell.zero_state(batch_size, dtype) - ]) - - def call(self, inputs, state): - """Run the Tacotron2 super decoder cell.""" - super_cell_out, decoder_state = state - - # split - prenet_input = inputs[:, 0:self._dim] - encoder_output = inputs[:, self._dim:] - - # prenet and concat - prenet_output = prenet( - prenet_input, - self._prenet_units, - self._is_training, - scope='dur_prenet') - decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) - - # decoder LSTM/GRU - new_super_cell_out, new_decoder_state = self._var_predictor_cell( - decoder_input, decoder_state) - - # projection - new_super_cell_out = tf.layers.dense( - new_super_cell_out, units=self._dim) - new_super_cell_out = tf.nn.relu(new_super_cell_out) - # new_super_cell_out = tf.log(tf.cast(tf.round(tf.exp(new_super_cell_out) - 1), tf.float32) + 1) - - new_states = tuple([new_super_cell_out, new_decoder_state]) - - return new_super_cell_out, new_states - - -class DurPredictorCECell(tf.contrib.rnn.RNNCell): - """Wrapper wrapper knock knock.""" - - def __init__(self, var_predictor_cell, is_training, dim, prenet_units, - max_dur, dur_embedding_dim): - super(DurPredictorCECell, self).__init__() - self._var_predictor_cell = var_predictor_cell - self._is_training = is_training - self._dim = dim - self._prenet_units = prenet_units - self._max_dur = max_dur - self._dur_embedding_dim = dur_embedding_dim - - @property - def state_size(self): - return tuple([self.output_size, self._var_predictor_cell.state_size]) - - @property - def output_size(self): - return self._max_dur - - def zero_state(self, batch_size, dtype): - return tuple([ - rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, - dtype), - self._var_predictor_cell.zero_state(batch_size, dtype) - ]) - - def call(self, inputs, state): - """Run the Tacotron2 super decoder cell.""" - super_cell_out, decoder_state = state - - # split - prenet_input = tf.squeeze( - tf.cast(inputs[:, 0:self._dim], tf.int32), axis=-1) # [N] - prenet_input = tf.one_hot( - prenet_input, self._max_dur, on_value=1.0, off_value=0.0, - axis=-1) # [N, 120] - prenet_input = tf.layers.dense( - prenet_input, units=self._dur_embedding_dim) - encoder_output = inputs[:, self._dim:] - - # prenet and concat - prenet_output = prenet( - prenet_input, - self._prenet_units, - self._is_training, - scope='dur_prenet') - decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) - - # decoder LSTM/GRU - new_super_cell_out, new_decoder_state = self._var_predictor_cell( - decoder_input, decoder_state) - - # projection - new_super_cell_out = tf.layers.dense( - new_super_cell_out, units=self._max_dur) # [N, 120] - new_super_cell_out = tf.nn.softmax(new_super_cell_out) # [N, 120] - - new_states = tuple([new_super_cell_out, new_decoder_state]) - - return new_super_cell_out, new_states - - -class VarPredictorCell2(tf.contrib.rnn.RNNCell): - """Wrapper wrapper knock knock.""" - - def __init__(self, var_predictor_cell, is_training, dim, prenet_units): - super(VarPredictorCell2, self).__init__() - self._var_predictor_cell = var_predictor_cell - self._is_training = is_training - self._dim = dim - self._prenet_units = prenet_units - - @property - def state_size(self): - return tuple([self.output_size, self._var_predictor_cell.state_size]) - - @property - def output_size(self): - return self._dim - - def zero_state(self, batch_size, dtype): - 
return tuple([ - rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, - dtype), - self._var_predictor_cell.zero_state(batch_size, dtype) - ]) - - def call(self, inputs, state): - '''Run the Tacotron2 super decoder cell.''' - super_cell_out, decoder_state = state - - # split - prenet_input = inputs[:, 0:self._dim] - encoder_output = inputs[:, self._dim:] - - # prenet and concat - prenet_output = prenet( - prenet_input, - self._prenet_units, - self._is_training, - scope='var_prenet') - decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) - - # decoder LSTM/GRU - new_super_cell_out, new_decoder_state = self._var_predictor_cell( - decoder_input, decoder_state) - - # projection - new_super_cell_out = tf.layers.dense( - new_super_cell_out, units=self._dim) - - # split and relu - new_super_cell_out = tf.concat([ - tf.nn.relu(new_super_cell_out[:, 0:1]), new_super_cell_out[:, 1:] - ], axis=-1) # yapf:disable - - new_states = tuple([new_super_cell_out, new_decoder_state]) - - return new_super_cell_out, new_states diff --git a/modelscope/models/audio/tts/models/robutrans.py b/modelscope/models/audio/tts/models/robutrans.py deleted file mode 100755 index ab9fdfcc..00000000 --- a/modelscope/models/audio/tts/models/robutrans.py +++ /dev/null @@ -1,760 +0,0 @@ -import tensorflow as tf -from tensorflow.python.ops.ragged.ragged_util import repeat - -from .fsmn_encoder import FsmnEncoderV2 -from .position import BatchSinusodalPositionalEncoding -from .self_attention_decoder import SelfAttentionDecoder -from .self_attention_encoder import SelfAttentionEncoder - - -class RobuTrans(): - - def __init__(self, hparams): - self._hparams = hparams - - def initialize(self, - inputs, - inputs_emotion, - inputs_speaker, - input_lengths, - output_lengths=None, - mel_targets=None, - durations=None, - pitch_contours=None, - uv_masks=None, - pitch_scales=None, - duration_scales=None, - energy_contours=None, - energy_scales=None): - """Initializes the model for inference. - - Sets "mel_outputs", "linear_outputs", "stop_token_outputs", and "alignments" fields. - - Args: - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of - steps in the input time series, and values are character IDs - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths - of each sequence in inputs. - output_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths - of each sequence in outputs. - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number - of steps in the output time series, M is num_mels, and values are entries in the mel - spectrogram. Only needed for training. 
- """ - from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell - from tensorflow.contrib.seq2seq import BasicDecoder - - with tf.variable_scope('inference') as _: - is_training = mel_targets is not None - batch_size = tf.shape(inputs)[0] - hp = self._hparams - - input_mask = None - if input_lengths is not None and is_training: - input_mask = tf.sequence_mask( - input_lengths, tf.shape(inputs)[1], dtype=tf.float32) - - if input_mask is not None: - inputs = inputs * tf.expand_dims(input_mask, -1) - - # speaker embedding - embedded_inputs_speaker = tf.layers.dense( - inputs_speaker, - 32, - activation=None, - use_bias=False, - kernel_initializer=tf.truncated_normal_initializer(stddev=0.5)) - - # emotion embedding - embedded_inputs_emotion = tf.layers.dense( - inputs_emotion, - 32, - activation=None, - use_bias=False, - kernel_initializer=tf.truncated_normal_initializer(stddev=0.5)) - - # symbol embedding - with tf.variable_scope('Embedding'): - embedded_inputs = tf.layers.dense( - inputs, - hp.embedding_dim, - activation=None, - use_bias=False, - kernel_initializer=tf.truncated_normal_initializer( - stddev=0.5)) - - # Encoder - with tf.variable_scope('Encoder'): - Encoder = SelfAttentionEncoder( - num_layers=hp.encoder_num_layers, - num_units=hp.encoder_num_units, - num_heads=hp.encoder_num_heads, - ffn_inner_dim=hp.encoder_ffn_inner_dim, - dropout=hp.encoder_dropout, - attention_dropout=hp.encoder_attention_dropout, - relu_dropout=hp.encoder_relu_dropout) - encoder_outputs, state_mo, sequence_length_mo, attns = Encoder.encode( - embedded_inputs, - sequence_length=input_lengths, - mode=is_training) - encoder_outputs = tf.layers.dense( - encoder_outputs, - hp.encoder_projection_units, - activation=None, - use_bias=False, - kernel_initializer=tf.truncated_normal_initializer( - stddev=0.5)) - - # pitch and energy - var_inputs = tf.concat([ - encoder_outputs, embedded_inputs_speaker, - embedded_inputs_emotion - ], 2) - if input_mask is not None: - var_inputs = var_inputs * tf.expand_dims(input_mask, -1) - - with tf.variable_scope('Pitch_Predictor'): - Pitch_Predictor_FSMN = FsmnEncoderV2( - filter_size=hp.predictor_filter_size, - fsmn_num_layers=hp.predictor_fsmn_num_layers, - dnn_num_layers=hp.predictor_dnn_num_layers, - num_memory_units=hp.predictor_num_memory_units, - ffn_inner_dim=hp.predictor_ffn_inner_dim, - dropout=hp.predictor_dropout, - shift=hp.predictor_shift, - position_encoder=None) - pitch_contour_outputs, _, _ = Pitch_Predictor_FSMN.encode( - tf.concat([ - encoder_outputs, embedded_inputs_speaker, - embedded_inputs_emotion - ], 2), - sequence_length=input_lengths, - mode=is_training) - pitch_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn( - LSTMBlockCell(hp.predictor_lstm_units), - LSTMBlockCell(hp.predictor_lstm_units), - pitch_contour_outputs, - sequence_length=input_lengths, - dtype=tf.float32) - pitch_contour_outputs = tf.concat( - pitch_contour_outputs, axis=-1) - pitch_contour_outputs = tf.layers.dense( - pitch_contour_outputs, units=1) # [N, T_in, 1] - pitch_contour_outputs = tf.squeeze( - pitch_contour_outputs, axis=2) # [N, T_in] - - with tf.variable_scope('Energy_Predictor'): - Energy_Predictor_FSMN = FsmnEncoderV2( - filter_size=hp.predictor_filter_size, - fsmn_num_layers=hp.predictor_fsmn_num_layers, - dnn_num_layers=hp.predictor_dnn_num_layers, - num_memory_units=hp.predictor_num_memory_units, - ffn_inner_dim=hp.predictor_ffn_inner_dim, - dropout=hp.predictor_dropout, - shift=hp.predictor_shift, - position_encoder=None) - energy_contour_outputs, _, _ = 
Energy_Predictor_FSMN.encode( - tf.concat([ - encoder_outputs, embedded_inputs_speaker, - embedded_inputs_emotion - ], 2), - sequence_length=input_lengths, - mode=is_training) - energy_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn( - LSTMBlockCell(hp.predictor_lstm_units), - LSTMBlockCell(hp.predictor_lstm_units), - energy_contour_outputs, - sequence_length=input_lengths, - dtype=tf.float32) - energy_contour_outputs = tf.concat( - energy_contour_outputs, axis=-1) - energy_contour_outputs = tf.layers.dense( - energy_contour_outputs, units=1) # [N, T_in, 1] - energy_contour_outputs = tf.squeeze( - energy_contour_outputs, axis=2) # [N, T_in] - - if is_training: - pitch_embeddings = tf.expand_dims( - pitch_contours, axis=2) # [N, T_in, 1] - pitch_embeddings = tf.layers.conv1d( - pitch_embeddings, - filters=hp.encoder_projection_units, - kernel_size=9, - padding='same', - name='pitch_embeddings') # [N, T_in, 32] - - energy_embeddings = tf.expand_dims( - energy_contours, axis=2) # [N, T_in, 1] - energy_embeddings = tf.layers.conv1d( - energy_embeddings, - filters=hp.encoder_projection_units, - kernel_size=9, - padding='same', - name='energy_embeddings') # [N, T_in, 32] - else: - pitch_contour_outputs *= pitch_scales - pitch_embeddings = tf.expand_dims( - pitch_contour_outputs, axis=2) # [N, T_in, 1] - pitch_embeddings = tf.layers.conv1d( - pitch_embeddings, - filters=hp.encoder_projection_units, - kernel_size=9, - padding='same', - name='pitch_embeddings') # [N, T_in, 32] - - energy_contour_outputs *= energy_scales - energy_embeddings = tf.expand_dims( - energy_contour_outputs, axis=2) # [N, T_in, 1] - energy_embeddings = tf.layers.conv1d( - energy_embeddings, - filters=hp.encoder_projection_units, - kernel_size=9, - padding='same', - name='energy_embeddings') # [N, T_in, 32] - - encoder_outputs_ = encoder_outputs + pitch_embeddings + energy_embeddings - - # duration - dur_inputs = tf.concat([ - encoder_outputs_, embedded_inputs_speaker, - embedded_inputs_emotion - ], 2) - if input_mask is not None: - dur_inputs = dur_inputs * tf.expand_dims(input_mask, -1) - with tf.variable_scope('Duration_Predictor'): - duration_predictor_cell = MultiRNNCell([ - LSTMBlockCell(hp.predictor_lstm_units), - LSTMBlockCell(hp.predictor_lstm_units) - ], state_is_tuple=True) # yapf:disable - from .rnn_wrappers import DurPredictorCell - duration_output_cell = DurPredictorCell( - duration_predictor_cell, is_training, 1, - hp.predictor_prenet_units) - duration_predictor_init_state = duration_output_cell.zero_state( - batch_size=batch_size, dtype=tf.float32) - if is_training: - from .helpers import VarTrainingHelper - duration_helper = VarTrainingHelper( - tf.expand_dims( - tf.log(tf.cast(durations, tf.float32) + 1), - axis=2), dur_inputs, 1) - else: - from .helpers import VarTestHelper - duration_helper = VarTestHelper(batch_size, dur_inputs, 1) - ( - duration_outputs, _ - ), final_duration_predictor_state, _ = tf.contrib.seq2seq.dynamic_decode( - BasicDecoder(duration_output_cell, duration_helper, - duration_predictor_init_state), - maximum_iterations=1000) - duration_outputs = tf.squeeze( - duration_outputs, axis=2) # [N, T_in] - if input_mask is not None: - duration_outputs = duration_outputs * input_mask - duration_outputs_ = tf.exp(duration_outputs) - 1 - - # Length Regulator - with tf.variable_scope('Length_Regulator'): - if is_training: - i = tf.constant(1) - # position embedding - j = tf.constant(1) - dur_len = tf.shape(durations)[-1] - embedded_position_i = tf.range(1, durations[0, 0] + 1) - - def 
condition_pos(j, e): - return tf.less(j, dur_len) - - def loop_body_pos(j, embedded_position_i): - embedded_position_i = tf.concat([ - embedded_position_i, - tf.range(1, durations[0, j] + 1) - ], axis=0) # yapf:disable - return [j + 1, embedded_position_i] - - j, embedded_position_i = tf.while_loop( - condition_pos, - loop_body_pos, [j, embedded_position_i], - shape_invariants=[ - j.get_shape(), - tf.TensorShape([None]) - ]) - embedded_position = tf.reshape(embedded_position_i, - (1, -1)) - - # others - LR_outputs = repeat( - encoder_outputs_[0:1, :, :], durations[0, :], axis=1) - embedded_outputs_speaker = repeat( - embedded_inputs_speaker[0:1, :, :], - durations[0, :], - axis=1) - embedded_outputs_emotion = repeat( - embedded_inputs_emotion[0:1, :, :], - durations[0, :], - axis=1) - - def condition(i, pos, layer, s, e): - return tf.less(i, tf.shape(mel_targets)[0]) - - def loop_body(i, embedded_position, LR_outputs, - embedded_outputs_speaker, - embedded_outputs_emotion): - # position embedding - jj = tf.constant(1) - embedded_position_i = tf.range(1, durations[i, 0] + 1) - - def condition_pos_i(j, e): - return tf.less(j, dur_len) - - def loop_body_pos_i(j, embedded_position_i): - embedded_position_i = tf.concat([ - embedded_position_i, - tf.range(1, durations[i, j] + 1) - ], axis=0) # yapf:disable - return [j + 1, embedded_position_i] - - jj, embedded_position_i = tf.while_loop( - condition_pos_i, - loop_body_pos_i, [jj, embedded_position_i], - shape_invariants=[ - jj.get_shape(), - tf.TensorShape([None]) - ]) - embedded_position = tf.concat([ - embedded_position, - tf.reshape(embedded_position_i, (1, -1)) - ], 0) - - # others - LR_outputs = tf.concat([ - LR_outputs, - repeat( - encoder_outputs_[i:i + 1, :, :], - durations[i, :], - axis=1) - ], 0) - embedded_outputs_speaker = tf.concat([ - embedded_outputs_speaker, - repeat( - embedded_inputs_speaker[i:i + 1, :, :], - durations[i, :], - axis=1) - ], 0) - embedded_outputs_emotion = tf.concat([ - embedded_outputs_emotion, - repeat( - embedded_inputs_emotion[i:i + 1, :, :], - durations[i, :], - axis=1) - ], 0) - return [ - i + 1, embedded_position, LR_outputs, - embedded_outputs_speaker, embedded_outputs_emotion - ] - - i, embedded_position, LR_outputs, - embedded_outputs_speaker, - embedded_outputs_emotion = tf.while_loop( - condition, - loop_body, [ - i, embedded_position, LR_outputs, - embedded_outputs_speaker, embedded_outputs_emotion - ], - shape_invariants=[ - i.get_shape(), - tf.TensorShape([None, None]), - tf.TensorShape([None, None, None]), - tf.TensorShape([None, None, None]), - tf.TensorShape([None, None, None]) - ], - parallel_iterations=hp.batch_size) - - ori_framenum = tf.shape(mel_targets)[1] - else: - # position - j = tf.constant(1) - dur_len = tf.shape(duration_outputs_)[-1] - embedded_position_i = tf.range( - 1, - tf.cast(tf.round(duration_outputs_)[0, 0], tf.int32) - + 1) - - def condition_pos(j, e): - return tf.less(j, dur_len) - - def loop_body_pos(j, embedded_position_i): - embedded_position_i = tf.concat([ - embedded_position_i, - tf.range( - 1, - tf.cast( - tf.round(duration_outputs_)[0, j], - tf.int32) + 1) - ], axis=0) # yapf:disable - return [j + 1, embedded_position_i] - - j, embedded_position_i = tf.while_loop( - condition_pos, - loop_body_pos, [j, embedded_position_i], - shape_invariants=[ - j.get_shape(), - tf.TensorShape([None]) - ]) - embedded_position = tf.reshape(embedded_position_i, - (1, -1)) - # others - duration_outputs_ *= duration_scales - LR_outputs = repeat( - encoder_outputs_[0:1, :, :], - 
tf.cast(tf.round(duration_outputs_)[0, :], tf.int32), - axis=1) - embedded_outputs_speaker = repeat( - embedded_inputs_speaker[0:1, :, :], - tf.cast(tf.round(duration_outputs_)[0, :], tf.int32), - axis=1) - embedded_outputs_emotion = repeat( - embedded_inputs_emotion[0:1, :, :], - tf.cast(tf.round(duration_outputs_)[0, :], tf.int32), - axis=1) - ori_framenum = tf.shape(LR_outputs)[1] - - left = hp.outputs_per_step - tf.mod( - ori_framenum, hp.outputs_per_step) - LR_outputs = tf.cond( - tf.equal(left, - hp.outputs_per_step), lambda: LR_outputs, - lambda: tf.pad(LR_outputs, [[0, 0], [0, left], [0, 0]], - 'CONSTANT')) - embedded_outputs_speaker = tf.cond( - tf.equal(left, hp.outputs_per_step), - lambda: embedded_outputs_speaker, lambda: tf.pad( - embedded_outputs_speaker, [[0, 0], [0, left], - [0, 0]], 'CONSTANT')) - embedded_outputs_emotion = tf.cond( - tf.equal(left, hp.outputs_per_step), - lambda: embedded_outputs_emotion, lambda: tf.pad( - embedded_outputs_emotion, [[0, 0], [0, left], - [0, 0]], 'CONSTANT')) - embedded_position = tf.cond( - tf.equal(left, hp.outputs_per_step), - lambda: embedded_position, - lambda: tf.pad(embedded_position, [[0, 0], [0, left]], - 'CONSTANT')) - - # Pos_Embedding - with tf.variable_scope('Position_Embedding'): - Pos_Embedding = BatchSinusodalPositionalEncoding() - position_embeddings = Pos_Embedding.positional_encoding( - batch_size, - tf.shape(LR_outputs)[1], hp.encoder_projection_units, - embedded_position) - LR_outputs += position_embeddings - - # multi-frame - LR_outputs = tf.reshape(LR_outputs, [ - batch_size, -1, - hp.outputs_per_step * hp.encoder_projection_units - ]) - embedded_outputs_speaker = tf.reshape( - embedded_outputs_speaker, - [batch_size, -1, hp.outputs_per_step * 32])[:, :, :32] - embedded_outputs_emotion = tf.reshape( - embedded_outputs_emotion, - [batch_size, -1, hp.outputs_per_step * 32])[:, :, :32] - # [N, T_out, D_LR_outputs] (D_LR_outputs = hp.outputs_per_step * hp.encoder_projection_units + 64) - LR_outputs = tf.concat([ - LR_outputs, embedded_outputs_speaker, embedded_outputs_emotion - ], -1) - - # auto bandwidth - if is_training: - durations_mask = tf.cast(durations, - tf.float32) * input_mask # [N, T_in] - else: - durations_mask = duration_outputs_ - X_band_width = tf.cast( - tf.round(tf.reduce_max(durations_mask) / hp.outputs_per_step), - tf.int32) - H_band_width = X_band_width - - with tf.variable_scope('Decoder'): - Decoder = SelfAttentionDecoder( - num_layers=hp.decoder_num_layers, - num_units=hp.decoder_num_units, - num_heads=hp.decoder_num_heads, - ffn_inner_dim=hp.decoder_ffn_inner_dim, - dropout=hp.decoder_dropout, - attention_dropout=hp.decoder_attention_dropout, - relu_dropout=hp.decoder_relu_dropout, - prenet_units=hp.prenet_units, - dense_units=hp.prenet_proj_units, - num_mels=hp.num_mels, - outputs_per_step=hp.outputs_per_step, - X_band_width=X_band_width, - H_band_width=H_band_width, - position_encoder=None) - if is_training: - if hp.free_run: - r = hp.outputs_per_step - init_decoder_input = tf.expand_dims( - tf.tile([[0.0]], [batch_size, hp.num_mels]), - axis=1) # [N, 1, hp.num_mels] - decoder_input_lengths = tf.cast( - output_lengths / r, tf.int32) - decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search( - init_decoder_input, - maximum_iterations=tf.shape(LR_outputs)[1], - mode=is_training, - memory=LR_outputs, - memory_sequence_length=decoder_input_lengths) - else: - r = hp.outputs_per_step - decoder_input = mel_targets[:, r - 1:: - r, :] # [N, T_out / r, hp.num_mels] - 
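# Illustrative sketch (not part of the deleted file): the teacher-forcing branch
# above keeps only every r-th target frame (r = hp.outputs_per_step), so each
# decoder step predicts r mel frames at once. Minimal NumPy check of that
# subsampling; numpy and the toy shapes are assumptions.
import numpy as np

r = 3
mel_targets_np = np.random.rand(2, 12, 80)            # [N, T_out, num_mels]
decoder_input_np = mel_targets_np[:, r - 1::r, :]     # frames 2, 5, 8, 11
assert decoder_input_np.shape == (2, 12 // r, 80)     # [N, T_out / r, num_mels]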
init_decoder_input = tf.expand_dims( - tf.tile([[0.0]], [batch_size, hp.num_mels]), - axis=1) # [N, 1, hp.num_mels] - decoder_input = tf.concat( - [init_decoder_input, decoder_input], - axis=1) # [N, T_out / r + 1, hp.num_mels] - decoder_input = decoder_input[:, : - -1, :] # [N, T_out / r, hp.num_mels] - decoder_input_lengths = tf.cast( - output_lengths / r, tf.int32) - decoder_outputs, attention_x, attention_h = Decoder.decode_from_inputs( - decoder_input, - decoder_input_lengths, - mode=is_training, - memory=LR_outputs, - memory_sequence_length=decoder_input_lengths) - else: - init_decoder_input = tf.expand_dims( - tf.tile([[0.0]], [batch_size, hp.num_mels]), - axis=1) # [N, 1, hp.num_mels] - decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search( - init_decoder_input, - maximum_iterations=tf.shape(LR_outputs)[1], - mode=is_training, - memory=LR_outputs, - memory_sequence_length=tf.expand_dims( - tf.shape(LR_outputs)[1], axis=0)) - - if is_training: - mel_outputs_ = tf.reshape(decoder_outputs, - [batch_size, -1, hp.num_mels]) - else: - mel_outputs_ = tf.reshape( - decoder_outputs, - [batch_size, -1, hp.num_mels])[:, :ori_framenum, :] - mel_outputs = mel_outputs_ - - with tf.variable_scope('Postnet'): - Postnet_FSMN = FsmnEncoderV2( - filter_size=hp.postnet_filter_size, - fsmn_num_layers=hp.postnet_fsmn_num_layers, - dnn_num_layers=hp.postnet_dnn_num_layers, - num_memory_units=hp.postnet_num_memory_units, - ffn_inner_dim=hp.postnet_ffn_inner_dim, - dropout=hp.postnet_dropout, - shift=hp.postnet_shift, - position_encoder=None) - if is_training: - postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode( - mel_outputs, - sequence_length=output_lengths, - mode=is_training) - hidden_lstm_outputs, _ = tf.nn.dynamic_rnn( - LSTMBlockCell(hp.postnet_lstm_units), - postnet_fsmn_outputs, - sequence_length=output_lengths, - dtype=tf.float32) - else: - postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode( - mel_outputs, - sequence_length=[tf.shape(mel_outputs_)[1]], - mode=is_training) - hidden_lstm_outputs, _ = tf.nn.dynamic_rnn( - LSTMBlockCell(hp.postnet_lstm_units), - postnet_fsmn_outputs, - sequence_length=[tf.shape(mel_outputs_)[1]], - dtype=tf.float32) - - mel_residual_outputs = tf.layers.dense( - hidden_lstm_outputs, units=hp.num_mels) - mel_outputs += mel_residual_outputs - - self.inputs = inputs - self.inputs_speaker = inputs_speaker - self.inputs_emotion = inputs_emotion - self.input_lengths = input_lengths - self.durations = durations - self.output_lengths = output_lengths - self.mel_outputs_ = mel_outputs_ - self.mel_outputs = mel_outputs - self.mel_targets = mel_targets - self.duration_outputs = duration_outputs - self.duration_outputs_ = duration_outputs_ - self.duration_scales = duration_scales - self.pitch_contour_outputs = pitch_contour_outputs - self.pitch_contours = pitch_contours - self.pitch_scales = pitch_scales - self.energy_contour_outputs = energy_contour_outputs - self.energy_contours = energy_contours - self.energy_scales = energy_scales - self.uv_masks_ = uv_masks - - self.embedded_inputs_emotion = embedded_inputs_emotion - self.embedding_fsmn_outputs = embedded_inputs - self.encoder_outputs = encoder_outputs - self.encoder_outputs_ = encoder_outputs_ - self.LR_outputs = LR_outputs - self.postnet_fsmn_outputs = postnet_fsmn_outputs - - self.pitch_embeddings = pitch_embeddings - self.energy_embeddings = energy_embeddings - - self.attns = attns - self.attention_x = attention_x - self.attention_h = attention_h - self.X_band_width = X_band_width - self.H_band_width 
= H_band_width - - def add_loss(self): - '''Adds loss to the model. Sets "loss" field. initialize must have been called.''' - with tf.variable_scope('loss') as _: - hp = self._hparams - mask = tf.sequence_mask( - self.output_lengths, - tf.shape(self.mel_targets)[1], - dtype=tf.float32) - valid_outputs = tf.reduce_sum(mask) - - mask_input = tf.sequence_mask( - self.input_lengths, - tf.shape(self.durations)[1], - dtype=tf.float32) - valid_inputs = tf.reduce_sum(mask_input) - - # mel loss - if self.uv_masks_ is not None: - valid_outputs_mask = tf.reduce_sum( - tf.expand_dims(mask, -1) * self.uv_masks_) - self.mel_loss_ = tf.reduce_sum( - tf.abs(self.mel_targets - self.mel_outputs_) - * tf.expand_dims(mask, -1) * self.uv_masks_) / ( - valid_outputs_mask * hp.num_mels) - self.mel_loss = tf.reduce_sum( - tf.abs(self.mel_targets - self.mel_outputs) - * tf.expand_dims(mask, -1) * self.uv_masks_) / ( - valid_outputs_mask * hp.num_mels) - else: - self.mel_loss_ = tf.reduce_sum( - tf.abs(self.mel_targets - self.mel_outputs_) - * tf.expand_dims(mask, -1)) / ( - valid_outputs * hp.num_mels) - self.mel_loss = tf.reduce_sum( - tf.abs(self.mel_targets - self.mel_outputs) - * tf.expand_dims(mask, -1)) / ( - valid_outputs * hp.num_mels) - - # duration loss - self.duration_loss = tf.reduce_sum( - tf.abs( - tf.log(tf.cast(self.durations, tf.float32) + 1) - - self.duration_outputs) * mask_input) / valid_inputs - - # pitch contour loss - self.pitch_contour_loss = tf.reduce_sum( - tf.abs(self.pitch_contours - self.pitch_contour_outputs) - * mask_input) / valid_inputs - - # energy contour loss - self.energy_contour_loss = tf.reduce_sum( - tf.abs(self.energy_contours - self.energy_contour_outputs) - * mask_input) / valid_inputs - - # final loss - self.loss = self.mel_loss_ + self.mel_loss + self.duration_loss \ - + self.pitch_contour_loss + self.energy_contour_loss - - # guided attention loss - self.guided_attention_loss = tf.constant(0.0) - if hp.guided_attention: - i0 = tf.constant(0) - loss0 = tf.constant(0.0) - - def c(i, _): - return tf.less(i, tf.shape(mel_targets)[0]) - - def loop_body(i, loss): - decoder_input_lengths = tf.cast( - self.output_lengths / hp.outputs_per_step, tf.int32) - input_len = decoder_input_lengths[i] - output_len = decoder_input_lengths[i] - input_w = tf.expand_dims( - tf.range(tf.cast(input_len, dtype=tf.float32)), - axis=1) / tf.cast( - input_len, dtype=tf.float32) # [T_in, 1] - output_w = tf.expand_dims( - tf.range(tf.cast(output_len, dtype=tf.float32)), - axis=0) / tf.cast( - output_len, dtype=tf.float32) # [1, T_out] - guided_attention_w = 1.0 - tf.exp( - -(1 / hp.guided_attention_2g_squared) - * tf.square(input_w - output_w)) # [T_in, T_out] - guided_attention_w = tf.expand_dims( - guided_attention_w, axis=0) # [1, T_in, T_out] - # [hp.decoder_num_heads, T_in, T_out] - guided_attention_w = tf.tile(guided_attention_w, - [hp.decoder_num_heads, 1, 1]) - loss_i = tf.constant(0.0) - for j in range(hp.decoder_num_layers): - loss_i += tf.reduce_mean( - self.attention_h[j][i, :, :input_len, :output_len] - * guided_attention_w) - - return [tf.add(i, 1), tf.add(loss, loss_i)] - - _, loss = tf.while_loop( - c, - loop_body, - loop_vars=[i0, loss0], - parallel_iterations=hp.batch_size) - self.guided_attention_loss = loss / hp.batch_size - self.loss += hp.guided_attention_loss_weight * self.guided_attention_loss - - def add_optimizer(self, global_step): - '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called. 
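# Illustrative sketch (not part of the deleted file): the guided-attention term
# above builds a soft diagonal penalty W[i, j] = 1 - exp(-(i/T_in - j/T_out)^2 / g2)
# and averages it against the decoder attention maps, pushing the alignment toward
# the diagonal. NumPy version of the weight matrix; numpy and the toy values of
# T_in, T_out and g2 are assumptions.
import numpy as np

T_in, T_out, g2 = 5, 8, 0.4                                      # g2 ~ hp.guided_attention_2g_squared
input_w = np.arange(T_in, dtype=np.float32)[:, None] / T_in      # [T_in, 1]
output_w = np.arange(T_out, dtype=np.float32)[None, :] / T_out   # [1, T_out]
guided_w = 1.0 - np.exp(-(1.0 / g2) * np.square(input_w - output_w))  # [T_in, T_out]
# guided_w is close to 0 near the diagonal i/T_in == j/T_out and grows toward 1 off it.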
- - Args: - global_step: int32 scalar Tensor representing current global step in training - ''' - with tf.variable_scope('optimizer') as _: - hp = self._hparams - if hp.decay_learning_rate: - self.learning_rate = _learning_rate_decay( - hp.initial_learning_rate, global_step) - else: - self.learning_rate = tf.convert_to_tensor( - hp.initial_learning_rate) - optimizer = tf.train.AdamOptimizer(self.learning_rate, - hp.adam_beta1, hp.adam_beta2) - gradients, variables = zip(*optimizer.compute_gradients(self.loss)) - self.gradients = gradients - clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0) - - # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See: - # https://github.com/tensorflow/tensorflow/issues/1122 - with tf.control_dependencies( - tf.get_collection(tf.GraphKeys.UPDATE_OPS)): - self.optimize = optimizer.apply_gradients( - zip(clipped_gradients, variables), global_step=global_step) - - -def _learning_rate_decay(init_lr, global_step): - # Noam scheme from tensor2tensor: - warmup_steps = 4000.0 - step = tf.cast(global_step + 1, dtype=tf.float32) - return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5, - step**-0.5) diff --git a/modelscope/models/audio/tts/models/self_attention_decoder.py b/modelscope/models/audio/tts/models/self_attention_decoder.py deleted file mode 100755 index 9cf3fcaa..00000000 --- a/modelscope/models/audio/tts/models/self_attention_decoder.py +++ /dev/null @@ -1,817 +0,0 @@ -"""Define self-attention decoder.""" - -import sys - -import tensorflow as tf - -from . import compat, transformer -from .am_models import decoder_prenet -from .position import SinusoidalPositionEncoder - - -class SelfAttentionDecoder(): - """Decoder using self-attention as described in - https://arxiv.org/abs/1706.03762. - """ - - def __init__(self, - num_layers, - num_units=512, - num_heads=8, - ffn_inner_dim=2048, - dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - prenet_units=256, - dense_units=128, - num_mels=80, - outputs_per_step=3, - X_band_width=None, - H_band_width=None, - position_encoder=SinusoidalPositionEncoder(), - self_attention_type='scaled_dot'): - """Initializes the parameters of the decoder. - - Args: - num_layers: The number of layers. - num_units: The number of hidden units. - num_heads: The number of heads in the multi-head attention. - ffn_inner_dim: The number of units of the inner linear transformation - in the feed forward layer. - dropout: The probability to drop units from the outputs. - attention_dropout: The probability to drop units from the attention. - relu_dropout: The probability to drop units from the ReLU activation in - the feed forward layer. - position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to - apply on inputs or ``None``. - self_attention_type: Type of self attention, "scaled_dot" or "average" (case - insensitive). - - Raises: - ValueError: if :obj:`self_attention_type` is invalid. 
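# Illustrative sketch (not part of the deleted file): _learning_rate_decay above is
# the Noam schedule -- roughly linear warm-up for 4000 steps, then inverse-sqrt
# decay. Plain-Python version for a quick sanity check; the init_lr value used
# here is an assumption.
def noam_lr(init_lr, global_step, warmup_steps=4000.0):
    step = float(global_step + 1)
    return init_lr * warmup_steps ** 0.5 * min(step * warmup_steps ** -1.5,
                                               step ** -0.5)

assert noam_lr(1e-3, 100) < noam_lr(1e-3, 3999)     # still warming up: lr rising
assert noam_lr(1e-3, 20000) < noam_lr(1e-3, 3999)   # past warm-up: lr decaying as 1/sqrt(step)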
- """ - super(SelfAttentionDecoder, self).__init__() - self.num_layers = num_layers - self.num_units = num_units - self.num_heads = num_heads - self.ffn_inner_dim = ffn_inner_dim - self.dropout = dropout - self.attention_dropout = attention_dropout - self.relu_dropout = relu_dropout - self.position_encoder = position_encoder - self.self_attention_type = self_attention_type.lower() - if self.self_attention_type not in ('scaled_dot', 'average'): - raise ValueError('invalid attention type %s' - % self.self_attention_type) - if self.self_attention_type == 'average': - tf.logging.warning( - 'Support for average attention network is experimental ' - 'and may change in future versions.') - self.prenet_units = prenet_units - self.dense_units = dense_units - self.num_mels = num_mels - self.outputs_per_step = outputs_per_step - self.X_band_width = X_band_width - self.H_band_width = H_band_width - - @property - def output_size(self): - """Returns the decoder output size.""" - return self.num_units - - @property - def support_alignment_history(self): - return True - - @property - def support_multi_source(self): - return True - - def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1): - cache = {} - - for layer in range(self.num_layers): - proj_cache_shape = [ - batch_size, self.num_heads, 0, self.num_units // self.num_heads - ] - layer_cache = {} - layer_cache['memory'] = [{ - 'memory_keys': - tf.zeros(proj_cache_shape, dtype=dtype), - 'memory_values': - tf.zeros(proj_cache_shape, dtype=dtype) - } for _ in range(num_sources)] - if self.self_attention_type == 'scaled_dot': - layer_cache['self_keys'] = tf.zeros( - proj_cache_shape, dtype=dtype) - layer_cache['self_values'] = tf.zeros( - proj_cache_shape, dtype=dtype) - elif self.self_attention_type == 'average': - layer_cache['prev_g'] = tf.zeros( - [batch_size, 1, self.num_units], dtype=dtype) - cache['layer_{}'.format(layer)] = layer_cache - - return cache - - def _init_attn(self, dtype=tf.float32): - attn = [] - for layer in range(self.num_layers): - attn.append(tf.TensorArray(tf.float32, size=0, dynamic_size=True)) - return attn - - def _self_attention_stack(self, - inputs, - sequence_length=None, - mode=True, - cache=None, - memory=None, - memory_sequence_length=None, - step=None): - - # [N, T_out, self.dense_units] or [N, 1, self.dense_units] - prenet_outputs = decoder_prenet(inputs, self.prenet_units, - self.dense_units, mode) - if step is None: - decoder_inputs = tf.concat( - [memory, prenet_outputs], - axis=-1) # [N, T_out, memory_size + self.dense_units] - else: - decoder_inputs = tf.concat( - [memory[:, step:step + 1, :], prenet_outputs], - axis=-1) # [N, 1, memory_size + self.dense_units] - decoder_inputs = tf.layers.dense( - decoder_inputs, units=self.dense_units) - - inputs = decoder_inputs - inputs *= self.num_units**0.5 - if self.position_encoder is not None: - inputs = self.position_encoder( - inputs, position=step + 1 if step is not None else None) - - inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) - - decoder_mask = None - memory_mask = None - # last_attention = None - - X_band_width_tmp = -1 - H_band_width_tmp = -1 - if self.X_band_width is not None: - X_band_width_tmp = tf.cast( - tf.cond( - tf.less(tf.shape(memory)[1], self.X_band_width), - lambda: -1, lambda: self.X_band_width), - dtype=tf.int64) - if self.H_band_width is not None: - H_band_width_tmp = tf.cast( - tf.cond( - tf.less(tf.shape(memory)[1], self.H_band_width), - lambda: -1, lambda: self.H_band_width), - dtype=tf.int64) - - if 
self.self_attention_type == 'scaled_dot': - if sequence_length is not None: - decoder_mask = transformer.build_future_mask( - sequence_length, - num_heads=self.num_heads, - maximum_length=tf.shape(inputs)[1], - band=X_band_width_tmp) # [N, 1, T_out, T_out] - elif self.self_attention_type == 'average': - if cache is None: - if sequence_length is None: - sequence_length = tf.fill([tf.shape(inputs)[0]], - tf.shape(inputs)[1]) - decoder_mask = transformer.cumulative_average_mask( - sequence_length, - maximum_length=tf.shape(inputs)[1], - dtype=inputs.dtype) - - if memory is not None and not tf.contrib.framework.nest.is_sequence( - memory): - memory = (memory, ) - if memory_sequence_length is not None: - if not tf.contrib.framework.nest.is_sequence( - memory_sequence_length): - memory_sequence_length = (memory_sequence_length, ) - if step is None: - memory_mask = [ - transformer.build_history_mask( - length, - num_heads=self.num_heads, - maximum_length=tf.shape(m)[1], - band=H_band_width_tmp) - for m, length in zip(memory, memory_sequence_length) - ] - else: - memory_mask = [ - transformer.build_history_mask( - length, - num_heads=self.num_heads, - maximum_length=tf.shape(m)[1], - band=H_band_width_tmp)[:, :, step:step + 1, :] - for m, length in zip(memory, memory_sequence_length) - ] - - # last_attention = None - attns_x = [] - attns_h = [] - for layer in range(self.num_layers): - layer_name = 'layer_{}'.format(layer) - layer_cache = cache[layer_name] if cache is not None else None - with tf.variable_scope(layer_name): - if memory is not None: - for i, (mem, mask) in enumerate(zip(memory, memory_mask)): - memory_cache = None - if layer_cache is not None: - memory_cache = layer_cache['memory'][i] - scope_name = 'multi_head_{}'.format(i) - if i == 0: - scope_name = 'multi_head' - with tf.variable_scope(scope_name): - encoded, attn_x, attn_h = transformer.multi_head_attention_PNCA( - self.num_heads, - transformer.norm(inputs), - mem, - mode, - num_units=self.num_units, - mask=decoder_mask, - mask_h=mask, - cache=layer_cache, - cache_h=memory_cache, - dropout=self.attention_dropout, - return_attention=True, - layer_name=layer_name, - X_band_width=self.X_band_width) - attns_x.append(attn_x) - attns_h.append(attn_h) - context = transformer.drop_and_add( - inputs, encoded, mode, dropout=self.dropout) - - with tf.variable_scope('ffn'): - transformed = transformer.feed_forward_ori( - transformer.norm(context), - self.ffn_inner_dim, - mode, - dropout=self.relu_dropout) - transformed = transformer.drop_and_add( - context, transformed, mode, dropout=self.dropout) - - inputs = transformed - - outputs = transformer.norm(inputs) - outputs = tf.layers.dense( - outputs, units=self.num_mels * self.outputs_per_step) - return outputs, attns_x, attns_h - - def decode_from_inputs(self, - inputs, - sequence_length, - initial_state=None, - mode=True, - memory=None, - memory_sequence_length=None): - outputs, attention_x, attention_h = self._self_attention_stack( - inputs, - sequence_length=sequence_length, - mode=mode, - memory=memory, - memory_sequence_length=memory_sequence_length) - return outputs, attention_x, attention_h - - def step_fn(self, - mode, - batch_size, - initial_state=None, - memory=None, - memory_sequence_length=None, - dtype=tf.float32): - if memory is None: - num_sources = 0 - elif tf.contrib.framework.nest.is_sequence(memory): - num_sources = len(memory) - else: - num_sources = 1 - cache = self._init_cache( - batch_size, dtype=dtype, num_sources=num_sources) - attention_x = 
self._init_attn(dtype=dtype) - attention_h = self._init_attn(dtype=dtype) - - def _fn(step, inputs, cache): - outputs, attention_x, attention_h = self._self_attention_stack( - inputs, - mode=mode, - cache=cache, - memory=memory, - memory_sequence_length=memory_sequence_length, - step=step) - attention_x_tmp = [] - for layer in range(len(attention_h)): - attention_x_tmp_l = tf.zeros_like(attention_h[layer]) - if self.X_band_width is not None: - pred = tf.less(step, self.X_band_width + 1) - attention_x_tmp_l_1 = tf.cond(pred, # yapf:disable - lambda: attention_x_tmp_l[:, :, :, :step + 1] + attention_x[layer], - lambda: tf.concat([ - attention_x_tmp_l[:, :, :, - :step - self.X_band_width], - attention_x_tmp_l[:, :, :, - step - self.X_band_width:step + 1] - + attention_x[layer]], - axis=-1)) # yapf:disable - attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:] - attention_x_tmp.append( - tf.concat([attention_x_tmp_l_1, attention_x_tmp_l_2], - axis=-1)) - else: - attention_x_tmp_l_1 = attention_x_tmp_l[:, :, :, :step + 1] - attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:] - attention_x_tmp.append( - tf.concat([ - attention_x_tmp_l_1 + attention_x[layer], - attention_x_tmp_l_2 - ], axis=-1)) # yapf:disable - attention_x = attention_x_tmp - return outputs, cache, attention_x, attention_h - - return _fn, cache, attention_x, attention_h - - def dynamic_decode_and_search(self, init_decoder_input, maximum_iterations, - mode, memory, memory_sequence_length): - batch_size = tf.shape(init_decoder_input)[0] - step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn( - mode, - batch_size, - memory=memory, - memory_sequence_length=memory_sequence_length) - - outputs, attention_x, attention_h, cache = self.dynamic_decode( - step_fn, - init_decoder_input, - init_cache=init_cache, - init_attn_x=init_attn_x, - init_attn_h=init_attn_h, - maximum_iterations=maximum_iterations, - batch_size=batch_size) - return outputs, attention_x, attention_h - - def dynamic_decode_and_search_teacher_forcing(self, decoder_input, - maximum_iterations, mode, - memory, - memory_sequence_length): - batch_size = tf.shape(decoder_input)[0] - step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn( - mode, - batch_size, - memory=memory, - memory_sequence_length=memory_sequence_length) - - outputs, attention_x, attention_h, cache = self.dynamic_decode_teacher_forcing( - step_fn, - decoder_input, - init_cache=init_cache, - init_attn_x=init_attn_x, - init_attn_h=init_attn_h, - maximum_iterations=maximum_iterations, - batch_size=batch_size) - return outputs, attention_x, attention_h - - def dynamic_decode(self, - step_fn, - init_decoder_input, - init_cache=None, - init_attn_x=None, - init_attn_h=None, - maximum_iterations=None, - batch_size=None): - - def _cond(step, cache, inputs, outputs, attention_x, attention_h): # pylint: disable=unused-argument - return tf.less(step, maximum_iterations) - - def _body(step, cache, inputs, outputs, attention_x, attention_h): - # output: [1, 1, num_mels * r] - # attn: [1, 1, T_out] - output, cache, attn_x, attn_h = step_fn( - step, inputs, cache) # outputs, cache, attention, attns - for layer in range(len(attention_x)): - attention_x[layer] = attention_x[layer].write( - step, tf.cast(attn_x[layer], tf.float32)) - - for layer in range(len(attention_h)): - attention_h[layer] = attention_h[layer].write( - step, tf.cast(attn_h[layer], tf.float32)) - - outputs = outputs.write(step, tf.cast(output, tf.float32)) - return step + 1, cache, output[:, :, -self. 
- num_mels:], outputs, attention_x, attention_h - - step = tf.constant(0, dtype=tf.int32) - outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True) - - _, cache, _, outputs, attention_x, attention_h = tf.while_loop( - _cond, - _body, - loop_vars=(step, init_cache, init_decoder_input, outputs, - init_attn_x, init_attn_h), - shape_invariants=(step.shape, - compat.nest.map_structure( - self._get_shape_invariants, init_cache), - compat.nest.map_structure( - self._get_shape_invariants, - init_decoder_input), tf.TensorShape(None), - compat.nest.map_structure( - self._get_shape_invariants, init_attn_x), - compat.nest.map_structure( - self._get_shape_invariants, init_attn_h)), - parallel_iterations=1, - back_prop=False, - maximum_iterations=maximum_iterations) - # element of outputs: [N, 1, num_mels * r] - outputs_stack = outputs.stack() # [T_out, N, 1, num_mels * r] - outputs_stack = tf.transpose( - outputs_stack, perm=[2, 1, 0, 3]) # [1, N, T_out, num_mels * r] - outputs_stack = tf.squeeze( - outputs_stack, axis=0) # [N, T_out, num_mels * r] - - attention_x_stack = [] - for layer in range(len(attention_x)): - attention_x_stack_tmp = attention_x[layer].stack( - ) # [T_out, N, H, 1, T_out] - attention_x_stack_tmp = tf.transpose( - attention_x_stack_tmp, perm=[3, 1, 2, 0, - 4]) # [1, N, H, T_out, T_out] - attention_x_stack_tmp = tf.squeeze( - attention_x_stack_tmp, axis=0) # [N, H, T_out, T_out] - attention_x_stack.append(attention_x_stack_tmp) - - attention_h_stack = [] - for layer in range(len(attention_h)): - attention_h_stack_tmp = attention_h[layer].stack( - ) # [T_out, N, H, 1, T_out] - attention_h_stack_tmp = tf.transpose( - attention_h_stack_tmp, perm=[3, 1, 2, 0, - 4]) # [1, N, H, T_out, T_out] - attention_h_stack_tmp = tf.squeeze( - attention_h_stack_tmp, axis=0) # [N, H, T_out, T_out] - attention_h_stack.append(attention_h_stack_tmp) - - return outputs_stack, attention_x_stack, attention_h_stack, cache - - def dynamic_decode_teacher_forcing(self, - step_fn, - decoder_input, - init_cache=None, - init_attn_x=None, - init_attn_h=None, - maximum_iterations=None, - batch_size=None): - - def _cond(step, cache, inputs, outputs, attention_x, attention_h): # pylint: disable=unused-argument - return tf.less(step, maximum_iterations) - - def _body(step, cache, inputs, outputs, attention_x, attention_h): - # output: [1, 1, num_mels * r] - # attn: [1, 1, T_out] - output, cache, attn_x, attn_h = step_fn( - step, inputs[:, step:step + 1, :], - cache) # outputs, cache, attention, attns - for layer in range(len(attention_x)): - attention_x[layer] = attention_x[layer].write( - step, tf.cast(attn_x[layer], tf.float32)) - - for layer in range(len(attention_h)): - attention_h[layer] = attention_h[layer].write( - step, tf.cast(attn_h[layer], tf.float32)) - outputs = outputs.write(step, tf.cast(output, tf.float32)) - return step + 1, cache, inputs, outputs, attention_x, attention_h - - step = tf.constant(0, dtype=tf.int32) - outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True) - - _, cache, _, outputs, attention_x, attention_h = tf.while_loop( - _cond, - _body, - loop_vars=(step, init_cache, decoder_input, outputs, init_attn_x, - init_attn_h), - shape_invariants=(step.shape, - compat.nest.map_structure( - self._get_shape_invariants, - init_cache), decoder_input.shape, - tf.TensorShape(None), - compat.nest.map_structure( - self._get_shape_invariants, init_attn_x), - compat.nest.map_structure( - self._get_shape_invariants, init_attn_h)), - parallel_iterations=1, - back_prop=False, - 
maximum_iterations=maximum_iterations) - # element of outputs: [N, 1, num_mels * r] - outputs_stack = outputs.stack() # [T_out, N, 1, num_mels * r] - outputs_stack = tf.transpose( - outputs_stack, perm=[2, 1, 0, 3]) # [1, N, T_out, num_mels * r] - outputs_stack = tf.squeeze( - outputs_stack, axis=0) # [N, T_out, num_mels * r] - - attention_x_stack = [] - for layer in range(len(attention_x)): - attention_x_stack_tmp = attention_x[layer].stack( - ) # [T_out, N, H, 1, T_out] - attention_x_stack_tmp = tf.transpose( - attention_x_stack_tmp, perm=[3, 1, 2, 0, - 4]) # [1, N, H, T_out, T_out] - attention_x_stack_tmp = tf.squeeze( - attention_x_stack_tmp, axis=0) # [N, H, T_out, T_out] - attention_x_stack.append(attention_x_stack_tmp) - - attention_h_stack = [] - for layer in range(len(attention_h)): - attention_h_stack_tmp = attention_h[layer].stack( - ) # [T_out, N, H, 1, T_out] - attention_h_stack_tmp = tf.transpose( - attention_h_stack_tmp, perm=[3, 1, 2, 0, - 4]) # [1, N, H, T_out, T_out] - attention_h_stack_tmp = tf.squeeze( - attention_h_stack_tmp, axis=0) # [N, H, T_out, T_out] - attention_h_stack.append(attention_h_stack_tmp) - - return outputs_stack, attention_x_stack, attention_h_stack, cache - - def _get_shape_invariants(self, tensor): - """Returns the shape of the tensor but sets middle dims to None.""" - if isinstance(tensor, tf.TensorArray): - shape = None - else: - shape = tensor.shape.as_list() - for i in range(1, len(shape) - 1): - shape[i] = None - return tf.TensorShape(shape) - - -class SelfAttentionDecoderOri(): - """Decoder using self-attention as described in - https://arxiv.org/abs/1706.03762. - """ - - def __init__(self, - num_layers, - num_units=512, - num_heads=8, - ffn_inner_dim=2048, - dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - position_encoder=SinusoidalPositionEncoder(), - self_attention_type='scaled_dot'): - """Initializes the parameters of the decoder. - - Args: - num_layers: The number of layers. - num_units: The number of hidden units. - num_heads: The number of heads in the multi-head attention. - ffn_inner_dim: The number of units of the inner linear transformation - in the feed forward layer. - dropout: The probability to drop units from the outputs. - attention_dropout: The probability to drop units from the attention. - relu_dropout: The probability to drop units from the ReLU activation in - the feed forward layer. - position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to - apply on inputs or ``None``. - self_attention_type: Type of self attention, "scaled_dot" or "average" (case - insensitive). - - Raises: - ValueError: if :obj:`self_attention_type` is invalid. 
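# Illustrative sketch (not part of the deleted file): the dynamic_decode helpers
# above collect one [N, 1, num_mels * r] frame per step in a TensorArray, then
# stack and reorder the result into [N, T_out, num_mels * r]. The same reshuffle
# in NumPy; numpy and the toy sizes are assumptions.
import numpy as np

T_out, N, D = 7, 2, 80 * 3                        # D = num_mels * outputs_per_step
stacked = np.zeros((T_out, N, 1, D))              # [T_out, N, 1, D] after .stack()
reordered = np.transpose(stacked, (2, 1, 0, 3))   # [1, N, T_out, D]
final = np.squeeze(reordered, axis=0)             # [N, T_out, D]
assert final.shape == (N, T_out, D)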
- """ - super(SelfAttentionDecoderOri, self).__init__() - self.num_layers = num_layers - self.num_units = num_units - self.num_heads = num_heads - self.ffn_inner_dim = ffn_inner_dim - self.dropout = dropout - self.attention_dropout = attention_dropout - self.relu_dropout = relu_dropout - self.position_encoder = position_encoder - self.self_attention_type = self_attention_type.lower() - if self.self_attention_type not in ('scaled_dot', 'average'): - raise ValueError('invalid attention type %s' - % self.self_attention_type) - if self.self_attention_type == 'average': - tf.logging.warning( - 'Support for average attention network is experimental ' - 'and may change in future versions.') - - @property - def output_size(self): - """Returns the decoder output size.""" - return self.num_units - - @property - def support_alignment_history(self): - return True - - @property - def support_multi_source(self): - return True - - def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1): - cache = {} - - for layer in range(self.num_layers): - proj_cache_shape = [ - batch_size, self.num_heads, 0, self.num_units // self.num_heads - ] - layer_cache = {} - layer_cache['memory'] = [{ - 'memory_keys': - tf.zeros(proj_cache_shape, dtype=dtype), - 'memory_values': - tf.zeros(proj_cache_shape, dtype=dtype) - } for _ in range(num_sources)] - if self.self_attention_type == 'scaled_dot': - layer_cache['self_keys'] = tf.zeros( - proj_cache_shape, dtype=dtype) - layer_cache['self_values'] = tf.zeros( - proj_cache_shape, dtype=dtype) - elif self.self_attention_type == 'average': - layer_cache['prev_g'] = tf.zeros( - [batch_size, 1, self.num_units], dtype=dtype) - cache['layer_{}'.format(layer)] = layer_cache - - return cache - - def _self_attention_stack(self, - inputs, - sequence_length=None, - mode=True, - cache=None, - memory=None, - memory_sequence_length=None, - step=None): - inputs *= self.num_units**0.5 - if self.position_encoder is not None: - inputs = self.position_encoder( - inputs, position=step + 1 if step is not None else None) - - inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) - - decoder_mask = None - memory_mask = None - last_attention = None - - if self.self_attention_type == 'scaled_dot': - if sequence_length is not None: - decoder_mask = transformer.build_future_mask( - sequence_length, - num_heads=self.num_heads, - maximum_length=tf.shape(inputs)[1]) - elif self.self_attention_type == 'average': - if cache is None: - if sequence_length is None: - sequence_length = tf.fill([tf.shape(inputs)[0]], - tf.shape(inputs)[1]) - decoder_mask = transformer.cumulative_average_mask( - sequence_length, - maximum_length=tf.shape(inputs)[1], - dtype=inputs.dtype) - - if memory is not None and not tf.contrib.framework.nest.is_sequence( - memory): - memory = (memory, ) - if memory_sequence_length is not None: - if not tf.contrib.framework.nest.is_sequence( - memory_sequence_length): - memory_sequence_length = (memory_sequence_length, ) - memory_mask = [ - transformer.build_sequence_mask( - length, - num_heads=self.num_heads, - maximum_length=tf.shape(m)[1]) - for m, length in zip(memory, memory_sequence_length) - ] - - for layer in range(self.num_layers): - layer_name = 'layer_{}'.format(layer) - layer_cache = cache[layer_name] if cache is not None else None - with tf.variable_scope(layer_name): - if self.self_attention_type == 'scaled_dot': - with tf.variable_scope('masked_multi_head'): - encoded = transformer.multi_head_attention( - self.num_heads, - transformer.norm(inputs), - None, 
- mode, - num_units=self.num_units, - mask=decoder_mask, - cache=layer_cache, - dropout=self.attention_dropout) - last_context = transformer.drop_and_add( - inputs, encoded, mode, dropout=self.dropout) - elif self.self_attention_type == 'average': - with tf.variable_scope('average_attention'): - # Cumulative average. - x = transformer.norm(inputs) - y = transformer.cumulative_average( - x, - decoder_mask if cache is None else step, - cache=layer_cache) - # FFN. - y = transformer.feed_forward( - y, - self.ffn_inner_dim, - mode, - dropout=self.relu_dropout) - # Gating layer. - z = tf.layers.dense( - tf.concat([x, y], -1), self.num_units * 2) - i, f = tf.split(z, 2, axis=-1) - y = tf.sigmoid(i) * x + tf.sigmoid(f) * y - last_context = transformer.drop_and_add( - inputs, y, mode, dropout=self.dropout) - - if memory is not None: - for i, (mem, mask) in enumerate(zip(memory, memory_mask)): - memory_cache = layer_cache['memory'][i] if layer_cache is not None else None # yapf:disable - with tf.variable_scope('multi_head' if i - == 0 else 'multi_head_%d' % i): # yapf:disable - context, last_attention = transformer.multi_head_attention( - self.num_heads, - transformer.norm(last_context), - mem, - mode, - mask=mask, - cache=memory_cache, - dropout=self.attention_dropout, - return_attention=True) - last_context = transformer.drop_and_add( - last_context, - context, - mode, - dropout=self.dropout) - if i > 0: # Do not return attention in case of multi source. - last_attention = None - - with tf.variable_scope('ffn'): - transformed = transformer.feed_forward_ori( - transformer.norm(last_context), - self.ffn_inner_dim, - mode, - dropout=self.relu_dropout) - transformed = transformer.drop_and_add( - last_context, transformed, mode, dropout=self.dropout) - - inputs = transformed - - if last_attention is not None: - # The first head of the last layer is returned. - first_head_attention = last_attention[:, 0] - else: - first_head_attention = None - - outputs = transformer.norm(inputs) - return outputs, first_head_attention - - def decode_from_inputs(self, - inputs, - sequence_length, - initial_state=None, - mode=True, - memory=None, - memory_sequence_length=None): - outputs, attention = self._self_attention_stack( - inputs, - sequence_length=sequence_length, - mode=mode, - memory=memory, - memory_sequence_length=memory_sequence_length) - return outputs, None, attention - - def step_fn(self, - mode, - batch_size, - initial_state=None, - memory=None, - memory_sequence_length=None, - dtype=tf.float32): - if memory is None: - num_sources = 0 - elif tf.contrib.framework.nest.is_sequence(memory): - num_sources = len(memory) - else: - num_sources = 1 - cache = self._init_cache( - batch_size, dtype=dtype, num_sources=num_sources) - - def _fn(step, inputs, cache, mode): - inputs = tf.expand_dims(inputs, 1) - outputs, attention = self._self_attention_stack( - inputs, - mode=mode, - cache=cache, - memory=memory, - memory_sequence_length=memory_sequence_length, - step=step) - outputs = tf.squeeze(outputs, axis=1) - if attention is not None: - attention = tf.squeeze(attention, axis=1) - return outputs, cache, attention - - return _fn, cache diff --git a/modelscope/models/audio/tts/models/self_attention_encoder.py b/modelscope/models/audio/tts/models/self_attention_encoder.py deleted file mode 100755 index ce4193dc..00000000 --- a/modelscope/models/audio/tts/models/self_attention_encoder.py +++ /dev/null @@ -1,182 +0,0 @@ -"""Define the self-attention encoder.""" - -import tensorflow as tf - -from . 
import transformer -from .position import SinusoidalPositionEncoder - - -class SelfAttentionEncoder(): - """Encoder using self-attention as described in - https://arxiv.org/abs/1706.03762. - """ - - def __init__(self, - num_layers, - num_units=512, - num_heads=8, - ffn_inner_dim=2048, - dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - position_encoder=SinusoidalPositionEncoder()): - """Initializes the parameters of the encoder. - - Args: - num_layers: The number of layers. - num_units: The number of hidden units. - num_heads: The number of heads in the multi-head attention. - ffn_inner_dim: The number of units of the inner linear transformation - in the feed forward layer. - dropout: The probability to drop units from the outputs. - attention_dropout: The probability to drop units from the attention. - relu_dropout: The probability to drop units from the ReLU activation in - the feed forward layer. - position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to - apply on inputs or ``None``. - """ - super(SelfAttentionEncoder, self).__init__() - self.num_layers = num_layers - self.num_units = num_units - self.num_heads = num_heads - self.ffn_inner_dim = ffn_inner_dim - self.dropout = dropout - self.attention_dropout = attention_dropout - self.relu_dropout = relu_dropout - self.position_encoder = position_encoder - - def encode(self, inputs, sequence_length=None, mode=True): - inputs *= self.num_units**0.5 - if self.position_encoder is not None: - inputs = self.position_encoder(inputs) - - inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) - mask = transformer.build_sequence_mask( - sequence_length, - num_heads=self.num_heads, - maximum_length=tf.shape(inputs)[1]) - - mask_FF = tf.squeeze( - transformer.build_sequence_mask( - sequence_length, maximum_length=tf.shape(inputs)[1]), - axis=1) - - state = () - - attns = [] - for layer in range(self.num_layers): - with tf.variable_scope('layer_{}'.format(layer)): - with tf.variable_scope('multi_head'): - context, attn = transformer.multi_head_attention( - self.num_heads, - transformer.norm(inputs), - None, - mode, - num_units=self.num_units, - mask=mask, - dropout=self.attention_dropout, - return_attention=True) - attns.append(attn) - context = transformer.drop_and_add( - inputs, context, mode, dropout=self.dropout) - - with tf.variable_scope('ffn'): - transformed = transformer.feed_forward( - transformer.norm(context), - self.ffn_inner_dim, - mode, - dropout=self.relu_dropout, - mask=mask_FF) - transformed = transformer.drop_and_add( - context, transformed, mode, dropout=self.dropout) - - inputs = transformed - state += (tf.reduce_mean(inputs, axis=1), ) - - outputs = transformer.norm(inputs) - return (outputs, state, sequence_length, attns) - - -class SelfAttentionEncoderOri(): - """Encoder using self-attention as described in - https://arxiv.org/abs/1706.03762. - """ - - def __init__(self, - num_layers, - num_units=512, - num_heads=8, - ffn_inner_dim=2048, - dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - position_encoder=SinusoidalPositionEncoder()): - """Initializes the parameters of the encoder. - - Args: - num_layers: The number of layers. - num_units: The number of hidden units. - num_heads: The number of heads in the multi-head attention. - ffn_inner_dim: The number of units of the inner linear transformation - in the feed forward layer. - dropout: The probability to drop units from the outputs. - attention_dropout: The probability to drop units from the attention. 
- relu_dropout: The probability to drop units from the ReLU activation in - the feed forward layer. - position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to - apply on inputs or ``None``. - """ - super(SelfAttentionEncoderOri, self).__init__() - self.num_layers = num_layers - self.num_units = num_units - self.num_heads = num_heads - self.ffn_inner_dim = ffn_inner_dim - self.dropout = dropout - self.attention_dropout = attention_dropout - self.relu_dropout = relu_dropout - self.position_encoder = position_encoder - - def encode(self, inputs, sequence_length=None, mode=True): - inputs *= self.num_units**0.5 - if self.position_encoder is not None: - inputs = self.position_encoder(inputs) - - inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) - mask = transformer.build_sequence_mask( - sequence_length, - num_heads=self.num_heads, - maximum_length=tf.shape(inputs)[1]) # [N, 1, 1, T_out] - - state = () - - attns = [] - for layer in range(self.num_layers): - with tf.variable_scope('layer_{}'.format(layer)): - with tf.variable_scope('multi_head'): - context, attn = transformer.multi_head_attention( - self.num_heads, - transformer.norm(inputs), - None, - mode, - num_units=self.num_units, - mask=mask, - dropout=self.attention_dropout, - return_attention=True) - attns.append(attn) - context = transformer.drop_and_add( - inputs, context, mode, dropout=self.dropout) - - with tf.variable_scope('ffn'): - transformed = transformer.feed_forward_ori( - transformer.norm(context), - self.ffn_inner_dim, - mode, - dropout=self.relu_dropout) - transformed = transformer.drop_and_add( - context, transformed, mode, dropout=self.dropout) - - inputs = transformed - state += (tf.reduce_mean(inputs, axis=1), ) - - outputs = transformer.norm(inputs) - return (outputs, state, sequence_length, attns) diff --git a/modelscope/models/audio/tts/models/transformer.py b/modelscope/models/audio/tts/models/transformer.py deleted file mode 100755 index a9f0bedc..00000000 --- a/modelscope/models/audio/tts/models/transformer.py +++ /dev/null @@ -1,1157 +0,0 @@ -"""Define layers related to the Google's Transformer model.""" - -import tensorflow as tf - -from . import compat, fsmn - - -def tile_sequence_length(sequence_length, num_heads): - """Tiles lengths :obj:`num_heads` times. - - Args: - sequence_length: The sequence length. - num_heads: The number of heads. - - Returns: - A ``tf.Tensor`` where each length is replicated :obj:`num_heads` times. - """ - sequence_length = tf.tile(sequence_length, [num_heads]) - sequence_length = tf.reshape(sequence_length, [num_heads, -1]) - sequence_length = tf.transpose(sequence_length, perm=[1, 0]) - sequence_length = tf.reshape(sequence_length, [-1]) - return sequence_length - - -def build_sequence_mask(sequence_length, - num_heads=None, - maximum_length=None, - dtype=tf.float32): - """Builds the dot product mask. - - Args: - sequence_length: The sequence length. - num_heads: The number of heads. - maximum_length: Optional size of the returned time dimension. Otherwise - it is the maximum of :obj:`sequence_length`. - dtype: The type of the mask tensor. - - Returns: - A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape - ``[batch_size, 1, 1, max_length]``. 
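# Illustrative example (not part of the deleted file): for lengths [2, 3] with
# maximum_length=3 and num_heads set, build_sequence_mask returns a mask that
# broadcasts as [batch_size, 1, 1, max_length]. NumPy stand-in for the TF call;
# numpy is an assumption.
import numpy as np

lengths = np.array([2, 3])
base = (np.arange(3)[None, :] < lengths[:, None]).astype(np.float32)  # [[1,1,0],[1,1,1]]
mask = base[:, None, None, :]                                         # [2, 1, 1, 3]
assert mask.shape == (2, 1, 1, 3)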
- """ - mask = tf.sequence_mask( - sequence_length, maxlen=maximum_length, dtype=dtype) - mask = tf.expand_dims(mask, axis=1) - if num_heads is not None: - mask = tf.expand_dims(mask, axis=1) - return mask - - -def build_sequence_mask_window(sequence_length, - left_window_size=-1, - right_window_size=-1, - num_heads=None, - maximum_length=None, - dtype=tf.float32): - """Builds the dot product mask. - - Args: - sequence_length: The sequence length. - num_heads: The number of heads. - maximum_length: Optional size of the returned time dimension. Otherwise - it is the maximum of :obj:`sequence_length`. - dtype: The type of the mask tensor. - - Returns: - A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape - ``[batch_size, 1, 1, max_length]``. - """ - sequence_mask = tf.sequence_mask( - sequence_length, maxlen=maximum_length, dtype=dtype) - mask = _window_mask( - sequence_length, - left_window_size=left_window_size, - right_window_size=right_window_size, - maximum_length=maximum_length, - dtype=dtype) - mask *= tf.expand_dims(sequence_mask, axis=1) - if num_heads is not None: - mask = tf.expand_dims(mask, axis=1) - return mask - - -def _lower_triangle_mask(sequence_length, - maximum_length=None, - dtype=tf.float32, - band=-1): - batch_size = tf.shape(sequence_length)[0] - if maximum_length is None: - maximum_length = tf.reduce_max(sequence_length) - mask = tf.ones([batch_size, maximum_length, maximum_length], dtype=dtype) - mask = compat.tf_compat( - v2='linalg.band_part', v1='matrix_band_part')(mask, band, 0) - return mask - - -def _higher_triangle_mask(sequence_length, - maximum_length=None, - dtype=tf.float32, - band=-1): - batch_size = tf.shape(sequence_length)[0] - if maximum_length is None: - maximum_length = tf.reduce_max(sequence_length) - mask = tf.ones([batch_size, maximum_length, maximum_length], dtype=dtype) - mask = compat.tf_compat( - v2='linalg.band_part', v1='matrix_band_part')(mask, 0, band) - return mask - - -def _window_mask(sequence_length, - left_window_size=-1, - right_window_size=-1, - maximum_length=None, - dtype=tf.float32): - batch_size = tf.shape(sequence_length)[0] - if maximum_length is None: - maximum_length = tf.reduce_max(sequence_length) - mask = tf.ones([batch_size, maximum_length, maximum_length], dtype=dtype) - left_window_size = tf.minimum( - tf.cast(left_window_size, tf.int64), - tf.cast(maximum_length - 1, tf.int64)) - right_window_size = tf.minimum( - tf.cast(right_window_size, tf.int64), - tf.cast(maximum_length - 1, tf.int64)) - mask = tf.matrix_band_part(mask, left_window_size, right_window_size) - return mask - - -def build_future_mask(sequence_length, - num_heads=None, - maximum_length=None, - dtype=tf.float32, - band=-1): - """Builds the dot product mask for future positions. - - Args: - sequence_length: The sequence length. - num_heads: The number of heads. - maximum_length: Optional size of the returned time dimension. Otherwise - it is the maximum of :obj:`sequence_length`. - dtype: The type of the mask tensor. - - Returns: - A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape - ``[batch_size, 1, max_length, max_length]``. 
- """ - sequence_mask = tf.sequence_mask( - sequence_length, maxlen=maximum_length, dtype=dtype) - mask = _lower_triangle_mask( - sequence_length, maximum_length=maximum_length, dtype=dtype, band=band) - mask *= tf.expand_dims(sequence_mask, axis=1) - if num_heads is not None: - mask = tf.expand_dims(mask, axis=1) - return mask - - -def build_history_mask(sequence_length, - num_heads=None, - maximum_length=None, - dtype=tf.float32, - band=-1): - """Builds the dot product mask for future positions. - - Args: - sequence_length: The sequence length. - num_heads: The number of heads. - maximum_length: Optional size of the returned time dimension. Otherwise - it is the maximum of :obj:`sequence_length`. - dtype: The type of the mask tensor. - - Returns: - A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape - ``[batch_size, 1, max_length, max_length]``. - """ - sequence_mask = tf.sequence_mask( - sequence_length, maxlen=maximum_length, dtype=dtype) - mask = _higher_triangle_mask( - sequence_length, maximum_length=maximum_length, dtype=dtype, band=band) - mask *= tf.expand_dims(sequence_mask, axis=1) - if num_heads is not None: - mask = tf.expand_dims(mask, axis=1) - return mask - - -def cumulative_average_mask(sequence_length, - maximum_length=None, - dtype=tf.float32): - """Builds the mask to compute the cumulative average as described in - https://arxiv.org/abs/1805.00631. - - Args: - sequence_length: The sequence length. - maximum_length: Optional size of the returned time dimension. Otherwise - it is the maximum of :obj:`sequence_length`. - dtype: The type of the mask tensor. - - Returns: - A ``tf.Tensor`` of type :obj:`dtype` and shape - ``[batch_size, max_length, max_length]``. - """ - sequence_mask = tf.sequence_mask( - sequence_length, maxlen=maximum_length, dtype=dtype) - mask = _lower_triangle_mask( - sequence_length, maximum_length=maximum_length, dtype=dtype) - mask *= tf.expand_dims(sequence_mask, axis=2) - weight = tf.range(1, tf.cast(tf.shape(mask)[1] + 1, dtype), dtype=dtype) - mask /= tf.expand_dims(weight, 1) - return mask - - -def cumulative_average(inputs, mask_or_step, cache=None): - """Computes the cumulative average as described in - https://arxiv.org/abs/1805.00631. - - Args: - inputs: The sequence to average. A tensor of shape :math:`[B, T, D]`. - mask_or_step: If :obj:`cache` is set, this is assumed to be the current step - of the dynamic decoding. Otherwise, it is the mask matrix used to compute - the cumulative average. - cache: A dictionnary containing the cumulative average of the previous step. - - Returns: - The cumulative average, a tensor of the same shape and type as :obj:`inputs`. - """ - if cache is not None: - step = tf.cast(mask_or_step, inputs.dtype) - aa = (inputs + step * cache['prev_g']) / (step + 1.0) - cache['prev_g'] = aa - return aa - else: - mask = mask_or_step - return tf.matmul(mask, inputs) - - -def fused_projection(inputs, num_units, num_outputs=1): - """Projects the same input into multiple output spaces. - - Args: - inputs: The inputs to project. - num_units: The number of output units of each space. - num_outputs: The number of output spaces. - - Returns: - :obj:`num_outputs` ``tf.Tensor`` of depth :obj:`num_units`. - """ - return tf.split( - tf.layers.conv1d(inputs, num_units * num_outputs, 1), - num_outputs, - axis=2) - - -def split_heads(inputs, num_heads): - """Splits a tensor in depth. - - Args: - inputs: A ``tf.Tensor`` of shape :math:`[B, T, D]`. - num_heads: The number of heads :math:`H`. 
- - Returns: - A ``tf.Tensor`` of shape :math:`[B, H, T, D / H]`. - """ - static_shape = inputs.get_shape().as_list() - depth = static_shape[-1] - outputs = tf.reshape(inputs, [ - tf.shape(inputs)[0], - tf.shape(inputs)[1], num_heads, depth // num_heads - ]) - outputs = tf.transpose(outputs, perm=[0, 2, 1, 3]) - return outputs - - -def combine_heads(inputs): - """Concatenates heads. - - Args: - inputs: A ``tf.Tensor`` of shape :math:`[B, H, T, D]`. - - Returns: - A ``tf.Tensor`` of shape :math:`[B, T, D * H]`. - """ - static_shape = inputs.get_shape().as_list() - depth = static_shape[-1] - num_heads = static_shape[1] - outputs = tf.transpose(inputs, perm=[0, 2, 1, 3]) - outputs = tf.reshape( - outputs, - [tf.shape(outputs)[0], - tf.shape(outputs)[1], depth * num_heads]) - return outputs - - -def dot_product_attention(queries, keys, values, mode, mask=None, dropout=0.0): - """Computes the dot product attention. - - Args: - queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. - keys: The sequence use to calculate attention scores. A tensor of shape - :math:`[B, T_2, ...]`. - values: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. - mode: A ``tf.estimator.ModeKeys`` mode. - mask: A ``tf.Tensor`` applied to the dot product. - dropout: The probability to drop units from the inputs. - - Returns: - A tuple ``(context vector, attention vector)``. - """ - dot = tf.matmul(queries, keys, transpose_b=True) - - if mask is not None: - dot = tf.cast( - tf.cast(dot, tf.float32) * mask + ((1.0 - mask) * tf.float32.min), - dot.dtype) - - softmax = tf.nn.softmax(tf.cast(dot, tf.float32)) - attn = tf.cast(softmax, dot.dtype) - drop_attn = tf.layers.dropout(attn, rate=dropout, training=mode) - - context = tf.matmul(drop_attn, values) - - return context, attn - - -def dot_product_attention_wpa(num_heads, - queries, - keys, - values, - mode, - attention_left_window=-1, - attention_right_window=0, - mask=None, - max_id_cache=None, - mono=False, - peak_delay=-1, - dropout=0.0): - """ - Computes the dot product attention. - Args: - queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. - keys: The sequence use to calculate attention scores. A tensor of shape - :math:`[B, T_2, ...]`. - values: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. - mode: A ``tf.estimator.ModeKeys`` mode. - mask: A ``tf.Tensor`` applied to the dot product. - dropout: The probability to drop units from the inputs. - - Returns: - A tuple ``(context vector, attention vector)``. - """ - # Dot product between queries and keys. 
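# Illustrative sketch (not part of the deleted file): dot_product_attention above
# is plain softmax(Q K^T) V, with masked positions pushed to float32.min before
# the softmax. NumPy equivalent without dropout; numpy and the toy shapes are
# assumptions.
import numpy as np

def np_dot_product_attention(q, k, v, mask=None):
    dot = q @ k.transpose(0, 2, 1)                            # [B, T_q, T_k]
    if mask is not None:
        dot = dot * mask + (1.0 - mask) * np.finfo(np.float32).min
    attn = np.exp(dot - dot.max(axis=-1, keepdims=True))      # stable softmax
    attn /= attn.sum(axis=-1, keepdims=True)
    return attn @ v, attn

q = np.random.rand(1, 2, 4)
k = np.random.rand(1, 3, 4)
v = np.random.rand(1, 3, 4)
ctx, attn = np_dot_product_attention(q, k, v)
assert ctx.shape == (1, 2, 4) and attn.shape == (1, 2, 3)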
- dot = tf.matmul(queries, keys, transpose_b=True) - depth = tf.shape(dot)[-1] - if mask is not None: - dot = tf.cast( - tf.cast(dot, tf.float32) * mask + ((1.0 - mask) * tf.float32.min), - dot.dtype) - # wpa - max_id = tf.math.argmax(input=dot, axis=-1) - # peak delay - if peak_delay > 0: - if max_id_cache is not None: - M = tf.cast(max_id_cache['pre_max_id'], dtype=max_id.dtype) - inputs_len = tf.math.minimum( - M + peak_delay, tf.cast(depth - 1, dtype=max_id.dtype)) - delay_mask = tf.sequence_mask( - inputs_len, maxlen=depth, dtype=tf.float32) - dot = tf.cast( - tf.cast(dot, tf.float32) * delay_mask - + ((1.0 - delay_mask) * tf.float32.min), dot.dtype) # yapf:disable - max_id = tf.math.argmax(input=dot, axis=-1) - # mono - if mono: - if max_id_cache is None: - d = tf.shape(max_id)[-1] - tmp_max_id = tf.reshape(max_id, [-1, num_heads, d]) - tmp_max_id = tf.slice( - tmp_max_id, [0, 0, 0], - [tf.shape(tmp_max_id)[0], - tf.shape(tmp_max_id)[1], d - 1]) - zeros = tf.zeros( - shape=(tf.shape(tmp_max_id)[0], tf.shape(tmp_max_id)[1], 1), - dtype=max_id.dtype) - tmp_max_id = tf.concat([zeros, tmp_max_id], axis=-1) - mask1 = tf.sequence_mask( - tmp_max_id, maxlen=depth, dtype=tf.float32) - dot = tf.cast( - tf.cast(dot, tf.float32) - * (1.0 - mask1) + mask1 * tf.float32.min, dot.dtype) # yapf:disable - max_id = tf.math.argmax(input=dot, axis=-1) - else: - # eval - tmp_max_id = tf.reshape(max_id, [-1, num_heads, 1]) - max_id_cache['pre_max_id'] = tmp_max_id - # right_mask - right_offset = tf.constant(attention_right_window, dtype=max_id.dtype) - right_len = tf.math.minimum(max_id + right_offset, - tf.cast(depth - 1, dtype=max_id.dtype)) - right_mask = tf.sequence_mask(right_len, maxlen=depth, dtype=tf.float32) - dot = tf.cast( - tf.cast(dot, tf.float32) * right_mask - + ((1.0 - right_mask) * tf.float32.min), dot.dtype) # yapf:disable - # left_mask - if attention_left_window > 0: - left_offset = tf.constant(attention_left_window, dtype=max_id.dtype) - left_len = tf.math.maximum(max_id - left_offset, - tf.cast(0, dtype=max_id.dtype)) - left_mask = tf.sequence_mask(left_len, maxlen=depth, dtype=tf.float32) - dot = tf.cast( - tf.cast(dot, tf.float32) * (1.0 - left_mask) - + (left_mask * tf.float32.min), dot.dtype) # yapf:disable - # Compute attention weights. - attn = tf.cast(tf.nn.softmax(tf.cast(dot, tf.float32)), dot.dtype) - drop_attn = tf.layers.dropout(attn, rate=dropout, training=mode) - - # Compute attention context. - context = tf.matmul(drop_attn, values) - - return context, attn - - -def multi_head_attention(num_heads, - queries, - memory, - mode, - num_units=None, - mask=None, - cache=None, - dropout=0.0, - return_attention=False): - """Computes the multi-head attention as described in - https://arxiv.org/abs/1706.03762. - - Args: - num_heads: The number of attention heads. - queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. - memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. - If ``None``, computes self-attention. - mode: A ``tf.estimator.ModeKeys`` mode. - num_units: The number of hidden units. If not set, it is set to the input - dimension. - mask: A ``tf.Tensor`` applied to the dot product. - cache: A dictionary containing pre-projected keys and values. - dropout: The probability to drop units from the inputs. - return_attention: Return the attention head probabilities in addition to the - context. - - Returns: - The concatenated attention context of each head and the attention - probabilities (if :obj:`return_attention` is set). 
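# Illustrative sketch (not part of the deleted file): multi_head_attention relies
# on the split_heads/combine_heads helpers above, which are a reshape + transpose
# round trip between [B, T, D] and [B, H, T, D/H]. NumPy check; numpy and the
# toy sizes are assumptions.
import numpy as np

B, T, D, H = 2, 5, 8, 4
x = np.random.rand(B, T, D)
heads = np.transpose(x.reshape(B, T, H, D // H), (0, 2, 1, 3))   # [B, H, T, D/H]
back = np.transpose(heads, (0, 2, 1, 3)).reshape(B, T, D)        # [B, T, D]
assert np.allclose(x, back)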
- """ - num_units = num_units or queries.get_shape().as_list()[-1] - - if num_units % num_heads != 0: - raise ValueError('Multi head attention requires that num_units is a' - ' multiple of {}'.format(num_heads)) - - if memory is None: - queries, keys, values = fused_projection( - queries, num_units, num_outputs=3) - - keys = split_heads(keys, num_heads) - values = split_heads(values, num_heads) - - if cache is not None: - keys = tf.concat([cache['self_keys'], keys], axis=2) - values = tf.concat([cache['self_values'], values], axis=2) - cache['self_keys'] = keys - cache['self_values'] = values - else: - queries = tf.layers.conv1d(queries, num_units, 1) - - if cache is not None: - - def _project_and_split(): - k, v = fused_projection(memory, num_units, num_outputs=2) - return split_heads(k, num_heads), split_heads(v, num_heads) - - keys, values = tf.cond( - tf.equal(tf.shape(cache['memory_keys'])[2], 0), - true_fn=_project_and_split, - false_fn=lambda: - (cache['memory_keys'], cache['memory_values'])) - cache['memory_keys'] = keys - cache['memory_values'] = values - else: - keys, values = fused_projection(memory, num_units, num_outputs=2) - keys = split_heads(keys, num_heads) - values = split_heads(values, num_heads) - - queries = split_heads(queries, num_heads) - queries *= (num_units // num_heads)**-0.5 - - heads, attn = dot_product_attention( - queries, keys, values, mode, mask=mask, dropout=dropout) - - # Concatenate all heads output. - combined = combine_heads(heads) - outputs = tf.layers.conv1d(combined, num_units, 1) - - if not return_attention: - return outputs - return outputs, attn - - -def multi_head_attention_PNCA(num_heads, - queries, - memory, - mode, - num_units=None, - mask=None, - mask_h=None, - cache=None, - cache_h=None, - dropout=0.0, - return_attention=False, - X_band_width=None, - layer_name='multi_head'): - """Computes the multi-head attention as described in - https://arxiv.org/abs/1706.03762. - - Args: - num_heads: The number of attention heads. - queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. - memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. - If ``None``, computes self-attention. - mode: A ``tf.estimator.ModeKeys`` mode. - num_units: The number of hidden units. If not set, it is set to the input - dimension. - mask: A ``tf.Tensor`` applied to the dot product. - cache: A dictionary containing pre-projected keys and values. - dropout: The probability to drop units from the inputs. - return_attention: Return the attention head probabilities in addition to the - context. - - Returns: - The concatenated attention context of each head and the attention - probabilities (if :obj:`return_attention` is set). 
- """ - num_units = num_units or queries.get_shape().as_list()[-1] - - if num_units % num_heads != 0: - raise ValueError('Multi head attention requires that num_units is a' - ' multiple of {}'.format(num_heads)) - - # X - queries, keys, values = fused_projection(queries, num_units, num_outputs=3) - - keys = split_heads(keys, num_heads) - values = split_heads(values, num_heads) - - if cache is not None: - keys = tf.concat([cache['self_keys'], keys], axis=2) - values = tf.concat([cache['self_values'], values], axis=2) - if X_band_width is not None: - keys_band = tf.cond( - tf.less(X_band_width, 0), lambda: keys, lambda: tf.cond( - tf.less(tf.shape(keys)[2], X_band_width), lambda: keys, - lambda: keys[:, :, -X_band_width:, :]) - ) # not support X_band_width == 0 - values_band = tf.cond( - tf.less(X_band_width, 0), lambda: values, lambda: tf.cond( - tf.less(tf.shape(values)[2], X_band_width), lambda: values, - lambda: values[:, :, -X_band_width:, :])) - cache['self_keys'] = keys_band - cache['self_values'] = values_band - else: - cache['self_keys'] = keys - cache['self_values'] = values - - queries = split_heads(queries, num_heads) - queries *= (num_units // num_heads)**-0.5 - - heads, attn = dot_product_attention( - queries, keys, values, mode, mask=mask, dropout=dropout) - - # Concatenate all heads output. - combined = combine_heads(heads) - outputs = tf.layers.conv1d(combined, num_units, 1) - - # H - if cache_h is not None: - - def _project_and_split(): - k, v = fused_projection(memory, num_units, num_outputs=2) - return split_heads(k, num_heads), split_heads(v, num_heads) - - keys_h, values_h = tf.cond( - tf.equal(tf.shape(cache_h['memory_keys'])[2], 0), - true_fn=_project_and_split, - false_fn=lambda: - (cache_h['memory_keys'], cache_h['memory_values'])) - cache_h['memory_keys'] = keys_h - cache_h['memory_values'] = values_h - else: - keys_h, values_h = fused_projection(memory, num_units, num_outputs=2) - keys_h = split_heads(keys_h, num_heads) - values_h = split_heads(values_h, num_heads) - - heads_h, attn_h = dot_product_attention( - queries, keys_h, values_h, mode, mask=mask_h, dropout=dropout) - - # Concatenate all heads output. - combined_h = combine_heads(heads_h) - outputs_h = tf.layers.conv1d(combined_h, num_units, 1) - - # ADD - outputs = outputs + outputs_h - - # RETURN - return outputs, attn, attn_h - - -def multi_head_attention_memory(num_heads, - queries, - memory, - mode, - num_memory=None, - num_units=None, - mask=None, - cache=None, - dropout=0.0, - return_attention=False): - """Computes the multi-head attention as described in - https://arxiv.org/abs/1706.03762. - - Args: - num_heads: The number of attention heads. - queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. - memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. - If ``None``, computes self-attention. - mode: A ``tf.estimator.ModeKeys`` mode. - num_units: The number of hidden units. If not set, it is set to the input - dimension. - mask: A ``tf.Tensor`` applied to the dot product. - cache: A dictionary containing pre-projected keys and values. - dropout: The probability to drop units from the inputs. - return_attention: Return the attention head probabilities in addition to the - context. - - Returns: - The concatenated attention context of each head and the attention - probabilities (if :obj:`return_attention` is set). 
- """ - num_units = num_units or queries.get_shape().as_list()[-1] - - if num_units % num_heads != 0: - raise ValueError('Multi head attention requires that num_units is a' - ' multiple of {}'.format(num_heads)) - - # PERSISTENT MEMORY - # key memory - if num_memory is not None: - key_m = tf.get_variable( - 'key_m', - shape=[num_memory, num_units], - initializer=tf.glorot_uniform_initializer(), - dtype=tf.float32) - # value memory - value_m = tf.get_variable( - 'value_m', - shape=[num_memory, num_units], - initializer=tf.glorot_uniform_initializer(), - dtype=tf.float32) - if memory is None: - queries, keys, values = fused_projection( - queries, num_units, num_outputs=3) - - # concat memory - if num_memory is not None: - key_m_expand = tf.tile( - tf.expand_dims(key_m, 0), [tf.shape(keys)[0], 1, 1]) - value_m_expand = tf.tile( - tf.expand_dims(value_m, 0), [tf.shape(values)[0], 1, 1]) - keys = tf.concat([key_m_expand, keys], axis=1) - values = tf.concat([value_m_expand, values], axis=1) - - keys = split_heads(keys, num_heads) - values = split_heads(values, num_heads) - - if cache is not None: - keys = tf.concat([cache['self_keys'], keys], axis=2) - values = tf.concat([cache['self_values'], values], axis=2) - cache['self_keys'] = keys - cache['self_values'] = values - else: - queries = tf.layers.conv1d(queries, num_units, 1) - - if cache is not None: - - def _project_and_split(): - k, v = fused_projection(memory, num_units, num_outputs=2) - return split_heads(k, num_heads), split_heads(v, num_heads) - - keys, values = tf.cond( - tf.equal(tf.shape(cache['memory_keys'])[2], 0), - true_fn=_project_and_split, - false_fn=lambda: - (cache['memory_keys'], cache['memory_values'])) - cache['memory_keys'] = keys - cache['memory_values'] = values - else: - keys, values = fused_projection(memory, num_units, num_outputs=2) - keys = split_heads(keys, num_heads) - values = split_heads(values, num_heads) - - queries = split_heads(queries, num_heads) - queries *= (num_units // num_heads)**-0.5 - - heads, attn = dot_product_attention( - queries, keys, values, mode, mask=mask, dropout=dropout) - - # Concatenate all heads output. - combined = combine_heads(heads) - outputs = tf.layers.conv1d(combined, num_units, 1) - - if not return_attention: - return outputs - return outputs, attn - - -def Ci_Cd_Memory(num_heads, - queries, - mode, - filter_size=None, - num_memory=None, - num_units=None, - fsmn_mask=None, - san_mask=None, - cache=None, - shift=None, - dropout=0.0, - return_attention=False): - """ - Args: - num_heads: The number of attention heads. - queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. - memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. - If ``None``, computes self-attention. - mode: A ``tf.estimator.ModeKeys`` mode. - num_units: The number of hidden units. If not set, it is set to the input - dimension. - mask: A ``tf.Tensor`` applied to the dot product. - cache: A dictionary containing pre-projected keys and values. - dropout: The probability to drop units from the inputs. - return_attention: Return the attention head probabilities in addition to the - context. - - Returns: - The concatenated attention context of each head and the attention - probabilities (if :obj:`return_attention` is set). 
- """ - num_units = num_units or queries.get_shape().as_list()[-1] - - if num_units % num_heads != 0: - raise ValueError('Multi head attention requires that num_units is a' - ' multiple of {}'.format(num_heads)) - # PERSISTENT MEMORY - if num_memory is not None: - key_m = tf.get_variable( - 'key_m', - shape=[num_memory, num_units], - initializer=tf.glorot_uniform_initializer(), - dtype=tf.float32) - value_m = tf.get_variable( - 'value_m', - shape=[num_memory, num_units], - initializer=tf.glorot_uniform_initializer(), - dtype=tf.float32) - - queries, keys, values = fused_projection(queries, num_units, num_outputs=3) - # fsmn memory block - if shift is not None: - # encoder - fsmn_memory = fsmn.MemoryBlockV2( - values, - filter_size, - mode, - shift=shift, - mask=fsmn_mask, - dropout=dropout) - else: - # decoder - fsmn_memory = fsmn.UniMemoryBlock( - values, - filter_size, - mode, - cache=cache, - mask=fsmn_mask, - dropout=dropout) - - # concat persistent memory - if num_memory is not None: - key_m_expand = tf.tile( - tf.expand_dims(key_m, 0), [tf.shape(keys)[0], 1, 1]) - value_m_expand = tf.tile( - tf.expand_dims(value_m, 0), [tf.shape(values)[0], 1, 1]) - keys = tf.concat([key_m_expand, keys], axis=1) - values = tf.concat([value_m_expand, values], axis=1) - - keys = split_heads(keys, num_heads) - values = split_heads(values, num_heads) - - if cache is not None: - keys = tf.concat([cache['self_keys'], keys], axis=2) - values = tf.concat([cache['self_values'], values], axis=2) - cache['self_keys'] = keys - cache['self_values'] = values - - queries = split_heads(queries, num_heads) - queries *= (num_units // num_heads)**-0.5 - - heads, attn = dot_product_attention( - queries, keys, values, mode, mask=san_mask, dropout=dropout) - - # Concatenate all heads output. - combined = combine_heads(heads) - outputs = tf.layers.conv1d(combined, num_units, 1) - outputs = outputs + fsmn_memory - - if not return_attention: - return outputs - return outputs, attn - - -def multi_head_attention_wpa(num_heads, - queries, - memory, - mode, - attention_left_window=-1, - attention_right_window=0, - num_units=None, - mask=None, - cache=None, - max_id_cache=None, - dropout=0.0, - mono=False, - peak_delay=-1, - return_attention=False): - """Computes the multi-head attention as described in - https://arxiv.org/abs/1706.03762. - - Args: - num_heads: The number of attention heads. - queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. - memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. - If ``None``, computes self-attention. - mode: A ``tf.estimator.ModeKeys`` mode. - num_units: The number of hidden units. If not set, it is set to the input - dimension. - mask: A ``tf.Tensor`` applied to the dot product. - cache: A dictionary containing pre-projected keys and values. - dropout: The probability to drop units from the inputs. - return_attention: Return the attention head probabilities in addition to the - context. - - Returns: - The concatenated attention context of each head and the attention - probabilities (if :obj:`return_attention` is set). 
- """ - num_units = num_units or queries.get_shape().as_list()[-1] - - if num_units % num_heads != 0: - raise ValueError('Multi head attention requires that num_units is a' - ' multiple of {}'.format(num_heads)) - - if memory is None: - queries, keys, values = fused_projection( - queries, num_units, num_outputs=3) - - keys = split_heads(keys, num_heads) - values = split_heads(values, num_heads) - - if cache is not None: - keys = tf.concat([cache['self_keys'], keys], axis=2) - values = tf.concat([cache['self_values'], values], axis=2) - cache['self_keys'] = keys - cache['self_values'] = values - else: - queries = tf.layers.conv1d(queries, num_units, 1) - - if cache is not None: - - def _project_and_split(): - k, v = fused_projection(memory, num_units, num_outputs=2) - return split_heads(k, num_heads), split_heads(v, num_heads) - - keys, values = tf.cond( - tf.equal(tf.shape(cache['memory_keys'])[2], 0), - true_fn=_project_and_split, - false_fn=lambda: - (cache['memory_keys'], cache['memory_values'])) - cache['memory_keys'] = keys - cache['memory_values'] = values - else: - keys, values = fused_projection(memory, num_units, num_outputs=2) - keys = split_heads(keys, num_heads) - values = split_heads(values, num_heads) - - queries = split_heads(queries, num_heads) - queries *= (num_units // num_heads)**-0.5 - - heads, attn = dot_product_attention_wpa( - num_heads, - queries, - keys, - values, - mode, - attention_left_window=attention_left_window, - attention_right_window=attention_right_window, - mask=mask, - max_id_cache=max_id_cache, - mono=mono, - peak_delay=peak_delay, - dropout=dropout) - - # Concatenate all heads output. - combined = combine_heads(heads) - outputs = tf.layers.conv1d(combined, num_units, 1) - - if not return_attention: - return outputs - return outputs, attn - - -def feed_forward(x, inner_dim, mode, dropout=0.0, mask=None): - """Implements the Transformer's "Feed Forward" layer. - - .. math:: - - ffn(x) = max(0, x*W_1 + b_1)*W_2 + b_2 - - Args: - x: The input. - inner_dim: The number of units of the inner linear transformation. - mode: A ``tf.estimator.ModeKeys`` mode. - dropout: The probability to drop units from the inner transformation. - - Returns: - The transformed input. - """ - input_dim = x.get_shape().as_list()[-1] - - if mask is not None: - x = x * tf.expand_dims(mask, -1) - - inner = tf.layers.conv1d( - x, inner_dim, 3, padding='same', activation=tf.nn.relu) - - if mask is not None: - inner = inner * tf.expand_dims(mask, -1) - inner = tf.layers.dropout(inner, rate=dropout, training=mode) - outer = tf.layers.conv1d(inner, input_dim, 1) - - return outer - - -def feed_forward_ori(x, inner_dim, mode, dropout=0.0): - """Implements the Transformer's "Feed Forward" layer. - - .. math:: - - ffn(x) = max(0, x*W_1 + b_1)*W_2 + b_2 - - Args: - x: The input. - inner_dim: The number of units of the inner linear transformation. - mode: A ``tf.estimator.ModeKeys`` mode. - dropout: The probability to drop units from the inner transformation. - - Returns: - The transformed input. - """ - input_dim = x.get_shape().as_list()[-1] - - inner = tf.layers.conv1d(x, inner_dim, 1, activation=tf.nn.relu) - inner = tf.layers.dropout(inner, rate=dropout, training=mode) - outer = tf.layers.conv1d(inner, input_dim, 1) - - return outer - - -def norm(inputs): - """Layer normalizes :obj:`inputs`.""" - return tf.contrib.layers.layer_norm(inputs, begin_norm_axis=-1) - - -def drop_and_add(inputs, outputs, mode, dropout=0.1): - """Drops units in the outputs and adds the previous values. 
- - Args: - inputs: The input of the previous layer. - outputs: The output of the previous layer. - mode: A ``tf.estimator.ModeKeys`` mode. - dropout: The probability to drop units in :obj:`outputs`. - - Returns: - The residual and normalized output. - """ - outputs = tf.layers.dropout(outputs, rate=dropout, training=mode) - - input_dim = inputs.get_shape().as_list()[-1] - output_dim = outputs.get_shape().as_list()[-1] - - if input_dim == output_dim: - outputs += inputs - return outputs - - -class FeedForwardNetwork(tf.keras.layers.Layer): - """Implements the Transformer's "Feed Forward" layer. - - .. math:: - - ffn(x) = max(0, x*W_1 + b_1)*W_2 + b_2 - - Note: - Object-oriented implementation for TensorFlow 2.0. - """ - - def __init__(self, - inner_dim, - output_dim, - dropout=0.1, - activation=tf.nn.relu, - **kwargs): - """Initializes this layer. - - Args: - inner_dim: The number of units of the inner linear transformation. - output_dim: The number of units of the ouput linear transformation. - dropout: The probability to drop units from the activation output. - activation: The activation function to apply between the two linear - transformations. - kwargs: Additional layer arguments. - """ - super(FeedForwardNetwork, self).__init__(**kwargs) - self.inner = tf.keras.layers.Dense( - inner_dim, activation=activation, name='inner') - self.outer = tf.keras.layers.Dense(output_dim, name='outer') - self.dropout = dropout - - def call(self, inputs, training=None): # pylint: disable=arguments-differ - """Runs the layer.""" - inner = self.inner(inputs) - inner = tf.layers.dropout(inner, self.dropout, training=training) - return self.outer(inner) - - -class MultiHeadAttention(tf.keras.layers.Layer): - """Computes the multi-head attention as described in - https://arxiv.org/abs/1706.03762. - - Note: - Object-oriented implementation for TensorFlow 2.0. - """ - - def __init__(self, - num_heads, - num_units, - dropout=0.1, - return_attention=False, - **kwargs): - """Initializes this layers. - - Args: - num_heads: The number of attention heads. - num_units: The number of hidden units. - dropout: The probability to drop units from the inputs. - return_attention: If ``True``, also return the attention weights of the - first head. - kwargs: Additional layer arguments. - """ - super(MultiHeadAttention, self).__init__(**kwargs) - if num_units % num_heads != 0: - raise ValueError( - 'Multi head attention requires that num_units is a' - ' multiple of %s' % num_heads) - self.num_heads = num_heads - self.num_units = num_units - self.linear_queries = tf.keras.layers.Dense( - num_units, name='linear_queries') - self.linear_keys = tf.keras.layers.Dense(num_units, name='linear_keys') - self.linear_values = tf.keras.layers.Dense( - num_units, name='linear_values') - self.linear_output = tf.keras.layers.Dense( - num_units, name='linear_output') - self.dropout = dropout - self.return_attention = return_attention - - def call(self, inputs, memory=None, mask=None, cache=None, training=None): # pylint: disable=arguments-differ - """Runs the layer. - - Args: - inputs: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. - memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. - If ``None``, computes self-attention. - mask: A ``tf.Tensor`` applied to the dot product. - cache: A dictionary containing pre-projected keys and values. - training: Run in training mode. 
- - Returns: - A tuple with the attention context, the updated cache and the attention - probabilities of the first head (if :obj:`return_attention` is ``True``). - """ - - def _compute_kv(x): - keys = self.linear_keys(x) - keys = split_heads(keys, self.num_heads) - values = self.linear_values(x) - values = split_heads(values, self.num_heads) - return keys, values - - # Compute queries. - queries = self.linear_queries(inputs) - queries = split_heads(queries, self.num_heads) - queries *= (self.num_units // self.num_heads)**-0.5 - - # Compute keys and values. - if memory is None: - keys, values = _compute_kv(inputs) - if cache: - keys = tf.concat([cache[0], keys], axis=2) - values = tf.concat([cache[1], values], axis=2) - else: - if cache: - if not self.linear_keys.built: - # Ensure that the variable names are not impacted by the tf.cond name - # scope if the layers have not already been built. - with tf.name_scope(self.linear_keys.name): - self.linear_keys.build(memory.shape) - with tf.name_scope(self.linear_values.name): - self.linear_values.build(memory.shape) - keys, values = tf.cond( - tf.equal(tf.shape(cache[0])[2], 0), - true_fn=lambda: _compute_kv(memory), - false_fn=lambda: cache) - else: - keys, values = _compute_kv(memory) - - cache = (keys, values) - - # Dot product attention. - dot = tf.matmul(queries, keys, transpose_b=True) - if mask is not None: - mask = tf.expand_dims(tf.cast(mask, tf.float32), - 1) # Broadcast on heads dimension. - dot = tf.cast( - tf.cast(dot, tf.float32) * mask - + ((1.0 - mask) * tf.float32.min), dot.dtype) # yapf:disable - attn = tf.cast(tf.nn.softmax(tf.cast(dot, tf.float32)), dot.dtype) - drop_attn = tf.layers.dropout(attn, self.dropout, training=training) - heads = tf.matmul(drop_attn, values) - - # Concatenate all heads output. 
- combined = combine_heads(heads) - outputs = self.linear_output(combined) - if self.return_attention: - return outputs, cache, attn - return outputs, cache diff --git a/modelscope/models/audio/tts/models/utils.py b/modelscope/models/audio/tts/models/utils.py deleted file mode 100755 index 03e1ef8c..00000000 --- a/modelscope/models/audio/tts/models/utils.py +++ /dev/null @@ -1,59 +0,0 @@ -import glob -import os - -import matplotlib -import matplotlib.pylab as plt -import torch -from torch.nn.utils import weight_norm - -matplotlib.use('Agg') - - -def plot_spectrogram(spectrogram): - fig, ax = plt.subplots(figsize=(10, 2)) - im = ax.imshow( - spectrogram, aspect='auto', origin='lower', interpolation='none') - plt.colorbar(im, ax=ax) - - fig.canvas.draw() - plt.close() - - return fig - - -def init_weights(m, mean=0.0, std=0.01): - classname = m.__class__.__name__ - if classname.find('Conv') != -1: - m.weight.data.normal_(mean, std) - - -def apply_weight_norm(m): - classname = m.__class__.__name__ - if classname.find('Conv') != -1: - weight_norm(m) - - -def get_padding(kernel_size, dilation=1): - return int((kernel_size * dilation - dilation) / 2) - - -def load_checkpoint(filepath, device): - assert os.path.isfile(filepath) - print("Loading '{}'".format(filepath)) - checkpoint_dict = torch.load(filepath, map_location=device) - print('Complete.') - return checkpoint_dict - - -def save_checkpoint(filepath, obj): - print('Saving checkpoint to {}'.format(filepath)) - torch.save(obj, filepath) - print('Complete.') - - -def scan_checkpoint(cp_dir, prefix): - pattern = os.path.join(cp_dir, prefix + '????????') - cp_list = glob.glob(pattern) - if len(cp_list) == 0: - return None - return sorted(cp_list)[-1] diff --git a/modelscope/models/audio/tts/models/utils/__init__.py b/modelscope/models/audio/tts/models/utils/__init__.py new file mode 100644 index 00000000..e07f08ea --- /dev/null +++ b/modelscope/models/audio/tts/models/utils/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from .utils import * # noqa F403 diff --git a/modelscope/models/audio/tts/models/utils/utils.py b/modelscope/models/audio/tts/models/utils/utils.py new file mode 100755 index 00000000..17ac8aee --- /dev/null +++ b/modelscope/models/audio/tts/models/utils/utils.py @@ -0,0 +1,136 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
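All of the attention variants removed above share the same head bookkeeping: project to [B, T, D], split into [B, H, T, D / H] for the per-head dot products, then merge back after attention. A minimal PyTorch sketch of that round trip, using illustrative local functions rather than anything shipped by this patch, for readers following the deleted TF1 helpers:

import torch

def split_heads(x, num_heads):
    # [B, T, D] -> [B, H, T, D // H], mirroring the deleted split_heads helper
    b, t, d = x.shape
    return x.view(b, t, num_heads, d // num_heads).permute(0, 2, 1, 3)

def combine_heads(x):
    # [B, H, T, D // H] -> [B, T, D], mirroring the deleted combine_heads helper
    b, h, t, dh = x.shape
    return x.permute(0, 2, 1, 3).reshape(b, t, h * dh)

x = torch.randn(2, 5, 8)
assert torch.equal(combine_heads(split_heads(x, num_heads=4)), x)

The queries are additionally scaled by (num_units // num_heads) ** -0.5 before the dot product, which is the standard scaled dot-product attention from the cited paper.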
+ +import glob +import os +import shutil + +import matplotlib +import matplotlib.pylab as plt +import torch + +matplotlib.use('Agg') + + +class AttrDict(dict): + + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +def build_env(config, config_name, path): + t_path = os.path.join(path, config_name) + if config != t_path: + os.makedirs(path, exist_ok=True) + shutil.copyfile(config, os.path.join(path, config_name)) + + +def plot_spectrogram(spectrogram): + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow( + spectrogram, aspect='auto', origin='lower', interpolation='none') + plt.colorbar(im, ax=ax) + + fig.canvas.draw() + plt.close() + + return fig + + +def plot_alignment(alignment, info=None): + fig, ax = plt.subplots() + im = ax.imshow( + alignment, aspect='auto', origin='lower', interpolation='none') + fig.colorbar(im, ax=ax) + xlabel = 'Input timestep' + if info is not None: + xlabel += '\t' + info + plt.xlabel(xlabel) + plt.ylabel('Output timestep') + fig.canvas.draw() + plt.close() + + return fig + + +def load_checkpoint(filepath, device): + assert os.path.isfile(filepath) + checkpoint_dict = torch.load(filepath, map_location=device) + return checkpoint_dict + + +def save_checkpoint(filepath, obj): + torch.save(obj, filepath) + + +def scan_checkpoint(cp_dir, prefix): + pattern = os.path.join(cp_dir, prefix + '????????.pkl') + cp_list = glob.glob(pattern) + if len(cp_list) == 0: + return None + return sorted(cp_list)[-1] + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size * dilation - dilation) / 2) + + +class ValueWindow(): + + def __init__(self, window_size=100): + self._window_size = window_size + self._values = [] + + def append(self, x): + self._values = self._values[-(self._window_size - 1):] + [x] + + @property + def sum(self): + return sum(self._values) + + @property + def count(self): + return len(self._values) + + @property + def average(self): + return self.sum / max(1, self.count) + + def reset(self): + self._values = [] + + +def get_model_size(model): + param_num = sum([p.numel() for p in model.parameters() if p.requires_grad]) + param_size = param_num * 4 / 1024 / 1024 + return param_size + + +def get_grad_norm(model): + total_norm = 0 + params = [ + p for p in model.parameters() if p.grad is not None and p.requires_grad + ] + for p in params: + param_norm = p.grad.detach().data.norm(2) + total_norm += param_norm.item()**2 + total_norm = total_norm**0.5 + return total_norm + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find('Conv') != -1: + m.weight.data.normal_(mean, std) + + +def get_mask_from_lengths(lengths, max_len=None): + batch_size = lengths.shape[0] + if max_len is None: + max_len = torch.max(lengths).item() + + ids = torch.arange(0, max_len).unsqueeze(0).expand(batch_size, + -1).to(lengths.device) + mask = ids >= lengths.unsqueeze(1).expand(-1, max_len) + + return mask diff --git a/modelscope/models/audio/tts/models/vocoder_models.py b/modelscope/models/audio/tts/models/vocoder_models.py deleted file mode 100755 index c46a9204..00000000 --- a/modelscope/models/audio/tts/models/vocoder_models.py +++ /dev/null @@ -1,516 +0,0 @@ -from distutils.version import LooseVersion - -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d -from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm - -from .utils import get_padding, init_weights 
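Among the helpers consolidated into the new utils module above, get_mask_from_lengths builds a boolean padding mask: positions at or beyond each sequence length come back True. A small worked example that inlines the same arithmetic (stand-alone here rather than importing the module, since its package path is introduced by this patch):

import torch

lengths = torch.tensor([3, 1])   # two sequences of length 3 and 1
max_len = 4
ids = torch.arange(0, max_len).unsqueeze(0).expand(lengths.shape[0], -1)
mask = ids >= lengths.unsqueeze(1).expand(-1, max_len)
# mask -> [[False, False, False, True],
#          [False, True,  True,  True]]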
- -is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion('1.7') - - -def stft(x, fft_size, hop_size, win_length, window): - """Perform STFT and convert to magnitude spectrogram. - - Args: - x (Tensor): Input signal tensor (B, T). - fft_size (int): FFT size. - hop_size (int): Hop size. - win_length (int): Window length. - window (str): Window function type. - - Returns: - Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). - - """ - if is_pytorch_17plus: - x_stft = torch.stft( - x, fft_size, hop_size, win_length, window, return_complex=False) - else: - x_stft = torch.stft(x, fft_size, hop_size, win_length, window) - real = x_stft[..., 0] - imag = x_stft[..., 1] - - # NOTE(kan-bayashi): clamp is needed to avoid nan or inf - return torch.sqrt(torch.clamp(real**2 + imag**2, min=1e-7)).transpose(2, 1) - - -LRELU_SLOPE = 0.1 - - -def get_padding_casual(kernel_size, dilation=1): - return int(kernel_size * dilation - dilation) - - -class Conv1dCasual(torch.nn.Module): - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True, - padding_mode='zeros'): - super(Conv1dCasual, self).__init__() - self.pad = padding - self.conv1d = weight_norm( - Conv1d( - in_channels, - out_channels, - kernel_size, - stride, - padding=0, - dilation=dilation, - groups=groups, - bias=bias, - padding_mode=padding_mode)) - self.conv1d.apply(init_weights) - - def forward(self, x): # bdt - # described starting from the last dimension and moving forward. - x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), 'constant') - x = self.conv1d(x) - return x - - def remove_weight_norm(self): - remove_weight_norm(self.conv1d) - - -class ConvTranspose1dCausal(torch.nn.Module): - """CausalConvTranspose1d module with customized initialization.""" - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride, - padding=0): - """Initialize CausalConvTranspose1d module.""" - super(ConvTranspose1dCausal, self).__init__() - self.deconv = weight_norm( - ConvTranspose1d(in_channels, out_channels, kernel_size, stride)) - self.stride = stride - self.deconv.apply(init_weights) - self.pad = kernel_size - stride - - def forward(self, x): - """Calculate forward propagation. - Args: - x (Tensor): Input tensor (B, in_channels, T_in). - Returns: - Tensor: Output tensor (B, out_channels, T_out). 
- """ - # x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), "constant") - return self.deconv(x)[:, :, :-self.pad] - - def remove_weight_norm(self): - remove_weight_norm(self.deconv) - - -class ResBlock1(torch.nn.Module): - - def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): - super(ResBlock1, self).__init__() - self.h = h - self.convs1 = nn.ModuleList([ - Conv1dCasual( - channels, - channels, - kernel_size, - 1, - dilation=dilation[i], - padding=get_padding_casual(kernel_size, dilation[i])) - for i in range(len(dilation)) - ]) - - self.convs2 = nn.ModuleList([ - Conv1dCasual( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding_casual(kernel_size, 1)) - for i in range(len(dilation)) - ]) - - def forward(self, x): - for c1, c2 in zip(self.convs1, self.convs2): - xt = F.leaky_relu(x, LRELU_SLOPE) - xt = c1(xt) - xt = F.leaky_relu(xt, LRELU_SLOPE) - xt = c2(xt) - x = xt + x - return x - - def remove_weight_norm(self): - for layer in self.convs1: - layer.remove_weight_norm() - for layer in self.convs2: - layer.remove_weight_norm() - - -class Generator(torch.nn.Module): - - def __init__(self, h): - super(Generator, self).__init__() - self.h = h - self.num_kernels = len(h.resblock_kernel_sizes) - self.num_upsamples = len(h.upsample_rates) - print('num_kernels={}, num_upsamples={}'.format( - self.num_kernels, self.num_upsamples)) - self.conv_pre = Conv1dCasual( - 80, h.upsample_initial_channel, 7, 1, padding=7 - 1) - resblock = ResBlock1 if h.resblock == '1' else ResBlock2 - - self.ups = nn.ModuleList() - self.repeat_ups = nn.ModuleList() - for i, (u, k) in enumerate( - zip(h.upsample_rates, h.upsample_kernel_sizes)): - upsample = nn.Sequential( - nn.Upsample(mode='nearest', scale_factor=u), - nn.LeakyReLU(LRELU_SLOPE), - Conv1dCasual( - h.upsample_initial_channel // (2**i), - h.upsample_initial_channel // (2**(i + 1)), - kernel_size=7, - stride=1, - padding=7 - 1)) - self.repeat_ups.append(upsample) - self.ups.append( - ConvTranspose1dCausal( - h.upsample_initial_channel // (2**i), - h.upsample_initial_channel // (2**(i + 1)), - k, - u, - padding=(k - u) // 2)) - - self.resblocks = nn.ModuleList() - for i in range(len(self.ups)): - ch = h.upsample_initial_channel // (2**(i + 1)) - for j, (k, d) in enumerate( - zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): - self.resblocks.append(resblock(h, ch, k, d)) - - self.conv_post = Conv1dCasual(ch, 1, 7, 1, padding=7 - 1) - - def forward(self, x): - x = self.conv_pre(x) - for i in range(self.num_upsamples): - x = torch.sin(x) + x - # transconv - x1 = F.leaky_relu(x, LRELU_SLOPE) - x1 = self.ups[i](x1) - # repeat - x2 = self.repeat_ups[i](x) - x = x1 + x2 - xs = None - for j in range(self.num_kernels): - if xs is None: - xs = self.resblocks[i * self.num_kernels + j](x) - else: - xs += self.resblocks[i * self.num_kernels + j](x) - x = xs / self.num_kernels - x = F.leaky_relu(x) - x = self.conv_post(x) - x = torch.tanh(x) - return x - - def remove_weight_norm(self): - print('Removing weight norm...') - for layer in self.ups: - layer.remove_weight_norm() - for layer in self.repeat_ups: - layer[-1].remove_weight_norm() - for layer in self.resblocks: - layer.remove_weight_norm() - self.conv_pre.remove_weight_norm() - self.conv_post.remove_weight_norm() - - -class DiscriminatorP(torch.nn.Module): - - def __init__(self, - period, - kernel_size=5, - stride=3, - use_spectral_norm=False): - super(DiscriminatorP, self).__init__() - self.period = period - norm_f = weight_norm if use_spectral_norm is False else 
spectral_norm - self.convs = nn.ModuleList([ - norm_f( - Conv2d( - 1, - 32, (kernel_size, 1), (stride, 1), - padding=(get_padding(5, 1), 0))), - norm_f( - Conv2d( - 32, - 128, (kernel_size, 1), (stride, 1), - padding=(get_padding(5, 1), 0))), - norm_f( - Conv2d( - 128, - 512, (kernel_size, 1), (stride, 1), - padding=(get_padding(5, 1), 0))), - norm_f( - Conv2d( - 512, - 1024, (kernel_size, 1), (stride, 1), - padding=(get_padding(5, 1), 0))), - norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), - ]) - self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) - - def forward(self, x): - fmap = [] - - # 1d to 2d - b, c, t = x.shape - if t % self.period != 0: # pad first - n_pad = self.period - (t % self.period) - x = F.pad(x, (0, n_pad), 'reflect') - t = t + n_pad - x = x.view(b, c, t // self.period, self.period) - - for layer in self.convs: - x = layer(x) - x = F.leaky_relu(x, LRELU_SLOPE) - fmap.append(x) - x = self.conv_post(x) - fmap.append(x) - x = torch.flatten(x, 1, -1) - - return x, fmap - - -class MultiPeriodDiscriminator(torch.nn.Module): - - def __init__(self): - super(MultiPeriodDiscriminator, self).__init__() - self.discriminators = nn.ModuleList([ - DiscriminatorP(2), - DiscriminatorP(3), - DiscriminatorP(5), - DiscriminatorP(7), - DiscriminatorP(11), - ]) - - def forward(self, y, y_hat): - y_d_rs = [] - y_d_gs = [] - fmap_rs = [] - fmap_gs = [] - for i, d in enumerate(self.discriminators): - y_d_r, fmap_r = d(y) - y_d_g, fmap_g = d(y_hat) - y_d_rs.append(y_d_r) - fmap_rs.append(fmap_r) - y_d_gs.append(y_d_g) - fmap_gs.append(fmap_g) - - return y_d_rs, y_d_gs, fmap_rs, fmap_gs - - -class DiscriminatorS(torch.nn.Module): - - def __init__(self, use_spectral_norm=False): - super(DiscriminatorS, self).__init__() - norm_f = weight_norm if use_spectral_norm is False else spectral_norm - self.convs = nn.ModuleList([ - norm_f(Conv1d(1, 128, 15, 1, padding=7)), - norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), - norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)), - norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)), - norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), - norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), - norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), - ]) - self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) - - def forward(self, x): - fmap = [] - for layer in self.convs: - x = layer(x) - x = F.leaky_relu(x, LRELU_SLOPE) - fmap.append(x) - x = self.conv_post(x) - fmap.append(x) - x = torch.flatten(x, 1, -1) - - return x, fmap - - -class MultiScaleDiscriminator(torch.nn.Module): - - def __init__(self): - super(MultiScaleDiscriminator, self).__init__() - self.discriminators = nn.ModuleList([ - DiscriminatorS(use_spectral_norm=True), - DiscriminatorS(), - DiscriminatorS(), - ]) - from pytorch_wavelets import DWT1DForward - self.meanpools = nn.ModuleList( - [DWT1DForward(wave='db3', J=1), - DWT1DForward(wave='db3', J=1)]) - self.convs = nn.ModuleList([ - weight_norm(Conv1d(2, 1, 15, 1, padding=7)), - weight_norm(Conv1d(2, 1, 15, 1, padding=7)) - ]) - - def forward(self, y, y_hat): - y_d_rs = [] - y_d_gs = [] - fmap_rs = [] - fmap_gs = [] - for i, d in enumerate(self.discriminators): - if i != 0: - yl, yh = self.meanpools[i - 1](y) - y = torch.cat([yl, yh[0]], dim=1) - y = self.convs[i - 1](y) - y = F.leaky_relu(y, LRELU_SLOPE) - - yl_hat, yh_hat = self.meanpools[i - 1](y_hat) - y_hat = torch.cat([yl_hat, yh_hat[0]], dim=1) - y_hat = self.convs[i - 1](y_hat) - y_hat = F.leaky_relu(y_hat, LRELU_SLOPE) - - y_d_r, fmap_r = 
d(y) - y_d_g, fmap_g = d(y_hat) - y_d_rs.append(y_d_r) - fmap_rs.append(fmap_r) - y_d_gs.append(y_d_g) - fmap_gs.append(fmap_g) - - return y_d_rs, y_d_gs, fmap_rs, fmap_gs - - -class DiscriminatorSTFT(torch.nn.Module): - - def __init__(self, - kernel_size=11, - stride=2, - use_spectral_norm=False, - fft_size=1024, - shift_size=120, - win_length=600, - window='hann_window'): - super(DiscriminatorSTFT, self).__init__() - self.fft_size = fft_size - self.shift_size = shift_size - self.win_length = win_length - norm_f = weight_norm if use_spectral_norm is False else spectral_norm - self.convs = nn.ModuleList([ - norm_f( - Conv2d( - fft_size // 2 + 1, - 32, (15, 1), (1, 1), - padding=(get_padding(15, 1), 0))), - norm_f( - Conv2d( - 32, - 32, (kernel_size, 1), (stride, 1), - padding=(get_padding(9, 1), 0))), - norm_f( - Conv2d( - 32, - 32, (kernel_size, 1), (stride, 1), - padding=(get_padding(9, 1), 0))), - norm_f( - Conv2d( - 32, - 32, (kernel_size, 1), (stride, 1), - padding=(get_padding(9, 1), 0))), - norm_f(Conv2d(32, 32, (5, 1), (1, 1), padding=(2, 0))), - ]) - self.conv_post = norm_f(Conv2d(32, 1, (3, 1), (1, 1), padding=(1, 0))) - self.register_buffer('window', getattr(torch, window)(win_length)) - - def forward(self, wav): - wav = torch.squeeze(wav, 1) - x_mag = stft(wav, self.fft_size, self.shift_size, self.win_length, - self.window) - x = torch.transpose(x_mag, 2, 1).unsqueeze(-1) - fmap = [] - for layer in self.convs: - x = layer(x) - x = F.leaky_relu(x, LRELU_SLOPE) - fmap.append(x) - x = self.conv_post(x) - fmap.append(x) - x = x.squeeze(-1) - - return x, fmap - - -class MultiSTFTDiscriminator(torch.nn.Module): - - def __init__( - self, - fft_sizes=[1024, 2048, 512], - hop_sizes=[120, 240, 50], - win_lengths=[600, 1200, 240], - window='hann_window', - ): - super(MultiSTFTDiscriminator, self).__init__() - self.discriminators = nn.ModuleList() - for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): - self.discriminators += [ - DiscriminatorSTFT(fft_size=fs, shift_size=ss, win_length=wl) - ] - - def forward(self, y, y_hat): - y_d_rs = [] - y_d_gs = [] - fmap_rs = [] - fmap_gs = [] - for i, d in enumerate(self.discriminators): - y_d_r, fmap_r = d(y) - y_d_g, fmap_g = d(y_hat) - y_d_rs.append(y_d_r) - fmap_rs.append(fmap_r) - y_d_gs.append(y_d_g) - fmap_gs.append(fmap_g) - - return y_d_rs, y_d_gs, fmap_rs, fmap_gs - - -def feature_loss(fmap_r, fmap_g): - loss = 0 - for dr, dg in zip(fmap_r, fmap_g): - for rl, gl in zip(dr, dg): - loss += torch.mean(torch.abs(rl - gl)) - - return loss * 2 - - -def discriminator_loss(disc_real_outputs, disc_generated_outputs): - loss = 0 - r_losses = [] - g_losses = [] - for dr, dg in zip(disc_real_outputs, disc_generated_outputs): - r_loss = torch.mean((1 - dr)**2) - g_loss = torch.mean(dg**2) - loss += (r_loss + g_loss) - r_losses.append(r_loss.item()) - g_losses.append(g_loss.item()) - - return loss, r_losses, g_losses - - -def generator_loss(disc_outputs): - loss = 0 - gen_losses = [] - for dg in disc_outputs: - temp_loss = torch.mean((1 - dg)**2) - gen_losses.append(temp_loss) - loss += temp_loss - - return loss, gen_losses diff --git a/modelscope/models/audio/tts/sambert_hifi.py b/modelscope/models/audio/tts/sambert_hifi.py index 79f8068e..a9b55795 100644 --- a/modelscope/models/audio/tts/sambert_hifi.py +++ b/modelscope/models/audio/tts/sambert_hifi.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
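The training losses dropped with vocoder_models.py above follow the least-squares GAN formulation: the discriminator pushes real scores toward 1 and fake scores toward 0, while the generator pushes its fake scores toward 1. A stand-alone restatement for a single discriminator output, included only to make the removed code easier to follow (not an API of this repository):

import torch

def d_loss(d_real, d_fake):
    # mean((1 - D(real))^2) + mean(D(fake)^2), as in the deleted discriminator_loss
    return torch.mean((1 - d_real) ** 2) + torch.mean(d_fake ** 2)

def g_loss(d_fake):
    # mean((1 - D(fake))^2), as in the deleted generator_loss
    return torch.mean((1 - d_fake) ** 2)

d_real = torch.full((4,), 0.9)   # discriminator scores on real audio
d_fake = torch.full((4,), 0.1)   # discriminator scores on generated audio
print(d_loss(d_real, d_fake))    # ~0.02
print(g_loss(d_fake))            # ~0.81

The deleted feature_loss adds an L1 term over the discriminators' intermediate feature maps on top of these adversarial terms.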
+ from __future__ import (absolute_import, division, print_function, unicode_literals) import os @@ -11,13 +13,11 @@ from modelscope.models.base import Model from modelscope.models.builder import MODELS from modelscope.utils.audio.tts_exceptions import ( TtsFrontendInitializeFailedException, - TtsFrontendLanguageTypeInvalidException, TtsModelConfigurationExcetion, + TtsFrontendLanguageTypeInvalidException, TtsModelConfigurationException, TtsVoiceNotExistsException) from modelscope.utils.constant import Tasks from .voice import Voice -import tensorflow as tf # isort:skip - __all__ = ['SambertHifigan'] @@ -28,14 +28,15 @@ class SambertHifigan(Model): def __init__(self, model_dir, *args, **kwargs): super().__init__(model_dir, *args, **kwargs) if 'am' not in kwargs: - raise TtsModelConfigurationExcetion( - 'configuration model field missing am!') + raise TtsModelConfigurationException( + 'modelscope error: configuration model field missing am!') if 'vocoder' not in kwargs: - raise TtsModelConfigurationExcetion( - 'configuration model field missing vocoder!') + raise TtsModelConfigurationException( + 'modelscope error: configuration model field missing vocoder!') if 'lang_type' not in kwargs: - raise TtsModelConfigurationExcetion( - 'configuration model field missing lang_type!') + raise TtsModelConfigurationException( + 'modelscope error: configuration model field missing lang_type!' + ) am_cfg = kwargs['am'] voc_cfg = kwargs['vocoder'] # initialize frontend @@ -47,10 +48,12 @@ class SambertHifigan(Model): zip_ref.extractall(model_dir) if not frontend.initialize(self.__res_path): raise TtsFrontendInitializeFailedException( - 'resource invalid: {}'.format(self.__res_path)) + 'modelscope error: resource invalid: {}'.format( + self.__res_path)) if not frontend.set_lang_type(kwargs['lang_type']): raise TtsFrontendLanguageTypeInvalidException( - 'language type invalid: {}'.format(kwargs['lang_type'])) + 'modelscope error: language type invalid: {}'.format( + kwargs['lang_type'])) self.__frontend = frontend zip_file = os.path.join(model_dir, 'voices.zip') self.__voice_path = os.path.join(model_dir, 'voices') @@ -60,7 +63,8 @@ class SambertHifigan(Model): with open(voice_cfg_path, 'r') as f: voice_cfg = json.load(f) if 'voices' not in voice_cfg: - raise TtsModelConfigurationExcetion('voices invalid') + raise TtsModelConfigurationException( + 'modelscope error: voices invalid') self.__voice = {} for name in voice_cfg['voices']: voice_path = os.path.join(self.__voice_path, name) @@ -70,11 +74,13 @@ class SambertHifigan(Model): if voice_cfg['voices']: self.__default_voice_name = voice_cfg['voices'][0] else: - raise TtsVoiceNotExistsException('voices is empty in voices.json') + raise TtsVoiceNotExistsException( + 'modelscope error: voices is empty in voices.json') def __synthesis_one_sentences(self, voice_name, text): if voice_name not in self.__voice: - raise TtsVoiceNotExistsException(f'Voice {voice_name} not exists') + raise TtsVoiceNotExistsException( + f'modelscope error: Voice {voice_name} not exists') return self.__voice[voice_name].forward(text) def forward(self, text: str, voice_name: str = None): diff --git a/modelscope/models/audio/tts/text/cleaners.py b/modelscope/models/audio/tts/text/cleaners.py deleted file mode 100755 index 19d838d1..00000000 --- a/modelscope/models/audio/tts/text/cleaners.py +++ /dev/null @@ -1,89 +0,0 @@ -''' -Cleaners are transformations that run over the input text at both training and eval time. 
- -Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" -hyperparameter. Some cleaners are English-specific. You'll typically want to use: - 1. "english_cleaners" for English text - 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using - the Unidecode library (https://pypi.python.org/pypi/Unidecode) - 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update - the symbols in symbols.py to match your data). -''' - -import re - -from unidecode import unidecode - -from .numbers import normalize_numbers - -# Regular expression matching whitespace: -_whitespace_re = re.compile(r'\s+') - -# List of (regular expression, replacement) pairs for abbreviations: -_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) - for x in [ - ('mrs', 'misess'), - ('mr', 'mister'), - ('dr', 'doctor'), - ('st', 'saint'), - ('co', 'company'), - ('jr', 'junior'), - ('maj', 'major'), - ('gen', 'general'), - ('drs', 'doctors'), - ('rev', 'reverend'), - ('lt', 'lieutenant'), - ('hon', 'honorable'), - ('sgt', 'sergeant'), - ('capt', 'captain'), - ('esq', 'esquire'), - ('ltd', 'limited'), - ('col', 'colonel'), - ('ft', 'fort'), ]] # yapf:disable - - -def expand_abbreviations(text): - for regex, replacement in _abbreviations: - text = re.sub(regex, replacement, text) - return text - - -def expand_numbers(text): - return normalize_numbers(text) - - -def lowercase(text): - return text.lower() - - -def collapse_whitespace(text): - return re.sub(_whitespace_re, ' ', text) - - -def convert_to_ascii(text): - return unidecode(text) - - -def basic_cleaners(text): - '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' - text = lowercase(text) - text = collapse_whitespace(text) - return text - - -def transliteration_cleaners(text): - '''Pipeline for non-English text that transliterates to ASCII.''' - text = convert_to_ascii(text) - text = lowercase(text) - text = collapse_whitespace(text) - return text - - -def english_cleaners(text): - '''Pipeline for English text, including number and abbreviation expansion.''' - text = convert_to_ascii(text) - text = lowercase(text) - text = expand_numbers(text) - text = expand_abbreviations(text) - text = collapse_whitespace(text) - return text diff --git a/modelscope/models/audio/tts/text/cmudict.py b/modelscope/models/audio/tts/text/cmudict.py deleted file mode 100755 index b4da4be9..00000000 --- a/modelscope/models/audio/tts/text/cmudict.py +++ /dev/null @@ -1,64 +0,0 @@ -import re - -valid_symbols = [ - 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', - 'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', - 'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', - 'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', - 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', - 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', - 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', - 'Y', 'Z', 'ZH' -] - -_valid_symbol_set = set(valid_symbols) - - -class CMUDict: - '''Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' - - def __init__(self, file_or_path, keep_ambiguous=True): - if isinstance(file_or_path, str): - with open(file_or_path, encoding='latin-1') as f: - entries = _parse_cmudict(f) - else: - entries = _parse_cmudict(file_or_path) - if not keep_ambiguous: - entries = { - word: pron - for word, pron in entries.items() if len(pron) == 1 - } - self._entries = entries - - def __len__(self): - return len(self._entries) - - def lookup(self, word): - '''Returns list of ARPAbet pronunciations of the given word.''' - return self._entries.get(word.upper()) - - -_alt_re = re.compile(r'\([0-9]+\)') - - -def _parse_cmudict(file): - cmudict = {} - for line in file: - if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): - parts = line.split(' ') - word = re.sub(_alt_re, '', parts[0]) - pronunciation = _get_pronunciation(parts[1]) - if pronunciation: - if word in cmudict: - cmudict[word].append(pronunciation) - else: - cmudict[word] = [pronunciation] - return cmudict - - -def _get_pronunciation(s): - parts = s.strip().split(' ') - for part in parts: - if part not in _valid_symbol_set: - return None - return ' '.join(parts) diff --git a/modelscope/models/audio/tts/text/symbols.py b/modelscope/models/audio/tts/text/symbols.py deleted file mode 100644 index 63975abb..00000000 --- a/modelscope/models/audio/tts/text/symbols.py +++ /dev/null @@ -1,105 +0,0 @@ -''' -Defines the set of symbols used in text input to the model. - -The default is a set of ASCII characters that works well for English or text that has been run -through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. -''' -import codecs -import os - -_pad = '_' -_eos = '~' -_mask = '@[MASK]' - - -def load_symbols(dict_path, has_mask=True): - _characters = '' - _ch_symbols = [] - sy_dict_name = 'sy_dict.txt' - sy_dict_path = os.path.join(dict_path, sy_dict_name) - f = codecs.open(sy_dict_path, 'r') - for line in f: - line = line.strip('\r\n') - _ch_symbols.append(line) - - _arpabet = ['@' + s for s in _ch_symbols] - - # Export all symbols: - sy = list(_characters) + _arpabet + [_pad, _eos] - if has_mask: - sy.append(_mask) - - _characters = '' - - _ch_tones = [] - tone_dict_name = 'tone_dict.txt' - tone_dict_path = os.path.join(dict_path, tone_dict_name) - f = codecs.open(tone_dict_path, 'r') - for line in f: - line = line.strip('\r\n') - _ch_tones.append(line) - - # Export all tones: - tone = list(_characters) + _ch_tones + [_pad, _eos] - if has_mask: - tone.append(_mask) - - _characters = '' - - _ch_syllable_flags = [] - syllable_flag_name = 'syllable_flag_dict.txt' - syllable_flag_path = os.path.join(dict_path, syllable_flag_name) - f = codecs.open(syllable_flag_path, 'r') - for line in f: - line = line.strip('\r\n') - _ch_syllable_flags.append(line) - - # Export all syllable_flags: - syllable_flag = list(_characters) + _ch_syllable_flags + [_pad, _eos] - if has_mask: - syllable_flag.append(_mask) - - _characters = '' - - _ch_word_segments = [] - word_segment_name = 'word_segment_dict.txt' - word_segment_path = os.path.join(dict_path, word_segment_name) - f = codecs.open(word_segment_path, 'r') - for line in f: - line = line.strip('\r\n') - _ch_word_segments.append(line) - - # Export all syllable_flags: - word_segment = list(_characters) + _ch_word_segments + [_pad, _eos] - if has_mask: - word_segment.append(_mask) - - _characters = '' - - _ch_emo_types = [] - emo_category_name = 'emo_category_dict.txt' - emo_category_path = os.path.join(dict_path, 
emo_category_name) - f = codecs.open(emo_category_path, 'r') - for line in f: - line = line.strip('\r\n') - _ch_emo_types.append(line) - - emo_category = list(_characters) + _ch_emo_types + [_pad, _eos] - if has_mask: - emo_category.append(_mask) - - _characters = '' - - _ch_speakers = [] - speaker_name = 'speaker_dict.txt' - speaker_path = os.path.join(dict_path, speaker_name) - f = codecs.open(speaker_path, 'r') - for line in f: - line = line.strip('\r\n') - _ch_speakers.append(line) - - # Export all syllable_flags: - speaker = list(_characters) + _ch_speakers + [_pad, _eos] - if has_mask: - speaker.append(_mask) - return sy, tone, syllable_flag, word_segment, emo_category, speaker diff --git a/modelscope/models/audio/tts/text/symbols_dict.py b/modelscope/models/audio/tts/text/symbols_dict.py deleted file mode 100644 index e8f7ed19..00000000 --- a/modelscope/models/audio/tts/text/symbols_dict.py +++ /dev/null @@ -1,200 +0,0 @@ -import re -import sys - -from .cleaners import (basic_cleaners, english_cleaners, - transliteration_cleaners) - - -class SymbolsDict: - - def __init__(self, sy, tone, syllable_flag, word_segment, emo_category, - speaker, inputs_dim, lfeat_type_list): - self._inputs_dim = inputs_dim - self._lfeat_type_list = lfeat_type_list - self._sy_to_id = {s: i for i, s in enumerate(sy)} - self._id_to_sy = {i: s for i, s in enumerate(sy)} - self._tone_to_id = {s: i for i, s in enumerate(tone)} - self._id_to_tone = {i: s for i, s in enumerate(tone)} - self._syllable_flag_to_id = {s: i for i, s in enumerate(syllable_flag)} - self._id_to_syllable_flag = {i: s for i, s in enumerate(syllable_flag)} - self._word_segment_to_id = {s: i for i, s in enumerate(word_segment)} - self._id_to_word_segment = {i: s for i, s in enumerate(word_segment)} - self._emo_category_to_id = {s: i for i, s in enumerate(emo_category)} - self._id_to_emo_category = {i: s for i, s in enumerate(emo_category)} - self._speaker_to_id = {s: i for i, s in enumerate(speaker)} - self._id_to_speaker = {i: s for i, s in enumerate(speaker)} - print('_sy_to_id: ') - print(self._sy_to_id) - print('_tone_to_id: ') - print(self._tone_to_id) - print('_syllable_flag_to_id: ') - print(self._syllable_flag_to_id) - print('_word_segment_to_id: ') - print(self._word_segment_to_id) - print('_emo_category_to_id: ') - print(self._emo_category_to_id) - print('_speaker_to_id: ') - print(self._speaker_to_id) - self._curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') - self._cleaners = { - basic_cleaners.__name__: basic_cleaners, - transliteration_cleaners.__name__: transliteration_cleaners, - english_cleaners.__name__: english_cleaners - } - - def _clean_text(self, text, cleaner_names): - for name in cleaner_names: - cleaner = self._cleaners.get(name) - if not cleaner: - raise Exception('Unknown cleaner: %s' % name) - text = cleaner(text) - return text - - def _sy_to_sequence(self, sy): - return [self._sy_to_id[s] for s in sy if self._should_keep_sy(s)] - - def _arpabet_to_sequence(self, text): - return self._sy_to_sequence(['@' + s for s in text.split()]) - - def _should_keep_sy(self, s): - return s in self._sy_to_id and s != '_' and s != '~' - - def symbol_to_sequence(self, this_lfeat_symbol, lfeat_type, cleaner_names): - sequence = [] - if lfeat_type == 'sy': - this_lfeat_symbol = this_lfeat_symbol.strip().split(' ') - this_lfeat_symbol_format = '' - index = 0 - while index < len(this_lfeat_symbol): - this_lfeat_symbol_format = this_lfeat_symbol_format + '{' + this_lfeat_symbol[ - index] + '}' + ' ' - index = index + 1 - sequence = 
self.text_to_sequence(this_lfeat_symbol_format, - cleaner_names) - elif lfeat_type == 'tone': - sequence = self.tone_to_sequence(this_lfeat_symbol) - elif lfeat_type == 'syllable_flag': - sequence = self.syllable_flag_to_sequence(this_lfeat_symbol) - elif lfeat_type == 'word_segment': - sequence = self.word_segment_to_sequence(this_lfeat_symbol) - elif lfeat_type == 'emo_category': - sequence = self.emo_category_to_sequence(this_lfeat_symbol) - elif lfeat_type == 'speaker': - sequence = self.speaker_to_sequence(this_lfeat_symbol) - else: - raise Exception('Unknown lfeat type: %s' % lfeat_type) - - return sequence - - def text_to_sequence(self, text, cleaner_names): - '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. - - The text can optionally have ARPAbet sequences enclosed in curly braces embedded - in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." - - Args: - text: string to convert to a sequence - cleaner_names: names of the cleaner functions to run the text through - - Returns: - List of integers corresponding to the symbols in the text - ''' - sequence = [] - - # Check for curly braces and treat their contents as ARPAbet: - while len(text): - m = self._curly_re.match(text) - if not m: - sequence += self._sy_to_sequence( - self._clean_text(text, cleaner_names)) - break - sequence += self._sy_to_sequence( - self._clean_text(m.group(1), cleaner_names)) - sequence += self._arpabet_to_sequence(m.group(2)) - text = m.group(3) - - # Append EOS token - sequence.append(self._sy_to_id['~']) - return sequence - - def tone_to_sequence(self, tone): - tones = tone.strip().split(' ') - sequence = [] - for this_tone in tones: - sequence.append(self._tone_to_id[this_tone]) - sequence.append(self._tone_to_id['~']) - return sequence - - def syllable_flag_to_sequence(self, syllable_flag): - syllable_flags = syllable_flag.strip().split(' ') - sequence = [] - for this_syllable_flag in syllable_flags: - sequence.append(self._syllable_flag_to_id[this_syllable_flag]) - sequence.append(self._syllable_flag_to_id['~']) - return sequence - - def word_segment_to_sequence(self, word_segment): - word_segments = word_segment.strip().split(' ') - sequence = [] - for this_word_segment in word_segments: - sequence.append(self._word_segment_to_id[this_word_segment]) - sequence.append(self._word_segment_to_id['~']) - return sequence - - def emo_category_to_sequence(self, emo_type): - emo_categories = emo_type.strip().split(' ') - sequence = [] - for this_category in emo_categories: - sequence.append(self._emo_category_to_id[this_category]) - sequence.append(self._emo_category_to_id['~']) - return sequence - - def speaker_to_sequence(self, speaker): - speakers = speaker.strip().split(' ') - sequence = [] - for this_speaker in speakers: - sequence.append(self._speaker_to_id[this_speaker]) - sequence.append(self._speaker_to_id['~']) - return sequence - - def sequence_to_symbol(self, sequence): - result = '' - pre_lfeat_dim = 0 - for lfeat_type in self._lfeat_type_list: - current_one_hot_sequence = sequence[:, pre_lfeat_dim:pre_lfeat_dim - + self._inputs_dim[lfeat_type]] - current_sequence = current_one_hot_sequence.argmax(1) - length = current_sequence.shape[0] - - index = 0 - while index < length: - this_sequence = current_sequence[index] - s = '' - if lfeat_type == 'sy': - s = self._id_to_sy[this_sequence] - if len(s) > 1 and s[0] == '@': - s = s[1:] - elif lfeat_type == 'tone': - s = self._id_to_tone[this_sequence] - elif lfeat_type == 'syllable_flag': - s = 
self._id_to_syllable_flag[this_sequence] - elif lfeat_type == 'word_segment': - s = self._id_to_word_segment[this_sequence] - elif lfeat_type == 'emo_category': - s = self._id_to_emo_category[this_sequence] - elif lfeat_type == 'speaker': - s = self._id_to_speaker[this_sequence] - else: - raise Exception('Unknown lfeat type: %s' % lfeat_type) - - if index == 0: - result = result + lfeat_type + ': ' - - result = result + '{' + s + '}' - - if index == length - 1: - result = result + '; ' - - index = index + 1 - pre_lfeat_dim = pre_lfeat_dim + self._inputs_dim[lfeat_type] - return result diff --git a/modelscope/models/audio/tts/voice.py b/modelscope/models/audio/tts/voice.py index deaebf11..dc830db5 100644 --- a/modelscope/models/audio/tts/voice.py +++ b/modelscope/models/audio/tts/voice.py @@ -1,286 +1,111 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os +import pickle as pkl import json import numpy as np import torch -from sklearn.preprocessing import MultiLabelBinarizer +from modelscope.utils.audio.tts_exceptions import \ + TtsModelConfigurationException from modelscope.utils.constant import ModelFile, Tasks -from .models import Generator, create_am_model -from .text.symbols import load_symbols -from .text.symbols_dict import SymbolsDict - -import tensorflow as tf # isort:skip +from .models.datasets.units import KanTtsLinguisticUnit +from .models.models.hifigan import Generator +from .models.models.sambert import KanTtsSAMBERT +from .models.utils import (AttrDict, build_env, init_weights, load_checkpoint, + plot_spectrogram, save_checkpoint, scan_checkpoint) MAX_WAV_VALUE = 32768.0 -def multi_label_symbol_to_sequence(my_classes, my_symbol): - one_hot = MultiLabelBinarizer(classes=my_classes) - tokens = my_symbol.strip().split(' ') - sequences = [] - for token in tokens: - sequences.append(tuple(token.split('&'))) - return one_hot.fit_transform(sequences) - - -def load_checkpoint(filepath, device): - assert os.path.isfile(filepath) - checkpoint_dict = torch.load(filepath, map_location=device) - return checkpoint_dict - - -class AttrDict(dict): - - def __init__(self, *args, **kwargs): - super(AttrDict, self).__init__(*args, **kwargs) - self.__dict__ = self - - class Voice: - def __init__(self, voice_name, voice_path, am_hparams, voc_config): + def __init__(self, voice_name, voice_path, am_config, voc_config): self.__voice_name = voice_name self.__voice_path = voice_path - self.__am_hparams = tf.contrib.training.HParams(**am_hparams) + self.__am_config = AttrDict(**am_config) self.__voc_config = AttrDict(**voc_config) self.__model_loaded = False + if 'am' not in self.__am_config: + raise TtsModelConfigurationException( + 'modelscope error: am configuration invalid') + if 'linguistic_unit' not in self.__am_config: + raise TtsModelConfigurationException( + 'modelscope error: am configuration invalid') + self.__am_lingustic_unit_config = self.__am_config['linguistic_unit'] def __load_am(self): - local_am_ckpt_path = os.path.join(self.__voice_path, - ModelFile.TF_CHECKPOINT_FOLDER) - self.__am_ckpt_path = os.path.join(local_am_ckpt_path, 'ckpt') - self.__dict_path = os.path.join(self.__voice_path, 'dicts') + local_am_ckpt_path = os.path.join(self.__voice_path, 'am') + self.__am_ckpt_path = os.path.join(local_am_ckpt_path, + ModelFile.TORCH_MODEL_BIN_FILE) has_mask = True - if self.__am_hparams.get('has_mask') is not None: - has_mask = self.__am_hparams.has_mask - model_name = 'robutrans' - self.__lfeat_type_list = self.__am_hparams.lfeat_type_list.strip( - ).split(',') - sy, 
tone, syllable_flag, word_segment, emo_category, speaker = load_symbols( - self.__dict_path, has_mask) - self.__sy = sy - self.__tone = tone - self.__syllable_flag = syllable_flag - self.__word_segment = word_segment - self.__emo_category = emo_category - self.__speaker = speaker - self.__inputs_dim = dict() - for lfeat_type in self.__lfeat_type_list: - if lfeat_type == 'sy': - self.__inputs_dim[lfeat_type] = len(sy) - elif lfeat_type == 'tone': - self.__inputs_dim[lfeat_type] = len(tone) - elif lfeat_type == 'syllable_flag': - self.__inputs_dim[lfeat_type] = len(syllable_flag) - elif lfeat_type == 'word_segment': - self.__inputs_dim[lfeat_type] = len(word_segment) - elif lfeat_type == 'emo_category': - self.__inputs_dim[lfeat_type] = len(emo_category) - elif lfeat_type == 'speaker': - self.__inputs_dim[lfeat_type] = len(speaker) - - self.__symbols_dict = SymbolsDict(sy, tone, syllable_flag, - word_segment, emo_category, speaker, - self.__inputs_dim, - self.__lfeat_type_list) - dim_inputs = sum(self.__inputs_dim.values( - )) - self.__inputs_dim['speaker'] - self.__inputs_dim['emo_category'] - self.__graph = tf.Graph() - with self.__graph.as_default(): - inputs = tf.placeholder(tf.float32, [1, None, dim_inputs], - 'inputs') - inputs_emotion = tf.placeholder( - tf.float32, [1, None, self.__inputs_dim['emo_category']], - 'inputs_emotion') - inputs_speaker = tf.placeholder( - tf.float32, [1, None, self.__inputs_dim['speaker']], - 'inputs_speaker') - input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths') - pitch_contours_scale = tf.placeholder(tf.float32, [1, None], - 'pitch_contours_scale') - energy_contours_scale = tf.placeholder(tf.float32, [1, None], - 'energy_contours_scale') - duration_scale = tf.placeholder(tf.float32, [1, None], - 'duration_scale') - with tf.variable_scope('model') as _: - self.__model = create_am_model(model_name, self.__am_hparams) - self.__model.initialize( - inputs, - inputs_emotion, - inputs_speaker, - input_lengths, - duration_scales=duration_scale, - pitch_scales=pitch_contours_scale, - energy_scales=energy_contours_scale) - self.__mel_spec = self.__model.mel_outputs[0] - self.__duration_outputs = self.__model.duration_outputs[0] - self.__duration_outputs_ = self.__model.duration_outputs_[0] - self.__pitch_contour_outputs = self.__model.pitch_contour_outputs[ - 0] - self.__energy_contour_outputs = self.__model.energy_contour_outputs[ - 0] - self.__embedded_inputs_emotion = self.__model.embedded_inputs_emotion[ - 0] - self.__embedding_fsmn_outputs = self.__model.embedding_fsmn_outputs[ - 0] - self.__encoder_outputs = self.__model.encoder_outputs[0] - self.__pitch_embeddings = self.__model.pitch_embeddings[0] - self.__energy_embeddings = self.__model.energy_embeddings[0] - self.__LR_outputs = self.__model.LR_outputs[0] - self.__postnet_fsmn_outputs = self.__model.postnet_fsmn_outputs[ - 0] - self.__attention_h = self.__model.attention_h - self.__attention_x = self.__model.attention_x - - config = tf.ConfigProto() - config.gpu_options.allow_growth = True - self.__session = tf.Session(config=config) - self.__session.run(tf.global_variables_initializer()) - - saver = tf.train.Saver() - saver.restore(self.__session, self.__am_ckpt_path) + if 'has_mask' in self.__am_lingustic_unit_config: + has_mask = self.__am_lingustic_unit_config.has_mask + self.__ling_unit = KanTtsLinguisticUnit( + self.__am_lingustic_unit_config, self.__voice_path, has_mask) + self.__am_net = KanTtsSAMBERT(self.__am_config, + self.__ling_unit.get_unit_size()).to( + self.__device) + 
state_dict_g = {} + try: + state_dict_g = load_checkpoint(self.__am_ckpt_path, self.__device) + except RuntimeError: + with open(self.__am_ckpt_path, 'rb') as f: + pth_var_dict = pkl.load(f) + state_dict_g['fsnet'] = { + k: torch.FloatTensor(v) + for k, v in pth_var_dict['fsnet'].items() + } + self.__am_net.load_state_dict(state_dict_g['fsnet'], strict=False) + self.__am_net.eval() def __load_vocoder(self): - self.__voc_ckpt_path = os.path.join(self.__voice_path, + local_voc_ckpy_path = os.path.join(self.__voice_path, 'vocoder') + self.__voc_ckpt_path = os.path.join(local_voc_ckpy_path, ModelFile.TORCH_MODEL_BIN_FILE) - if torch.cuda.is_available(): - torch.manual_seed(self.__voc_config.seed) - self.__device = torch.device('cuda') - else: - self.__device = torch.device('cpu') self.__generator = Generator(self.__voc_config).to(self.__device) state_dict_g = load_checkpoint(self.__voc_ckpt_path, self.__device) self.__generator.load_state_dict(state_dict_g['generator']) self.__generator.eval() self.__generator.remove_weight_norm() - def __am_forward(self, - text, - pitch_control_str='', - duration_control_str='', - energy_control_str=''): - duration_cfg_lst = [] - if len(duration_control_str) != 0: - for item in duration_control_str.strip().split('|'): - percent, scale = item.lstrip('(').rstrip(')').split(',') - duration_cfg_lst.append((float(percent), float(scale))) - pitch_contours_cfg_lst = [] - if len(pitch_control_str) != 0: - for item in pitch_control_str.strip().split('|'): - percent, scale = item.lstrip('(').rstrip(')').split(',') - pitch_contours_cfg_lst.append((float(percent), float(scale))) - energy_contours_cfg_lst = [] - if len(energy_control_str) != 0: - for item in energy_control_str.strip().split('|'): - percent, scale = item.lstrip('(').rstrip(')').split(',') - energy_contours_cfg_lst.append((float(percent), float(scale))) - cleaner_names = [ - x.strip() for x in self.__am_hparams.cleaners.split(',') - ] - - lfeat_symbol = text.strip().split(' ') - lfeat_symbol_separate = [''] * int(len(self.__lfeat_type_list)) - for this_lfeat_symbol in lfeat_symbol: - this_lfeat_symbol = this_lfeat_symbol.strip('{').strip('}').split( - '$') - if len(this_lfeat_symbol) != len(self.__lfeat_type_list): - raise Exception( - 'Length of this_lfeat_symbol in training data' - + ' is not equal to the length of lfeat_type_list, ' - + str(len(this_lfeat_symbol)) + ' VS. 
' - + str(len(self.__lfeat_type_list))) - index = 0 - while index < len(lfeat_symbol_separate): - lfeat_symbol_separate[index] = lfeat_symbol_separate[ - index] + this_lfeat_symbol[index] + ' ' - index = index + 1 - - index = 0 - lfeat_type = self.__lfeat_type_list[index] - sequence = self.__symbols_dict.symbol_to_sequence( - lfeat_symbol_separate[index].strip(), lfeat_type, cleaner_names) - sequence_array = np.asarray( - sequence[:-1], - dtype=np.int32) # sequence length minus 1 to ignore EOS ~ - inputs = np.eye( - self.__inputs_dim[lfeat_type], dtype=np.float32)[sequence_array] - index = index + 1 - while index < len(self.__lfeat_type_list) - 2: - lfeat_type = self.__lfeat_type_list[index] - sequence = self.__symbols_dict.symbol_to_sequence( - lfeat_symbol_separate[index].strip(), lfeat_type, - cleaner_names) - sequence_array = np.asarray( - sequence[:-1], - dtype=np.int32) # sequence length minus 1 to ignore EOS ~ - inputs_temp = np.eye( - self.__inputs_dim[lfeat_type], - dtype=np.float32)[sequence_array] - inputs = np.concatenate((inputs, inputs_temp), axis=1) - index = index + 1 - seq = inputs - - lfeat_type = 'emo_category' - inputs_emotion = multi_label_symbol_to_sequence( - self.__emo_category, lfeat_symbol_separate[index].strip()) - # inputs_emotion = inputs_emotion * 1.5 - index = index + 1 - - lfeat_type = 'speaker' - inputs_speaker = multi_label_symbol_to_sequence( - self.__speaker, lfeat_symbol_separate[index].strip()) - - duration_scale = np.ones((len(seq), ), dtype=np.float32) - start_idx = 0 - for (percent, scale) in duration_cfg_lst: - duration_scale[start_idx:start_idx - + int(percent * len(seq))] = scale - start_idx += int(percent * len(seq)) - - pitch_contours_scale = np.ones((len(seq), ), dtype=np.float32) - start_idx = 0 - for (percent, scale) in pitch_contours_cfg_lst: - pitch_contours_scale[start_idx:start_idx - + int(percent * len(seq))] = scale - start_idx += int(percent * len(seq)) - - energy_contours_scale = np.ones((len(seq), ), dtype=np.float32) - start_idx = 0 - for (percent, scale) in energy_contours_cfg_lst: - energy_contours_scale[start_idx:start_idx - + int(percent * len(seq))] = scale - start_idx += int(percent * len(seq)) - - feed_dict = { - self.__model.inputs: [np.asarray(seq, dtype=np.float32)], - self.__model.inputs_emotion: - [np.asarray(inputs_emotion, dtype=np.float32)], - self.__model.inputs_speaker: - [np.asarray(inputs_speaker, dtype=np.float32)], - self.__model.input_lengths: - np.asarray([len(seq)], dtype=np.int32), - self.__model.duration_scales: [duration_scale], - self.__model.pitch_scales: [pitch_contours_scale], - self.__model.energy_scales: [energy_contours_scale] - } - - result = self.__session.run([ - self.__mel_spec, self.__duration_outputs, self.__duration_outputs_, - self.__pitch_contour_outputs, self.__embedded_inputs_emotion, - self.__embedding_fsmn_outputs, self.__encoder_outputs, - self.__pitch_embeddings, self.__LR_outputs, - self.__postnet_fsmn_outputs, self.__energy_contour_outputs, - self.__energy_embeddings, self.__attention_x, self.__attention_h - ], feed_dict=feed_dict) # yapf:disable - return result[0] + def __am_forward(self, symbol_seq): + with torch.no_grad(): + inputs_feat_lst = self.__ling_unit.encode_symbol_sequence( + symbol_seq) + inputs_sy = torch.from_numpy(inputs_feat_lst[0]).long().to( + self.__device) + inputs_tone = torch.from_numpy(inputs_feat_lst[1]).long().to( + self.__device) + inputs_syllable = torch.from_numpy(inputs_feat_lst[2]).long().to( + self.__device) + inputs_ws = 
torch.from_numpy(inputs_feat_lst[3]).long().to( + self.__device) + inputs_ling = torch.stack( + [inputs_sy, inputs_tone, inputs_syllable, inputs_ws], + dim=-1).unsqueeze(0) + inputs_emo = torch.from_numpy(inputs_feat_lst[4]).long().to( + self.__device).unsqueeze(0) + inputs_spk = torch.from_numpy(inputs_feat_lst[5]).long().to( + self.__device).unsqueeze(0) + inputs_len = torch.zeros(1).to(self.__device).long( + ) + inputs_emo.size(1) - 1 # minus 1 for "~" + res = self.__am_net(inputs_ling[:, :-1, :], inputs_emo[:, :-1], + inputs_spk[:, :-1], inputs_len) + postnet_outputs = res['postnet_outputs'] + LR_length_rounded = res['LR_length_rounded'] + valid_length = int(LR_length_rounded[0].item()) + postnet_outputs = postnet_outputs[ + 0, :valid_length, :].cpu().numpy() + return postnet_outputs def __vocoder_forward(self, melspec): dim0 = list(melspec.shape)[-1] if dim0 != self.__voc_config.num_mels: raise TtsVocoderMelspecShapeMismatchException( - 'input melspec mismatch require {} but {}'.format( - self.__voc_config.num_mels, dim0)) + 'modelscope error: input melspec mismatch require {} but {}'. + format(self.__voc_config.num_mels, dim0)) with torch.no_grad(): x = melspec.T x = torch.FloatTensor(x).to(self.__device) @@ -292,9 +117,15 @@ class Voice: audio = audio.cpu().numpy().astype('int16') return audio - def forward(self, text): + def forward(self, symbol_seq): if not self.__model_loaded: + torch.manual_seed(self.__am_config.seed) + if torch.cuda.is_available(): + torch.manual_seed(self.__am_config.seed) + self.__device = torch.device('cuda') + else: + self.__device = torch.device('cpu') self.__load_am() self.__load_vocoder() self.__model_loaded = True - return self.__vocoder_forward(self.__am_forward(text)) + return self.__vocoder_forward(self.__am_forward(symbol_seq)) diff --git a/modelscope/pipelines/audio/text_to_speech_pipeline.py b/modelscope/pipelines/audio/text_to_speech_pipeline.py index f9e7d80a..2063da68 100644 --- a/modelscope/pipelines/audio/text_to_speech_pipeline.py +++ b/modelscope/pipelines/audio/text_to_speech_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict, List import numpy as np @@ -42,3 +44,6 @@ class TextToSpeechSambertHifiganPipeline(Pipeline): def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]: return inputs + + def _sanitize_parameters(self, **pipeline_parameters): + return {}, pipeline_parameters, {} diff --git a/modelscope/utils/audio/tts_exceptions.py b/modelscope/utils/audio/tts_exceptions.py index 8c73b603..43ec994b 100644 --- a/modelscope/utils/audio/tts_exceptions.py +++ b/modelscope/utils/audio/tts_exceptions.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. """ Define TTS exceptions """ @@ -10,7 +11,7 @@ class TtsException(Exception): pass -class TtsModelConfigurationExcetion(TtsException): +class TtsModelConfigurationException(TtsException): """ TTS model configuration exceptions. """ diff --git a/requirements/audio.txt b/requirements/audio.txt index 5e4bc104..d22ad8f1 100644 --- a/requirements/audio.txt +++ b/requirements/audio.txt @@ -1,6 +1,5 @@ easyasr>=0.0.2 espnet>=202204 -#tts h5py inflect keras @@ -15,11 +14,7 @@ nltk numpy<=1.18 # protobuf version beyond 3.20.0 is not compatible with TensorFlow 1.x, therefore is discouraged. 
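For context on the refactored synthesis path above: the acoustic model now returns a [T, num_mels] mel spectrogram (postnet_outputs), and Voice.__vocoder_forward transposes it, runs the HiFi-GAN Generator, and rescales the waveform to 16-bit PCM. The following is a minimal sketch of that mel-to-PCM step, not part of the patch; it assumes a loaded generator that follows the usual HiFi-GAN inference convention, and the helper name mel_to_pcm is invented here.

import torch

MAX_WAV_VALUE = 32768.0  # 16-bit full scale, as defined in voice.py

def mel_to_pcm(generator, melspec, device):
    # melspec: numpy array of shape [T, num_mels] produced by the acoustic model
    with torch.no_grad():
        x = torch.FloatTensor(melspec.T).unsqueeze(0).to(device)  # [1, num_mels, T]
        wav = generator(x).squeeze()       # assumed generator output shape [1, 1, samples]
        pcm = wav * MAX_WAV_VALUE          # rescale from roughly [-1, 1] to int16 range
        return pcm.cpu().numpy().astype('int16')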
protobuf>3,<3.21.0 -ptflops py_sound_connect -pytorch_wavelets -PyWavelets>=1.0.0 -scikit-learn SoundFile>0.10 sox torchaudio diff --git a/tests/pipelines/test_text_to_speech.py b/tests/pipelines/test_text_to_speech.py index e82cf43e..f659e59b 100644 --- a/tests/pipelines/test_text_to_speech.py +++ b/tests/pipelines/test_text_to_speech.py @@ -9,6 +9,7 @@ import unittest import torch from scipy.io.wavfile import write +from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks @@ -33,7 +34,9 @@ class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase, text = '今天北京天气怎么样?' voice = 'zhitian_emo' - sambert_hifigan_tts = pipeline(task=self.task, model=self.model_id) + model = Model.from_pretrained( + model_name_or_path=self.model_id, revision='pytorch_am') + sambert_hifigan_tts = pipeline(task=self.task, model=model) self.assertTrue(sambert_hifigan_tts is not None) output = sambert_hifigan_tts(input=text, voice=voice) self.assertIsNotNone(output[OutputKeys.OUTPUT_PCM]) From b98114367bb8f3e383cb101d329cc85481264ee3 Mon Sep 17 00:00:00 2001 From: "shuying.shu" Date: Tue, 27 Sep 2022 22:15:24 +0800 Subject: [PATCH 12/23] [to #42322933]add timestamp for movie scene segmentation output Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10269467 * add timestamp for movie scene segmentation output --- .../models/audio/tts/models/datasets/__init__.py | 0 .../cv/movie_scene_segmentation/utils/save_op.py | 12 ++++++++---- modelscope/outputs.py | 3 ++- 3 files changed, 10 insertions(+), 5 deletions(-) mode change 100644 => 100755 modelscope/models/audio/tts/models/datasets/__init__.py diff --git a/modelscope/models/audio/tts/models/datasets/__init__.py b/modelscope/models/audio/tts/models/datasets/__init__.py old mode 100644 new mode 100755 diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py index 6361c056..b350ff13 100644 --- a/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py +++ b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py @@ -26,7 +26,8 @@ def pred2scene(shot2keyf, anno_dict): for scene_ind, scene_item in enumerate(scene_list): scene_dict_lst.append({ 'shot': pair_list[scene_ind], - 'frame': scene_item + 'frame': scene_item[0], + 'timestamp': scene_item[1] }) return scene_dict_lst, scene_list @@ -42,8 +43,8 @@ def scene2video(source_movie_fn, scene_list, thres): for scene_ind, scene_item in tqdm(enumerate(scene_list)): scene = str(scene_ind).zfill(4) - start_frame = int(scene_item[0]) - end_frame = int(scene_item[1]) + start_frame = int(scene_item[0][0]) + end_frame = int(scene_item[0][1]) start_time, end_time = start_frame / fps, end_frame / fps duration_time = end_time - start_time out_video_fn = os.path.join(out_video_dir_fn, @@ -71,7 +72,10 @@ def get_demo_scene_list(shot2keyf, anno_dict): start_shot, end_shot = int(pair[0]), int(pair[-1]) start_frame = shot2keyf[start_shot].split(' ')[0] end_frame = shot2keyf[end_shot].split(' ')[1] - scene_list.append((start_frame, end_frame)) + start_timestamp = shot2keyf[start_shot].split(' ')[-2] + end_timestamp = shot2keyf[end_shot].split(' ')[-1] + scene_list.append([[start_frame, end_frame], + [start_timestamp, end_timestamp]]) return scene_list, pair_list diff --git a/modelscope/outputs.py b/modelscope/outputs.py index b19f7e43..d80ba9c5 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -303,7 
+303,8 @@ TASK_OUTPUTS = { # [ # { # "shot": [0,1,2], - # "frame": [start_frame, end_frame] + # "frame": [start_frame, end_frame], + # "timestamp": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245'] # } # ] # From 939a9f232242684dc86f463ac294c14beaa99f3e Mon Sep 17 00:00:00 2001 From: "wendi.hwd" Date: Tue, 27 Sep 2022 22:17:41 +0800 Subject: [PATCH 13/23] [to #42322933]fix commits Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10272768 --- modelscope/outputs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/outputs.py b/modelscope/outputs.py index d80ba9c5..92e3410b 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -153,7 +153,7 @@ TASK_OUTPUTS = { # semantic segmentation result for single sample # { - # "masks": [np.array # 2D array containing only 0, 255] + # "masks": [np.array # 2D array with shape [height, width]] # } Tasks.semantic_segmentation: [OutputKeys.MASKS], From 744c84c89302728d0d6bfaca411d00abdee5b310 Mon Sep 17 00:00:00 2001 From: "lanjinpeng.ljp" Date: Tue, 27 Sep 2022 22:19:14 +0800 Subject: [PATCH 14/23] output timestamps for video-single-object-tracking demo service MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 830 release requires the video-single-object-tracking demo to output timestamps information. Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10278969 --- .../cv/video_single_object_tracking/utils/utils.py | 7 +++++++ modelscope/outputs.py | 6 ++++-- .../cv/video_single_object_tracking_pipeline.py | 11 +++++++++-- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/modelscope/models/cv/video_single_object_tracking/utils/utils.py b/modelscope/models/cv/video_single_object_tracking/utils/utils.py index 752ec272..90513a2a 100644 --- a/modelscope/models/cv/video_single_object_tracking/utils/utils.py +++ b/modelscope/models/cv/video_single_object_tracking/utils/utils.py @@ -238,3 +238,10 @@ def check_box(box: list, image_height, image_width) -> bool: if box[3] < 0 or box[3] >= image_height: return False return True + + +def timestamp_format(seconds): + m, s = divmod(seconds, 60) + h, m = divmod(m, 60) + time = '%02d:%02d:%06.3f' % (h, m, s) + return time diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 92e3410b..b96f38d3 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -247,9 +247,11 @@ TASK_OUTPUTS = { # [x1, y1, x2, y2], # [x1, y1, x2, y2], # [x1, y1, x2, y2], - # ] + # ], + # "timestamps": ["hh:mm:ss", "hh:mm:ss", "hh:mm:ss"] # } - Tasks.video_single_object_tracking: [OutputKeys.BOXES], + Tasks.video_single_object_tracking: + [OutputKeys.BOXES, OutputKeys.TIMESTAMPS], # live category recognition result for single video # { diff --git a/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py b/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py index c47fc15f..4169def7 100644 --- a/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py +++ b/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py @@ -9,8 +9,8 @@ from modelscope.models.cv.video_single_object_tracking.config.ostrack import \ cfg from modelscope.models.cv.video_single_object_tracking.tracker.ostrack import \ OSTrack -from modelscope.models.cv.video_single_object_tracking.utils.utils import \ - check_box +from modelscope.models.cv.video_single_object_tracking.utils.utils import ( + check_box, timestamp_format) from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Pipeline from
modelscope.pipelines.builder import PIPELINES @@ -45,7 +45,10 @@ class VideoSingleObjectTrackingPipeline(Pipeline): def forward(self, input: Input) -> Dict[str, Any]: output_boxes = [] + output_timestamps = [] cap = cv2.VideoCapture(self.video_path) + fps = cap.get(cv2.CAP_PROP_FPS) + frame_idx = 0 success, frame = cap.read() if success is False: raise Exception( @@ -58,6 +61,7 @@ class VideoSingleObjectTrackingPipeline(Pipeline): raise Exception('modelscope error: init_box out of image range ', init_box) output_boxes.append(init_box.copy()) + output_timestamps.append(timestamp_format(seconds=frame_idx / fps)) init_box[2] = init_box[2] - init_box[0] init_box[3] = init_box[3] - init_box[1] self.tracker.initialize(frame, {'init_bbox': init_box}) @@ -67,14 +71,17 @@ class VideoSingleObjectTrackingPipeline(Pipeline): ret, frame = cap.read() if frame is None: break + frame_idx += 1 out = self.tracker.track(frame) state = [int(s) for s in out['target_bbox']] output_boxes.append(state) + output_timestamps.append(timestamp_format(seconds=frame_idx / fps)) cap.release() logger.info('tracking process done') return { OutputKeys.BOXES: output_boxes, + OutputKeys.TIMESTAMPS: output_timestamps } def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: From 357a233ee32bbaec7eaef58f383d86219b3f9cd3 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 27 Sep 2022 23:03:00 +0800 Subject: [PATCH 15/23] [to #42322933] fix bug: checkpoint hook and bestckpthook exists at the same time Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10227608 --- modelscope/trainers/default_config.py | 19 +++++++++++++++++++ modelscope/trainers/trainer.py | 7 ++----- tests/trainers/hooks/test_checkpoint_hook.py | 3 --- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/modelscope/trainers/default_config.py b/modelscope/trainers/default_config.py index 69fdd400..c8f0c7b0 100644 --- a/modelscope/trainers/default_config.py +++ b/modelscope/trainers/default_config.py @@ -1,4 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. + +from modelscope.utils.config import Config + DEFAULT_CONFIG = { 'train': { 'hooks': [{ @@ -12,3 +15,19 @@ DEFAULT_CONFIG = { }] } } + + +def merge_cfg(cfg: Config): + """Merge the default config into the input cfg. + + This function will pop the default CheckpointHook when the BestCkptSaverHook exists in the input cfg. + + @param cfg: The input cfg to be merged into. 
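To make the behaviour of merge_cfg above concrete, here is a hedged illustration, not part of the patch; the hook entries below are invented and much smaller than a real trainer configuration.

from modelscope.trainers.default_config import merge_cfg
from modelscope.utils.config import Config

cfg = Config(
    dict(train=dict(hooks=[{'type': 'CheckpointHook', 'interval': 1},
                           {'type': 'BestCkptSaverHook', 'metric_key': 'accuracy'}])))
merge_cfg(cfg)
# Because a BestCkptSaverHook is present, every plain CheckpointHook entry is filtered
# out of cfg.train.hooks, so checkpoints are no longer saved by two hooks at once.
assert all(hook['type'] != 'CheckpointHook' for hook in cfg.train.hooks)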
+ """ + cfg.merge_from_dict(DEFAULT_CONFIG, force=False) + # pop duplicate hook + + if any(['BestCkptSaverHook' == hook['type'] for hook in cfg.train.hooks]): + cfg.train.hooks = list( + filter(lambda hook: hook['type'] != 'CheckpointHook', + cfg.train.hooks)) diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index d3675720..a01d9b59 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -41,7 +41,7 @@ from modelscope.utils.torch_utils import (get_dist_info, get_local_rank, init_dist, set_random_seed) from .base import BaseTrainer from .builder import TRAINERS -from .default_config import DEFAULT_CONFIG +from .default_config import merge_cfg from .hooks.hook import Hook from .parallel.builder import build_parallel from .parallel.utils import is_parallel @@ -114,7 +114,7 @@ class EpochBasedTrainer(BaseTrainer): super().__init__(cfg_file, arg_parse_fn) # add default config - self.cfg.merge_from_dict(self._get_default_config(), force=False) + merge_cfg(self.cfg) self.cfg = self.rebuild_config(self.cfg) if 'cfg_options' in kwargs: @@ -951,9 +951,6 @@ class EpochBasedTrainer(BaseTrainer): stage_hook_infos.append(info) return '\n'.join(stage_hook_infos) - def _get_default_config(self): - return DEFAULT_CONFIG - def worker_init_fn(worker_id, num_workers, rank, seed): # The seed of each worker equals to diff --git a/tests/trainers/hooks/test_checkpoint_hook.py b/tests/trainers/hooks/test_checkpoint_hook.py index c694ece6..e7f2d33c 100644 --- a/tests/trainers/hooks/test_checkpoint_hook.py +++ b/tests/trainers/hooks/test_checkpoint_hook.py @@ -204,9 +204,6 @@ class BestCkptSaverHookTest(unittest.TestCase): trainer = build_trainer(trainer_name, kwargs) trainer.train() results_files = os.listdir(self.tmp_dir) - self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files) - self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files) - self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files) self.assertIn(f'best_{LogKeys.EPOCH}1_{MetricKeys.ACCURACY}0.1.pth', results_files) From 372adb3936939c0079924cd8a761e525b4fbd77f Mon Sep 17 00:00:00 2001 From: "tingwei.gtw" Date: Tue, 27 Sep 2022 23:04:38 +0800 Subject: [PATCH 16/23] [to #42322933] support hand-static model Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10244616 --- data/test/images/hand_static.jpg | 3 + modelscope/metainfo.py | 2 + modelscope/models/cv/hand_static/__init__.py | 20 + .../models/cv/hand_static/hand_model.py | 93 +++++ modelscope/models/cv/hand_static/networks.py | 358 ++++++++++++++++++ modelscope/outputs.py | 6 +- modelscope/pipelines/builder.py | 2 + modelscope/pipelines/cv/__init__.py | 4 +- .../pipelines/cv/hand_static_pipeline.py | 37 ++ modelscope/utils/constant.py | 1 + tests/pipelines/test_hand_static.py | 32 ++ 11 files changed, 556 insertions(+), 2 deletions(-) create mode 100644 data/test/images/hand_static.jpg create mode 100644 modelscope/models/cv/hand_static/__init__.py create mode 100644 modelscope/models/cv/hand_static/hand_model.py create mode 100644 modelscope/models/cv/hand_static/networks.py create mode 100644 modelscope/pipelines/cv/hand_static_pipeline.py create mode 100644 tests/pipelines/test_hand_static.py diff --git a/data/test/images/hand_static.jpg b/data/test/images/hand_static.jpg new file mode 100644 index 00000000..43ae28b1 --- /dev/null +++ b/data/test/images/hand_static.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94b8e281d77ee6d3ea2a8a0c9408ecdbd29fe75f33ea5399b6ea00070ba77bd6 +size 13090 diff --git 
a/modelscope/metainfo.py b/modelscope/metainfo.py index 29a35fbe..5870ebe3 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -39,6 +39,7 @@ class Models(object): mtcnn = 'mtcnn' ulfd = 'ulfd' video_inpainting = 'video-inpainting' + hand_static = 'hand-static' # EasyCV models yolox = 'YOLOX' @@ -173,6 +174,7 @@ class Pipelines(object): movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation' shop_segmentation = 'shop-segmentation' video_inpainting = 'video-inpainting' + hand_static = 'hand-static' # nlp tasks sentence_similarity = 'sentence-similarity' diff --git a/modelscope/models/cv/hand_static/__init__.py b/modelscope/models/cv/hand_static/__init__.py new file mode 100644 index 00000000..654d2acb --- /dev/null +++ b/modelscope/models/cv/hand_static/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .hand_model import HandStatic + +else: + _import_structure = {'hand_model': ['HandStatic']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/hand_static/hand_model.py b/modelscope/models/cv/hand_static/hand_model.py new file mode 100644 index 00000000..38517307 --- /dev/null +++ b/modelscope/models/cv/hand_static/hand_model.py @@ -0,0 +1,93 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. +import os +import sys + +import cv2 +import numpy as np +import torch +import torch.nn.functional as F +from PIL import Image +from torch import nn +from torchvision.transforms import transforms + +from modelscope.metainfo import Models +from modelscope.models.base import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger +from .networks import StaticGestureNet + +logger = get_logger() + +map_idx = { + 0: 'unrecog', + 1: 'one', + 2: 'two', + 3: 'bixin', + 4: 'yaogun', + 5: 'zan', + 6: 'fist', + 7: 'ok', + 8: 'tuoju', + 9: 'd_bixin', + 10: 'd_fist_left', + 11: 'd_fist_right', + 12: 'd_hand', + 13: 'fashe', + 14: 'five', + 15: 'nohand' +} + +img_size = [112, 112] + +spatial_transform = transforms.Compose([ + transforms.Resize(img_size), + transforms.ToTensor(), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) +]) + + +@MODELS.register_module(Tasks.hand_static, module_name=Models.hand_static) +class HandStatic(TorchModel): + + def __init__(self, model_dir, device_id=0, *args, **kwargs): + + super().__init__( + model_dir=model_dir, device_id=device_id, *args, **kwargs) + + self.model = StaticGestureNet() + if torch.cuda.is_available(): + self.device = 'cuda' + else: + self.device = 'cpu' + self.params = torch.load( + '{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE), + map_location=self.device) + + self.model.load_state_dict(self.params) + self.model.to(self.device) + self.model.eval() + self.device_id = device_id + if self.device_id >= 0 and self.device == 'cuda': + self.model.to('cuda:{}'.format(self.device_id)) + logger.info('Use GPU: {}'.format(self.device_id)) + else: + self.device_id = -1 + logger.info('Use CPU for inference') + + def forward(self, x): + pred_result = self.model(x) + return pred_result + + +def infer(img_path, model, device): + + img = 
Image.open(img_path) + clip = spatial_transform(img) + clip = clip.unsqueeze(0).to(device).float() + outputs = model(clip) + predicted = int(outputs.max(1)[1]) + pred_result = map_idx.get(predicted) + logger.info('pred result: {}'.format(pred_result)) + + return pred_result diff --git a/modelscope/models/cv/hand_static/networks.py b/modelscope/models/cv/hand_static/networks.py new file mode 100644 index 00000000..6cf46f5d --- /dev/null +++ b/modelscope/models/cv/hand_static/networks.py @@ -0,0 +1,358 @@ +""" HandStatic +The implementation here is modified based on MobileFaceNet, +originally Apache 2.0 License and publicly avaialbe at https://github.com/xuexingyu24/MobileFaceNet_Tutorial_Pytorch +""" + +import os + +import torch +import torch.nn as nn +import torchvision +import torchvision.models as models +from torch.nn import (AdaptiveAvgPool2d, BatchNorm1d, BatchNorm2d, Conv2d, + Dropout, Linear, MaxPool2d, Module, PReLU, ReLU, + Sequential, Sigmoid) + + +class StaticGestureNet(torch.nn.Module): + + def __init__(self, train=True): + super().__init__() + + model = MobileFaceNet(512) + self.feature_extractor = model + self.fc_layer = torch.nn.Sequential( + nn.Linear(512, 128), nn.Softplus(), nn.Linear(128, 15)) + self.sigmoid = nn.Sigmoid() + + def forward(self, inputs): + out = self.feature_extractor(inputs) + out = self.fc_layer(out) + out = self.sigmoid(out) + return out + + +class Flatten(Module): + + def forward(self, input): + return input.view(input.size(0), -1) + + +def l2_norm(input, axis=1): + norm = torch.norm(input, 2, axis, True) + output = torch.div(input, norm) + return output + + +class SEModule(Module): + + def __init__(self, channels, reduction): + super(SEModule, self).__init__() + self.avg_pool = AdaptiveAvgPool2d(1) + self.fc1 = Conv2d( + channels, + channels // reduction, + kernel_size=1, + padding=0, + bias=False) + self.relu = ReLU(inplace=True) + self.fc2 = Conv2d( + channels // reduction, + channels, + kernel_size=1, + padding=0, + bias=False) + self.sigmoid = Sigmoid() + + def forward(self, x): + module_input = x + x = self.avg_pool(x) + x = self.fc1(x) + x = self.relu(x) + x = self.fc2(x) + x = self.sigmoid(x) + return module_input * x + + +class BottleneckIR(Module): + + def __init__(self, in_channel, depth, stride): + super(BottleneckIR, self).__init__() + if in_channel == depth: + self.shortcut_layer = MaxPool2d(1, stride) + else: + self.shortcut_layer = Sequential( + Conv2d(in_channel, depth, (1, 1), stride, bias=False), + BatchNorm2d(depth)) + self.res_layer = Sequential( + BatchNorm2d(in_channel), + Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False), + PReLU(depth), Conv2d(depth, depth, (3, 3), stride, 1, bias=False), + BatchNorm2d(depth)) + + def forward(self, x): + shortcut = self.shortcut_layer(x) + res = self.res_layer(x) + return res + shortcut + + +class BottleneckIRSE(Module): + + def __init__(self, in_channel, depth, stride): + super(BottleneckIRSE, self).__init__() + if in_channel == depth: + self.shortcut_layer = MaxPool2d(1, stride) + else: + self.shortcut_layer = Sequential( + Conv2d(in_channel, depth, (1, 1), stride, bias=False), + BatchNorm2d(depth)) + self.res_layer = Sequential( + BatchNorm2d(in_channel), + Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False), + PReLU(depth), Conv2d(depth, depth, (3, 3), stride, 1, bias=False), + BatchNorm2d(depth), SEModule(depth, 16)) + + def forward(self, x): + shortcut = self.shortcut_layer(x) + res = self.res_layer(x) + return res + shortcut + + +def get_block(in_channel, depth, num_units, 
stride=2): + return [Bottleneck(in_channel, depth, stride) + ] + [Bottleneck(depth, depth, 1) for i in range(num_units - 1)] + + +def get_blocks(num_layers): + if num_layers == 50: + blocks = [ + get_block(in_channel=64, depth=64, num_units=3), + get_block(in_channel=64, depth=128, num_units=4), + get_block(in_channel=128, depth=256, num_units=14), + get_block(in_channel=256, depth=512, num_units=3) + ] + elif num_layers == 100: + blocks = [ + get_block(in_channel=64, depth=64, num_units=3), + get_block(in_channel=64, depth=128, num_units=13), + get_block(in_channel=128, depth=256, num_units=30), + get_block(in_channel=256, depth=512, num_units=3) + ] + elif num_layers == 152: + blocks = [ + get_block(in_channel=64, depth=64, num_units=3), + get_block(in_channel=64, depth=128, num_units=8), + get_block(in_channel=128, depth=256, num_units=36), + get_block(in_channel=256, depth=512, num_units=3) + ] + return blocks + + +class Backbone(Module): + + def __init__(self, num_layers, drop_ratio, mode='ir'): + super(Backbone, self).__init__() + assert num_layers in [50, 100, + 152], 'num_layers should be 50,100, or 152' + assert mode in ['ir', 'ir_se'], 'mode should be ir or ir_se' + blocks = get_blocks(num_layers) + if mode == 'ir': + unit_module = BottleneckIR + elif mode == 'ir_se': + unit_module = BottleneckIRSE + self.input_layer = Sequential( + Conv2d(3, 64, (3, 3), 1, 1, bias=False), BatchNorm2d(64), + PReLU(64)) + self.output_layer = Sequential( + BatchNorm2d(512), Dropout(drop_ratio), Flatten(), + Linear(512 * 7 * 7, 512), BatchNorm1d(512)) + modules = [] + for block in blocks: + for bottleneck in block: + modules.append( + unit_module(bottleneck.in_channel, bottleneck.depth, + bottleneck.stride)) + self.body = Sequential(*modules) + + def forward(self, x): + x = self.input_layer(x) + x = self.body(x) + x = self.output_layer(x) + return l2_norm(x) + + +class ConvBlock(Module): + + def __init__(self, + in_c, + out_c, + kernel=(1, 1), + stride=(1, 1), + padding=(0, 0), + groups=1): + super(ConvBlock, self).__init__() + self.conv = Conv2d( + in_c, + out_channels=out_c, + kernel_size=kernel, + groups=groups, + stride=stride, + padding=padding, + bias=False) + self.bn = BatchNorm2d(out_c) + self.prelu = PReLU(out_c) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.prelu(x) + return x + + +class LinearBlock(Module): + + def __init__(self, + in_c, + out_c, + kernel=(1, 1), + stride=(1, 1), + padding=(0, 0), + groups=1): + super(LinearBlock, self).__init__() + self.conv = Conv2d( + in_c, + out_channels=out_c, + kernel_size=kernel, + groups=groups, + stride=stride, + padding=padding, + bias=False) + self.bn = BatchNorm2d(out_c) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class DepthWise(Module): + + def __init__(self, + in_c, + out_c, + residual=False, + kernel=(3, 3), + stride=(2, 2), + padding=(1, 1), + groups=1): + super(DepthWise, self).__init__() + self.conv = ConvBlock( + in_c, out_c=groups, kernel=(1, 1), padding=(0, 0), stride=(1, 1)) + self.conv_dw = ConvBlock( + groups, + groups, + groups=groups, + kernel=kernel, + padding=padding, + stride=stride) + self.project = LinearBlock( + groups, out_c, kernel=(1, 1), padding=(0, 0), stride=(1, 1)) + self.residual = residual + + def forward(self, x): + if self.residual: + short_cut = x + x = self.conv(x) + x = self.conv_dw(x) + x = self.project(x) + if self.residual: + output = short_cut + x + else: + output = x + return output + + +class Residual(Module): + + def __init__(self, + c, + 
num_block, + groups, + kernel=(3, 3), + stride=(1, 1), + padding=(1, 1)): + super(Residual, self).__init__() + modules = [] + for _ in range(num_block): + modules.append( + DepthWise( + c, + c, + residual=True, + kernel=kernel, + padding=padding, + stride=stride, + groups=groups)) + self.model = Sequential(*modules) + + def forward(self, x): + return self.model(x) + + +class MobileFaceNet(Module): + + def __init__(self, embedding_size): + super(MobileFaceNet, self).__init__() + self.conv1 = ConvBlock( + 3, 64, kernel=(3, 3), stride=(2, 2), padding=(1, 1)) + self.conv2_dw = ConvBlock( + 64, 64, kernel=(3, 3), stride=(1, 1), padding=(1, 1), groups=64) + self.conv_23 = DepthWise( + 64, 64, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=128) + self.conv_3 = Residual( + 64, + num_block=4, + groups=128, + kernel=(3, 3), + stride=(1, 1), + padding=(1, 1)) + self.conv_34 = DepthWise( + 64, 128, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=256) + self.conv_4 = Residual( + 128, + num_block=6, + groups=256, + kernel=(3, 3), + stride=(1, 1), + padding=(1, 1)) + self.conv_45 = DepthWise( + 128, 128, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=512) + self.conv_5 = Residual( + 128, + num_block=2, + groups=256, + kernel=(3, 3), + stride=(1, 1), + padding=(1, 1)) + self.conv_6_sep = ConvBlock( + 128, 512, kernel=(1, 1), stride=(1, 1), padding=(0, 0)) + self.conv_6_dw = LinearBlock( + 512, 512, groups=512, kernel=(7, 7), stride=(1, 1), padding=(0, 0)) + self.conv_6_flatten = Flatten() + self.linear = Linear(512, embedding_size, bias=False) + self.bn = BatchNorm1d(embedding_size) + + def forward(self, x): + out = self.conv1(x) + out = self.conv2_dw(out) + out = self.conv_23(out) + out = self.conv_3(out) + out = self.conv_34(out) + out = self.conv_4(out) + out = self.conv_45(out) + out = self.conv_5(out) + out = self.conv_6_sep(out) + out = self.conv_6_dw(out) + out = self.conv_6_flatten(out) + out = self.linear(out) + return l2_norm(out) diff --git a/modelscope/outputs.py b/modelscope/outputs.py index b96f38d3..ce9e8d07 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -632,5 +632,9 @@ TASK_OUTPUTS = { # { # 'output': ['Done' / 'Decode_Error'] # } - Tasks.video_inpainting: [OutputKeys.OUTPUT] + Tasks.video_inpainting: [OutputKeys.OUTPUT], + # { + # 'output': ['bixin'] + # } + Tasks.hand_static: [OutputKeys.OUTPUT] } diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 5e244b27..51d50d51 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -178,6 +178,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_vitb16_segmentation_shop-seg'), Tasks.video_inpainting: (Pipelines.video_inpainting, 'damo/cv_video-inpainting'), + Tasks.hand_static: (Pipelines.hand_static, + 'damo/cv_mobileface_hand-static'), } diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index a9dc05f2..55bad09a 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -52,7 +52,8 @@ if TYPE_CHECKING: from .ulfd_face_detection_pipeline import UlfdFaceDetectionPipeline from .retina_face_detection_pipeline import RetinaFaceDetectionPipeline from .facial_expression_recognition_pipeline import FacialExpressionRecognitionPipeline - from .mtcnn_face_detection_pipeline import MtcnnFaceDetectionPipeline + from .mtcnn_face_detection_pipeline import MtcnnFaceDetectionPipelin + from .hand_static_pipeline import HandStaticPipeline else: _import_structure = { @@ -119,6 +120,7 @@ else: 
'facial_expression_recognition_pipelin': ['FacialExpressionRecognitionPipeline'], 'mtcnn_face_detection_pipeline': ['MtcnnFaceDetectionPipeline'], + 'hand_static_pipeline': ['HandStaticPipeline'], } import sys diff --git a/modelscope/pipelines/cv/hand_static_pipeline.py b/modelscope/pipelines/cv/hand_static_pipeline.py new file mode 100644 index 00000000..1219c873 --- /dev/null +++ b/modelscope/pipelines/cv/hand_static_pipeline.py @@ -0,0 +1,37 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. +from typing import Any, Dict + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.hand_static import hand_model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.hand_static, module_name=Pipelines.hand_static) +class HandStaticPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create hand static pipeline for prediction + Args: + model: model id on modelscope hub. + """ + + super().__init__(model=model, **kwargs) + logger.info('load model done') + + def preprocess(self, input: Input) -> Dict[str, Any]: + return input + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + result = hand_model.infer(input['img_path'], self.model, self.device) + return {OutputKeys.OUTPUT: result} + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index de3d933f..75add1d9 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -42,6 +42,7 @@ class CVTasks(object): portrait_matting = 'portrait-matting' text_driven_segmentation = 'text-driven-segmentation' shop_segmentation = 'shop-segmentation' + hand_static = 'hand-static' # image editing skin_retouching = 'skin-retouching' diff --git a/tests/pipelines/test_hand_static.py b/tests/pipelines/test_hand_static.py new file mode 100644 index 00000000..37181899 --- /dev/null +++ b/tests/pipelines/test_hand_static.py @@ -0,0 +1,32 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
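The hand-static classifier introduced in networks.py above wraps a MobileFaceNet feature extractor (512-dimensional embedding) with a small fully connected head and a sigmoid over 15 gesture classes; hand_model.py then maps the argmax index to a label such as 'bixin' via map_idx. A quick shape sanity check, offered as a sketch only and assuming the import path added by this patch:

import torch

from modelscope.models.cv.hand_static.networks import StaticGestureNet

net = StaticGestureNet().eval()
dummy = torch.randn(1, 3, 112, 112)   # matches the img_size used by spatial_transform above
with torch.no_grad():
    scores = net(dummy)               # sigmoid scores over the 15 gesture classes
print(scores.shape)                   # expected: torch.Size([1, 15])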
+import unittest + +from modelscope.pipelines import pipeline +from modelscope.pipelines.base import Pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class HandStaticTest(unittest.TestCase): + + def setUp(self) -> None: + self.model = 'damo/cv_mobileface_hand-static' + self.input = {'img_path': 'data/test/images/hand_static.jpg'} + + def pipeline_inference(self, pipeline: Pipeline, input: str): + result = pipeline(input) + print(result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + hand_static = pipeline(Tasks.hand_static, model=self.model) + self.pipeline_inference(hand_static, self.input) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_modelhub_default_model(self): + hand_static = pipeline(Tasks.hand_static) + self.pipeline_inference(hand_static, self.input) + + +if __name__ == '__main__': + unittest.main() From d721fabb343c9bfe8721464dee5d4dd30d634e26 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Tue, 27 Sep 2022 23:08:33 +0800 Subject: [PATCH 17/23] [to #42322933]bert with sequence classification / token classification/ fill mask refactor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Add support for the original BERT model (not the EasyNLP backbone-prefix variant). 2. Support the backbone + head form of BERT for sequence classification / fill mask / token classification. 3. Unify the pipelines of the sequence classification tasks into a single class. 4. Fill mask now also supports the backbone + head form. 5. Unify the preprocessors of the token classification sub-tasks (NER, word segmentation, part-of-speech) into TokenClassificationPreprocessor. 6. Unify the preprocessors of the sequence classification sub-tasks (single-sentence classification, sentence-pair classification) into SequenceClassificationPreprocessor. 7. Change where the registry assigns a class's group_key; previously, with multiple decorators, the group_key could be overwritten and obj_cls carried incorrect group_key information. 8. Based on the backbone + head form, adjust the cases where the group_key and the module shared the same name. For example, in modelscope/pipelines/nlp/sequence_classification_pipeline.py the original @PIPELINES.register_module( Tasks.sentiment_classification, module_name=Pipelines.sentiment_classification) becomes @PIPELINES.register_module( Tasks.text_classification, module_name=Pipelines.sentiment_classification); the corresponding configuration.json files are changed accordingly, which better reflects the relationship between a task and its pipelines (sub-tasks). 9.
Other related changes to support the features above. Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10041463 --- modelscope/metainfo.py | 11 +- modelscope/models/builder.py | 9 +- modelscope/models/nlp/__init__.py | 22 +- modelscope/models/nlp/backbones/bert.py | 7 + modelscope/models/nlp/bert/__init__.py | 60 + .../models/nlp/bert/configuration_bert.py | 162 ++ modelscope/models/nlp/bert/modeling_bert.py | 2040 +++++++++++++++++ .../nlp/bert_for_sequence_classification.py | 70 - modelscope/models/nlp/deberta_v2/__init__.py | 10 - modelscope/models/nlp/heads/fill_mask_head.py | 101 + .../models/nlp/heads/torch_pretrain_head.py | 2 +- modelscope/models/nlp/masked_language.py | 5 +- .../nlp/nncrf_for_named_entity_recognition.py | 9 +- .../models/nlp/sequence_classification.py | 83 +- modelscope/models/nlp/task_models/__init__.py | 4 + .../nlp/task_models/feature_extraction.py | 43 + .../models/nlp/task_models/fill_mask.py | 47 + .../nlp/task_models/information_extraction.py | 15 +- .../task_models/sequence_classification.py | 49 +- .../models/nlp/task_models/task_model.py | 29 +- .../nlp/task_models/token_classification.py | 15 +- modelscope/models/nlp/token_classification.py | 49 +- modelscope/outputs.py | 16 + modelscope/pipelines/builder.py | 7 +- modelscope/pipelines/nlp/__init__.py | 19 +- .../nlp/feature_extraction_pipeline.py | 82 + .../pipelines/nlp/fill_mask_pipeline.py | 9 +- .../nlp/information_extraction_pipeline.py | 2 +- .../nlp/named_entity_recognition_pipeline.py | 5 +- .../pair_sentence_classification_pipeline.py | 59 - .../nlp/sequence_classification_pipeline.py | 72 +- .../sequence_classification_pipeline_base.py | 62 - ...single_sentence_classification_pipeline.py | 56 - .../nlp/token_classification_pipeline.py | 2 +- modelscope/preprocessors/__init__.py | 48 +- modelscope/preprocessors/nlp/__init__.py | 45 +- modelscope/preprocessors/nlp/nlp_base.py | 575 ++--- modelscope/utils/constant.py | 1 + modelscope/utils/registry.py | 2 +- tests/msdatasets/test_ms_dataset.py | 3 +- tests/pipelines/test_deberta_tasks.py | 8 +- tests/pipelines/test_feature_extraction.py | 67 + tests/pipelines/test_fill_mask.py | 49 +- .../test_named_entity_recognition.py | 10 +- tests/pipelines/test_nli.py | 10 +- tests/pipelines/test_sentence_similarity.py | 10 +- .../test_sentiment_classification.py | 31 +- tests/pipelines/test_text_classification.py | 4 +- tests/preprocessors/test_nlp.py | 76 + tests/utils/test_ast.py | 12 +- 50 files changed, 3347 insertions(+), 837 deletions(-) create mode 100644 modelscope/models/nlp/backbones/bert.py create mode 100644 modelscope/models/nlp/bert/__init__.py create mode 100644 modelscope/models/nlp/bert/configuration_bert.py create mode 100755 modelscope/models/nlp/bert/modeling_bert.py delete mode 100644 modelscope/models/nlp/bert_for_sequence_classification.py create mode 100644 modelscope/models/nlp/heads/fill_mask_head.py create mode 100644 modelscope/models/nlp/task_models/feature_extraction.py create mode 100644 modelscope/models/nlp/task_models/fill_mask.py create mode 100644 modelscope/pipelines/nlp/feature_extraction_pipeline.py delete mode 100644 modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py delete mode 100644 modelscope/pipelines/nlp/sequence_classification_pipeline_base.py delete mode 100644 modelscope/pipelines/nlp/single_sentence_classification_pipeline.py create mode 100644 tests/pipelines/test_feature_extraction.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 5870ebe3..a1cf5e06 100644 --- a/modelscope/metainfo.py +++
b/modelscope/metainfo.py @@ -91,17 +91,22 @@ class TaskModels(object): text_classification = 'text-classification' token_classification = 'token-classification' information_extraction = 'information-extraction' + fill_mask = 'fill-mask' + feature_extraction = 'feature-extraction' class Heads(object): # nlp heads + + # text cls text_classification = 'text-classification' - # mlm + # fill mask + fill_mask = 'fill-mask' bert_mlm = 'bert-mlm' - # roberta mlm roberta_mlm = 'roberta-mlm' # token cls token_classification = 'token-classification' + # extraction information_extraction = 'information-extraction' @@ -203,6 +208,7 @@ class Pipelines(object): passage_ranking = 'passage-ranking' relation_extraction = 'relation-extraction' document_segmentation = 'document-segmentation' + feature_extraction = 'feature-extraction' # audio tasks sambert_hifigan_tts = 'sambert-hifigan-tts' @@ -306,6 +312,7 @@ class Preprocessors(object): table_question_answering_preprocessor = 'table-question-answering-preprocessor' re_tokenizer = 're-tokenizer' document_segmentation = 'document-segmentation' + feature_extraction = 'feature-extraction' # audio preprocessor linear_aec_fbank = 'linear-aec-fbank' diff --git a/modelscope/models/builder.py b/modelscope/models/builder.py index 33f111a8..7a8e28f4 100644 --- a/modelscope/models/builder.py +++ b/modelscope/models/builder.py @@ -37,13 +37,16 @@ def build_backbone(cfg: ConfigDict, cfg, BACKBONES, group_key=field, default_args=default_args) -def build_head(cfg: ConfigDict, default_args: dict = None): +def build_head(cfg: ConfigDict, + group_key: str = None, + default_args: dict = None): """ build head given config dict Args: cfg (:obj:`ConfigDict`): config dict for head object. default_args (dict, optional): Default initialization arguments. 
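The group_key parameter added to build_head above lets a head be looked up under an explicit task group instead of always falling back to cfg['type']. A hedged sketch of the new call style, using a toy head whose class name, group key and fields are all invented for illustration:

import torch.nn as nn

from modelscope.models.builder import HEADS, build_head

# Register a toy head under a hypothetical group key ('toy-task').
@HEADS.register_module('toy-task', module_name='toy-head')
class ToyHead(nn.Module):

    def __init__(self, in_dim=768, num_labels=2):
        super().__init__()
        self.linear = nn.Linear(in_dim, num_labels)

    def forward(self, x):
        return self.linear(x)

# With the new argument, the lookup group no longer has to equal cfg['type'].
head = build_head(dict(type='toy-head'), group_key='toy-task')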
""" - + if group_key is None: + group_key = cfg[TYPE_NAME] return build_from_cfg( - cfg, HEADS, group_key=cfg[TYPE_NAME], default_args=default_args) + cfg, HEADS, group_key=group_key, default_args=default_args) diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index 152a32dc..8ef96365 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -6,7 +6,6 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .backbones import SbertModel from .bart_for_text_error_correction import BartForTextErrorCorrection - from .bert_for_sequence_classification import BertForSequenceClassification from .bert_for_document_segmentation import BertForDocumentSegmentation from .csanmt_for_translation import CsanmtForTranslation from .heads import SequenceClassificationHead @@ -20,12 +19,15 @@ if TYPE_CHECKING: from .palm_v2 import PalmForTextGeneration from .sbert_for_faq_question_answering import SbertForFaqQuestionAnswering from .star_text_to_sql import StarForTextToSql - from .sequence_classification import VecoForSequenceClassification, SbertForSequenceClassification + from .sequence_classification import (VecoForSequenceClassification, + SbertForSequenceClassification, + BertForSequenceClassification) from .space import SpaceForDialogIntent from .space import SpaceForDialogModeling from .space import SpaceForDialogStateTracking from .table_question_answering import TableQuestionAnswering - from .task_models import (InformationExtractionModel, + from .task_models import (FeatureExtractionModel, + InformationExtractionModel, SequenceClassificationModel, SingleBackboneTaskModelBase, TokenClassificationModel) @@ -37,7 +39,6 @@ else: _import_structure = { 'backbones': ['SbertModel'], 'bart_for_text_error_correction': ['BartForTextErrorCorrection'], - 'bert_for_sequence_classification': ['BertForSequenceClassification'], 'bert_for_document_segmentation': ['BertForDocumentSegmentation'], 'csanmt_for_translation': ['CsanmtForTranslation'], 'heads': ['SequenceClassificationHead'], @@ -54,15 +55,20 @@ else: 'palm_v2': ['PalmForTextGeneration'], 'sbert_for_faq_question_answering': ['SbertForFaqQuestionAnswering'], 'star_text_to_sql': ['StarForTextToSql'], - 'sequence_classification': - ['VecoForSequenceClassification', 'SbertForSequenceClassification'], + 'sequence_classification': [ + 'VecoForSequenceClassification', 'SbertForSequenceClassification', + 'BertForSequenceClassification' + ], 'space': [ 'SpaceForDialogIntent', 'SpaceForDialogModeling', 'SpaceForDialogStateTracking' ], 'task_models': [ - 'InformationExtractionModel', 'SequenceClassificationModel', - 'SingleBackboneTaskModelBase', 'TokenClassificationModel' + 'FeatureExtractionModel', + 'InformationExtractionModel', + 'SequenceClassificationModel', + 'SingleBackboneTaskModelBase', + 'TokenClassificationModel', ], 'token_classification': ['SbertForTokenClassification'], 'table_question_answering': ['TableQuestionAnswering'], diff --git a/modelscope/models/nlp/backbones/bert.py b/modelscope/models/nlp/backbones/bert.py new file mode 100644 index 00000000..aa513944 --- /dev/null +++ b/modelscope/models/nlp/backbones/bert.py @@ -0,0 +1,7 @@ +from modelscope.metainfo import Models +from modelscope.models.builder import BACKBONES +from modelscope.models.nlp.bert import BertModel +from modelscope.utils.constant import Fields + +BACKBONES.register_module( + group_key=Fields.nlp, module_name=Models.bert, module_cls=BertModel) diff --git 
a/modelscope/models/nlp/bert/__init__.py b/modelscope/models/nlp/bert/__init__.py new file mode 100644 index 00000000..705d9519 --- /dev/null +++ b/modelscope/models/nlp/bert/__init__.py @@ -0,0 +1,60 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .modeling_bert import ( + BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + BertForMaskedLM, + BertForMultipleChoice, + BertForNextSentencePrediction, + BertForPreTraining, + BertForQuestionAnswering, + BertForSequenceClassification, + BertForTokenClassification, + BertLayer, + BertLMHeadModel, + BertModel, + BertPreTrainedModel, + load_tf_weights_in_bert, + ) + + from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig, BertOnnxConfig + from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer + from .tokenization_bert_fast import BertTokenizerFast + +else: + _import_structure = { + 'configuration_bert': + ['BERT_PRETRAINED_CONFIG_ARCHIVE_MAP', 'BertConfig', 'BertOnnxConfig'], + 'tokenization_bert': + ['BasicTokenizer', 'BertTokenizer', 'WordpieceTokenizer'], + } + _import_structure['tokenization_bert_fast'] = ['BertTokenizerFast'] + + _import_structure['modeling_bert'] = [ + 'BERT_PRETRAINED_MODEL_ARCHIVE_LIST', + 'BertForMaskedLM', + 'BertForMultipleChoice', + 'BertForNextSentencePrediction', + 'BertForPreTraining', + 'BertForQuestionAnswering', + 'BertForSequenceClassification', + 'BertForTokenClassification', + 'BertLayer', + 'BertLMHeadModel', + 'BertModel', + 'BertPreTrainedModel', + 'load_tf_weights_in_bert', + ] + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/bert/configuration_bert.py b/modelscope/models/nlp/bert/configuration_bert.py new file mode 100644 index 00000000..2c9293ec --- /dev/null +++ b/modelscope/models/nlp/bert/configuration_bert.py @@ -0,0 +1,162 @@ +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" BERT model configuration """ +from collections import OrderedDict +from typing import Mapping + +from transformers.configuration_utils import PretrainedConfig +from transformers.onnx import OnnxConfig + +from modelscope.utils.logger import get_logger + +logger = get_logger(__name__) + + +class BertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a + [`BertModel`] or a [`TFBertModel`]. It is used to instantiate a BERT model + according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar + configuration to that of the BERT + [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture. 
+ + Configuration objects inherit from [`PretrainedConfig`] and can be used to + control the model outputs. Read the documentation from [`PretrainedConfig`] + for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the BERT model. Defines the number of different + tokens that can be represented by the `inputs_ids` passed when + calling [`BertModel`] or [`TFBertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the + Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) + layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the + encoder and pooler. If string, `"gelu"`, `"relu"`, `"silu"` and + `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the + embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. + Typically set this to something large just in case (e.g., 512 or + 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling + [`BertModel`] or [`TFBertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for + initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, + `"relative_key"`, `"relative_key_query"`. For positional embeddings + use `"absolute"`. For more information on `"relative_key"`, please + refer to [Self-Attention with Relative Position Representations + (Shaw et al.)](https://arxiv.org/abs/1803.02155). For more + information on `"relative_key_query"`, please refer to *Method 4* in + [Improve Transformer Models with Better Relative Position Embeddings + (Huang et al.)](https://arxiv.org/abs/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values + attentions (not used by all models). Only relevant if + `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. 
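To make the argument list above concrete, here is a small hedged sketch of building a reduced configuration. The values are arbitrary and only show how the documented fields map onto the constructor; the upstream `transformers.BertConfig` is used because the local class keeps the same signature.

```python
# Arbitrary, reduced hyperparameters purely for illustration.
from transformers import BertConfig  # the local BertConfig mirrors this constructor

tiny = BertConfig(
    vocab_size=30522,
    hidden_size=128,
    num_hidden_layers=2,
    num_attention_heads=2,
    intermediate_size=512,
    classifier_dropout=None,   # classification heads then fall back to hidden_dropout_prob
)
print(tiny.hidden_size, tiny.num_attention_heads, tiny.classifier_dropout)
```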
+ + Examples: + + ```python >>> from transformers import BertModel, BertConfig + + >>> # Initializing a BERT bert-base-uncased style configuration + >>> configuration = BertConfig() + + >>> # Initializing a model from the bert-base-uncased style configuration + >>> model = BertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = 'bert' + + def __init__(self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act='gelu', + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type='absolute', + use_cache=True, + classifier_dropout=None, + **kwargs): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + + +class BertOnnxConfig(OnnxConfig): + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict([ + ('input_ids', { + 0: 'batch', + 1: 'sequence' + }), + ('attention_mask', { + 0: 'batch', + 1: 'sequence' + }), + ('token_type_ids', { + 0: 'batch', + 1: 'sequence' + }), + ]) diff --git a/modelscope/models/nlp/bert/modeling_bert.py b/modelscope/models/nlp/bert/modeling_bert.py new file mode 100755 index 00000000..f8fd5994 --- /dev/null +++ b/modelscope/models/nlp/bert/modeling_bert.py @@ -0,0 +1,2040 @@ +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model. 
""" + +import math +import os +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +import torch.utils.checkpoint +from packaging import version +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from transformers.activations import ACT2FN +from transformers.file_utils import (ModelOutput, add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings) +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, MaskedLMOutput, + MultipleChoiceModelOutput, NextSentencePredictorOutput, + QuestionAnsweringModelOutput, SequenceClassifierOutput, + TokenClassifierOutput) +from transformers.modeling_utils import (PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer) + +from modelscope.models.base import TorchModel +from modelscope.utils.logger import get_logger +from .configuration_bert import BertConfig + +logger = get_logger(__name__) + +_CONFIG_FOR_DOC = 'BertConfig' + + +def load_tf_weights_in_bert(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + 'Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see ' + 'https://www.tensorflow.org/install/ for installation instructions.' + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f'Converting TensorFlow checkpoint from {tf_path}') + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info(f'Loading TF weight {name} with shape {shape}') + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split('/') + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any(n in [ + 'adam_v', 'adam_m', 'AdamWeightDecayOptimizer', + 'AdamWeightDecayOptimizer_1', 'global_step' + ] for n in name): + logger.info(f"Skipping {'/'.join(name)}") + continue + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+_\d+', m_name): + scope_names = re.split(r'_(\d+)', m_name) + else: + scope_names = [m_name] + if scope_names[0] == 'kernel' or scope_names[0] == 'gamma': + pointer = getattr(pointer, 'weight') + elif scope_names[0] == 'output_bias' or scope_names[0] == 'beta': + pointer = getattr(pointer, 'bias') + elif scope_names[0] == 'output_weights': + pointer = getattr(pointer, 'weight') + elif scope_names[0] == 'squad': + pointer = getattr(pointer, 'classifier') + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info(f"Skipping {'/'.join(name)}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name[-11:] == '_embeddings': + pointer = getattr(pointer, 'weight') + elif m_name == 'kernel': + array = np.transpose(array) + try: + if pointer.shape != array.shape: + raise ValueError( + f'Pointer shape {pointer.shape} and array shape {array.shape} mismatched' + ) + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info(f'Initialize PyTorch weight {name}') + pointer.data = torch.from_numpy(array) 
+ return model + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, + config.hidden_size, + padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model + # variable name and be able to load any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and + # exported when serialized + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + self.register_buffer( + 'position_ids', + torch.arange(config.max_position_embeddings).expand((1, -1))) + if version.parse(torch.__version__) > version.parse('1.6.0'): + self.register_buffer( + 'token_type_ids', + torch.zeros(self.position_ids.size(), dtype=torch.long), + persistent=False, + ) + + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + past_key_values_length=0): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, + past_key_values_length:seq_length + + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor + # where it is all zeros, which usually occurs when its auto-generated, + # registered buffer helps users when tracing the model without passing + # token_type_ids, solves issue #5664 + if token_type_ids is None: + if hasattr(self, 'token_type_ids'): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, + dtype=torch.long, + device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == 'absolute': + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr( + config, 'embedding_size'): + raise ValueError( + f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention ' + f'heads ({config.num_attention_heads})') + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size + / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, 
self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, 'position_embedding_type', 'absolute') + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, + self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores( + self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores( + self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all + # cross attention key/value_states. Further calls to cross_attention + # layer can then reuse all cross-attention key/value_states (first + # "if" case) if uni-directional self-attention (decoder) save + # Tuple(torch.Tensor, torch.Tensor) of all previous decoder + # key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected + # key/value_states (third "elif" case) if encoder bi-directional + # self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
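`transpose_for_scores` reshapes the flat projection into per-head slices so that the following matmul runs over `(batch, num_heads, seq_len, head_size)`. A quick shape walk-through, assuming the base-size setting of 12 heads of size 64 and an arbitrary batch and sequence length:

```python
# Shape check for transpose_for_scores with assumed dimensions (batch 2, seq 5, 12 heads of 64).
import torch

batch, seq_len, num_heads, head_size = 2, 5, 12, 64
x = torch.randn(batch, seq_len, num_heads * head_size)               # output of a q/k/v projection
x = x.view(batch, seq_len, num_heads, head_size).permute(0, 2, 1, 3)
print(x.shape)  # torch.Size([2, 12, 5, 64])
```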
+ attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == 'relative_key': + relative_position_scores = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == 'relative_key_query': + relative_position_scores_query = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + relative_position_scores_key = torch.einsum( + 'bhrd,lrd->bhlr', key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
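For the `relative_key` / `relative_key_query` branches, the distance matrix is simply the pairwise difference of positions, shifted by `max_position_embeddings - 1` before the embedding lookup so indices stay non-negative. A tiny numerical illustration for an assumed length-4 sequence:

```python
# Pairwise relative distances as computed in the relative position branch above.
import torch

seq_length = 4
position_ids_l = torch.arange(seq_length).view(-1, 1)
position_ids_r = torch.arange(seq_length).view(1, -1)
print(position_ids_l - position_ids_r)
# tensor([[ 0, -1, -2, -3],
#         [ 1,  0, -1, -2],
#         [ 2,  1,  0, -1],
#         [ 3,  2,  1,  0]])
```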
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, + attention_probs) if output_attentions else (context_layer, ) + + if self.is_decoder: + outputs = outputs + (past_key_value, ) + return outputs + + +class BertSelfOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = BertSelfAttention( + config, position_embedding_type=position_embedding_type) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, + self.self.attention_head_size, self.pruned_heads) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len( + heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output, + ) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states 
= self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError( + f'{self} should be used as a decoder model if cross attention is added' + ) + self.crossattention = BertAttention( + config, position_embedding_type='absolute') + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[: + 2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[ + 1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, 'crossattention'): + raise ValueError( + f'If `encoder_hidden_states` are passed, {self} has to be instantiated ' + f'with cross-attention layers by setting `config.add_cross_attention=True`' + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[ + -2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[ + 1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward(self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output) + outputs = (layer_output, ) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value, ) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [BertLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + 
attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = ( + ) if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[ + i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' + ) + use_cache = False + + def create_custom_forward(module): + + def custom_forward(*inputs): + return module(*inputs, past_key_value, + output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1], ) + if output_attentions: + all_self_attentions = all_self_attentions + ( + layer_outputs[1], ) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + ( + layer_outputs[2], ) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + if not return_dict: + return tuple(v for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] if v is not None) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class BertPooler(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear( + config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class BertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface + for downloading and loading pretrained models. 
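`BertLMPredictionHead` creates its decoder without a bias and then assigns a separate `vocab_size` bias to it, so that `resize_token_embeddings` resizes both together. A minimal sketch of that tying and of the resulting logit shape, with base-size dimensions assumed:

```python
# Sketch of the bias tying used by BertLMPredictionHead; dimensions are the base-size defaults.
import torch
from torch import nn

hidden_size, vocab_size = 768, 30522
decoder = nn.Linear(hidden_size, vocab_size, bias=False)
bias = nn.Parameter(torch.zeros(vocab_size))
decoder.bias = bias                          # same Parameter object, so resizing stays in sync
sequence_output = torch.randn(2, 5, hidden_size)
print(decoder(sequence_output).shape)        # torch.Size([2, 5, 30522])
```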
+ """ + + config_class = BertConfig + load_tf_weights = load_tf_weights_in_bert + base_model_prefix = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, BertEncoder): + module.gradient_checkpointing = value + + +@dataclass +class BertForPreTrainingOutput(ModelOutput): + """ + Output type of [`BertForPreTraining`]. + + Args: + loss (*optional*, returned when `labels` is provided, + `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the + next sequence prediction (classification) loss. + prediction_logits (`torch.FloatTensor` of shape `(batch_size, + sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each + vocabulary token before SoftMax). + seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, + 2)`): + Prediction scores of the next sequence prediction (classification) + head (scores of True/False continuation before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + + one for the output of each layer) of shape `(batch_size, + sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the + initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + seq_relationship_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +BERT_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass + documentation for the generic methods the library implements for all its + model (such as downloading or saving, resizing the input embeddings, pruning + heads etc.) + + This model is also a PyTorch + [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch + documentation for all matter related to general usage and behavior. + + Parameters: + config ([`BertConfig`]): Model configuration class with all the + parameters of the model. 
+ Initializing with a config file does not load the weights associated + with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model + weights. +""" + +BERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`BertTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] + for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask + values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the + inputs. Indices are selected in `[0, 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position + embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, + num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask + values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, + *optional*): + Optionally, instead of passing `input_ids` you can choose to + directly pass an embedded representation. This is useful if you want + more control over how to convert `input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention + layers. See `attentions` under returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See + `hidden_states` under returned tensors for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a + plain tuple. +""" + + +@add_start_docstrings( + 'The bare Bert Model transformer outputting raw hidden-states without any specific head on top.', + BERT_START_DOCSTRING, +) +class BertModel(BertPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a + decoder, in which case a layer of cross-attention is added between the + self-attention layers, following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam + Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz + Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the + `is_decoder` argument of the configuration set to `True`. To be used in a + Seq2Seq model, the model needs to initialized with both `is_decoder` + argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` + is then expected as an input to the forward pass. 
+ """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + @classmethod + def _instantiate(cls, model_dir=None, add_pooling_layer=True, **config): + config = BertConfig(**config) + model = cls(config, add_pooling_layer) + return model + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the + encoder. Used in the cross-attention if the model is configured as a + decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, + sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of + the encoder input. This mask is used in the cross-attention if the + model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length + `config.n_layers` with each tuple having 4 tensors of shape + `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention + blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only + the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead + of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned + and can be used to speed up decoding (see `past_key_values`). 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + 'You cannot specify both input_ids and inputs_embeds at the same time' + ) + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError( + 'You have to specify either input_ids or inputs_embeds') + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[ + 2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), + device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, 'token_type_ids'): + buffered_token_type_ids = self.embeddings.token_type_ids[:, : + seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( + ) + encoder_hidden_shape = (encoder_batch_size, + encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, + self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler( + sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + def extract_sequence_outputs(self, outputs): + return outputs['last_hidden_state'] + + def extract_pooled_outputs(self, outputs): + return outputs['pooler_output'] + + +@add_start_docstrings( + """ + Bert Model with two heads on top as done during the pretraining: a `masked + language modeling` head and a `next sentence prediction (classification)` + head. 
+ """, + BERT_START_DOCSTRING, +) +class BertForPreTraining(BertPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + self.cls = BertPreTrainingHeads(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @replace_return_docstrings( + output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + next_sentence_label=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, + *optional*): + Labels for computing the masked language modeling loss. Indices + should be in `[-100, 0, ..., config.vocab_size]` (see + `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with + labels in `[0, ..., config.vocab_size]` + next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, + *optional*): + Labels for computing the next sequence prediction + (classification) loss. Input should be a sequence pair (see + `input_ids` docstring) Indices should be in `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + kwargs (`Dict[str, any]`, optional, defaults to *{}*): + Used to hide legacy arguments that have been deprecated. 
+ + Returns: + + Example: + + ```python >>> from transformers import BertTokenizer, BertForPreTraining + >>> import torch + + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + >>> model = BertForPreTraining.from_pretrained('bert-base-uncased') + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls( + sequence_output, pooled_output) + + total_loss = None + if labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + next_sentence_loss = loss_fct( + seq_relationship_score.view(-1, 2), + next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss, ) + + output) if total_loss is not None else output + + return BertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """Bert Model with a `language modeling` head on top for CLM fine-tuning. """, + BERT_START_DOCSTRING) +class BertLMHeadModel(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning( + 'If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`' + ) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @replace_return_docstrings( + output_type=CausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the + encoder. Used in the cross-attention if the model is configured + as a decoder. 
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, + sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices + of the encoder input. This mask is used in the cross-attention + if the model is configured as a decoder. Mask values selected in + `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, + *optional*): + Labels for computing the left-to-right language modeling loss + (next word prediction). Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with + indices set to `-100` are ignored (masked), the loss is only + computed for the tokens with labels n `[0, ..., + config.vocab_size]` + past_key_values (`tuple(tuple(torch.FloatTensor))` of length + `config.n_layers` with each tuple having 4 tensors of shape + `(batch_size, num_heads, sequence_length - 1, + embed_size_per_head)`): + Contains precomputed key and value hidden states of the + attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input + only the last `decoder_input_ids` (those that don't have their + past key value states given to this model) of shape + `(batch_size, 1)` instead of all `decoder_input_ids` of shape + `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are + returned and can be used to speed up decoding (see + `past_key_values`). + + Returns: + + Example: + + ```python >>> from transformers import BertTokenizer, BertLMHeadModel, + BertConfig >>> import torch + + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + >>> config = BertConfig.from_pretrained("bert-base-cased") + >>> config.is_decoder = True + >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, : + -1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct( + shifted_prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[2:] + return ((lm_loss, ) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def 
prepare_inputs_for_generation(self, + input_ids, + past=None, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'past_key_values': past + } + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple( + past_state.index_select(0, beam_idx) + for past_state in layer_past), ) + return reordered_past + + +@add_start_docstrings( + """Bert Model with a `language modeling` head on top. """, + BERT_START_DOCSTRING) +class BertForMaskedLM(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + 'If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for ' + 'bi-directional self-attention.') + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, + *optional*): + Labels for computing the masked language modeling loss. 
Indices + should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` + docstring) Tokens with indices set to `-100` are ignored (masked), + the loss is only computed for the tokens with labels in `[0, ..., + config.vocab_size]` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[2:] + return ((masked_lm_loss, ) + + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation(self, + input_ids, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + if self.config.pad_token_id is None: + raise ValueError('The PAD token should be defined for generation') + + padding_mask = attention_mask.new_zeros((attention_mask.shape[0], 1)) + attention_mask = torch.cat([attention_mask, padding_mask], dim=-1) + dummy_token = torch.full((effective_batch_size, 1), + self.config.pad_token_id, + dtype=torch.long, + device=input_ids.device) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {'input_ids': input_ids, 'attention_mask': attention_mask} + + +@add_start_docstrings( + """Bert Model with a `next sentence prediction (classification)` head on top. """, + BERT_START_DOCSTRING, +) +class BertForNextSentencePrediction(BertPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + self.cls = BertOnlyNSPHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @replace_return_docstrings( + output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the next sequence prediction (classification) + loss. Input should be a sequence pair (see `input_ids` docstring). + Indices should be in `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. 
+ + Returns: + + Example: + + ```python >>> from transformers import BertTokenizer, + BertForNextSentencePrediction >>> import torch + + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') + + >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." + >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." + >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt') + + >>> outputs = model(**encoding, labels=torch.LongTensor([1])) + >>> logits = outputs.logits + >>> assert logits[0, 0] < logits[0, 1] # next sentence was random + ``` + """ + + if 'next_sentence_label' in kwargs: + warnings.warn( + 'The `next_sentence_label` argument is deprecated, use `labels` instead.', + FutureWarning, + ) + labels = kwargs.pop('next_sentence_label') + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + seq_relationship_scores = self.cls(pooled_output) + + next_sentence_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + next_sentence_loss = loss_fct( + seq_relationship_scores.view(-1, 2), labels.view(-1)) + + if not return_dict: + output = (seq_relationship_scores, ) + outputs[2:] + return ((next_sentence_loss, ) + + output) if next_sentence_loss is not None else output + + return NextSentencePredictorOutput( + loss=next_sentence_loss, + logits=seq_relationship_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model transformer with a sequence classification/regression head on top + (a linear layer on top of the pooled output) e.g. for GLUE tasks. + """, + BERT_START_DOCSTRING, +) +class BertForSequenceClassification(BertPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. + Indices should be in `[0, ..., config.num_labels - 1]`. If + `config.num_labels == 1` a regression loss is computed (Mean-Square + loss), If `config.num_labels > 1` a classification loss is computed + (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = 'regression' + elif self.num_labels > 1 and (labels.dtype == torch.long + or labels.dtype == torch.int): + self.config.problem_type = 'single_label_classification' + else: + self.config.problem_type = 'multi_label_classification' + + if self.config.problem_type == 'regression': + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == 'single_label_classification': + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == 'multi_label_classification': + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits, ) + outputs[2:] + return ((loss, ) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a multiple choice classification head on top (a linear layer + on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. + """, + BERT_START_DOCSTRING, +) +class BertForMultipleChoice(BertPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format( + 'batch_size, num_choices, sequence_length')) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. + Indices should be in `[0, ..., num_choices-1]` where `num_choices` + is the size of the second dimension of the input tensors. 
(See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[ + 1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view( + -1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view( + -1, + attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view( + -1, + token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view( + -1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), + inputs_embeds.size(-1)) + if inputs_embeds is not None else None) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits, ) + outputs[2:] + return ((loss, ) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. + """, + BERT_START_DOCSTRING, +) +class BertForTokenClassification(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, + *optional*): + Labels for computing the token classification loss. Indices should + be in `[0, ..., config.num_labels - 1]`. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels)) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits, ) + outputs[2:] + return ((loss, ) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a span classification head on top for extractive + question-answering tasks like SQuAD (a linear layers on top of the + hidden-states output to compute `span start logits` and `span end logits`). + """, + BERT_START_DOCSTRING, +) +class BertForQuestionAnswering(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, + *optional*): + Labels for position (index) of the start of the labelled span for + computing the token classification loss. Positions are clamped to + the length of the sequence (`sequence_length`). Position outside of + the sequence are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for + computing the token classification loss. Positions are clamped to + the length of the sequence (`sequence_length`). Position outside of + the sequence are not taken into account for computing the loss. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss, ) + + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/modelscope/models/nlp/bert_for_sequence_classification.py b/modelscope/models/nlp/bert_for_sequence_classification.py deleted file mode 100644 index 2b1a3b3b..00000000 --- a/modelscope/models/nlp/bert_for_sequence_classification.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -import os -from typing import Any, Dict - -import json -import numpy as np - -from modelscope.metainfo import Models -from modelscope.models import TorchModel -from modelscope.models.builder import MODELS -from modelscope.utils.constant import Tasks - -__all__ = ['BertForSequenceClassification'] - - -@MODELS.register_module(Tasks.text_classification, module_name=Models.bert) -class BertForSequenceClassification(TorchModel): - - def __init__(self, model_dir: str, *args, **kwargs): - # Model.__init__(self, model_dir, model_cls, first_sequence, *args, **kwargs) - # Predictor.__init__(self, *args, **kwargs) - """initialize the sequence classification model from the `model_dir` path. - - Args: - model_dir (str): the model path. 
- """ - - super().__init__(model_dir, *args, **kwargs) - import torch - from easynlp.appzoo import SequenceClassification - from easynlp.core.predictor import get_model_predictor - self.model = get_model_predictor( - model_dir=self.model_dir, - model_cls=SequenceClassification, - input_keys=[('input_ids', torch.LongTensor), - ('attention_mask', torch.LongTensor), - ('token_type_ids', torch.LongTensor)], - output_keys=['predictions', 'probabilities', 'logits']) - - self.label_path = os.path.join(self.model_dir, 'label_mapping.json') - with open(self.label_path) as f: - self.label_mapping = json.load(f) - self.id2label = {idx: name for name, idx in self.label_mapping.items()} - - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: - """return the result by the model - - Args: - input (Dict[str, Any]): the preprocessed data - - Returns: - Dict[str, np.ndarray]: results - Example: - { - 'predictions': array([1]), # lable 0-negative 1-positive - 'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32), - 'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value - } - """ - return self.model.predict(input) - - def postprocess(self, inputs: Dict[str, np.ndarray], - **kwargs) -> Dict[str, np.ndarray]: - # N x num_classes - probs = inputs['probabilities'] - result = { - 'probs': probs, - } - - return result diff --git a/modelscope/models/nlp/deberta_v2/__init__.py b/modelscope/models/nlp/deberta_v2/__init__.py index 664fc6c6..830210ed 100644 --- a/modelscope/models/nlp/deberta_v2/__init__.py +++ b/modelscope/models/nlp/deberta_v2/__init__.py @@ -21,21 +21,12 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule -_import_structure = { - 'configuration_deberta_v2': [ - 'DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP', 'DebertaV2Config', - 'DebertaV2OnnxConfig' - ], - 'tokenization_deberta_v2': ['DebertaV2Tokenizer'], -} - if TYPE_CHECKING: from .configuration_deberta_v2 import DebertaV2Config from .tokenization_deberta_v2 import DebertaV2Tokenizer from .tokenization_deberta_v2_fast import DebertaV2TokenizerFast from .modeling_deberta_v2 import ( - DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST, DebertaV2ForMaskedLM, DebertaV2ForMultipleChoice, DebertaV2ForQuestionAnswering, @@ -55,7 +46,6 @@ else: 'DebertaV2TokenizerFast' ] _import_structure['modeling_deberta_v2'] = [ - 'DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST', 'DebertaV2ForMaskedLM', 'DebertaV2ForMultipleChoice', 'DebertaV2ForQuestionAnswering', diff --git a/modelscope/models/nlp/heads/fill_mask_head.py b/modelscope/models/nlp/heads/fill_mask_head.py new file mode 100644 index 00000000..6b0c5e05 --- /dev/null +++ b/modelscope/models/nlp/heads/fill_mask_head.py @@ -0,0 +1,101 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict + +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers.activations import ACT2FN + +from modelscope.metainfo import Heads +from modelscope.models.base import TorchHead +from modelscope.models.builder import HEADS +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import Tasks + + +@HEADS.register_module(Tasks.fill_mask, module_name=Heads.bert_mlm) +class BertFillMaskHead(TorchHead): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.cls = BertOnlyMLMHead(self.config) + + def forward(self, sequence_output): + prediction_scores = self.cls(sequence_output) + return {OutputKeys.LOGITS: prediction_scores} + + def compute_loss(self, outputs: Dict[str, torch.Tensor], + labels) -> Dict[str, torch.Tensor]: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + outputs.view(-1, self.config.vocab_size), labels.view(-1)) + return {OutputKeys.LOSS: masked_lm_loss} + + +class BertPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear( + config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output: torch.Tensor) -> torch.Tensor: + prediction_scores = self.predictions(sequence_output) + return prediction_scores diff --git a/modelscope/models/nlp/heads/torch_pretrain_head.py b/modelscope/models/nlp/heads/torch_pretrain_head.py index fb54637b..e477533f 100644 --- a/modelscope/models/nlp/heads/torch_pretrain_head.py +++ b/modelscope/models/nlp/heads/torch_pretrain_head.py @@ -11,7 +11,7 @@ from modelscope.models.builder import HEADS from modelscope.utils.constant import Tasks -@HEADS.register_module(Tasks.fill_mask, module_name=Heads.bert_mlm) +# @HEADS.register_module(Tasks.fill_mask, module_name=Heads.bert_mlm) class BertMLMHead(BertOnlyMLMHead, TorchHead): def compute_loss(self, outputs: Dict[str, torch.Tensor], diff --git a/modelscope/models/nlp/masked_language.py b/modelscope/models/nlp/masked_language.py index 514a04cd..b7a890c1 100644 --- a/modelscope/models/nlp/masked_language.py +++ b/modelscope/models/nlp/masked_language.py @@ -1,10 +1,9 @@ # Copyright (c) Alibaba, Inc. 
and its affiliates. - -from transformers import BertForMaskedLM as BertForMaskedLMTransformer - from modelscope.metainfo import Models from modelscope.models.base import TorchModel from modelscope.models.builder import MODELS +from modelscope.models.nlp.bert import \ + BertForMaskedLM as BertForMaskedLMTransformer from modelscope.models.nlp.deberta_v2 import \ DebertaV2ForMaskedLM as DebertaV2ForMaskedLMTransformer from modelscope.models.nlp.structbert import SbertForMaskedLM diff --git a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py b/modelscope/models/nlp/nncrf_for_named_entity_recognition.py index 62198ed2..8b0c59b2 100644 --- a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py +++ b/modelscope/models/nlp/nncrf_for_named_entity_recognition.py @@ -41,12 +41,9 @@ class SequenceLabelingForNamedEntityRecognition(TorchModel): def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: input_tensor = { - 'input_ids': - torch.tensor(input['input_ids']).unsqueeze(0), - 'attention_mask': - torch.tensor(input['attention_mask']).unsqueeze(0), - 'label_mask': - torch.tensor(input['label_mask'], dtype=torch.bool).unsqueeze(0) + 'input_ids': input['input_ids'], + 'attention_mask': input['attention_mask'], + 'label_mask': input['label_mask'], } output = { 'text': input['text'], diff --git a/modelscope/models/nlp/sequence_classification.py b/modelscope/models/nlp/sequence_classification.py index a8930e68..156c615c 100644 --- a/modelscope/models/nlp/sequence_classification.py +++ b/modelscope/models/nlp/sequence_classification.py @@ -7,6 +7,7 @@ from torch import nn from modelscope.metainfo import Models from modelscope.models.base import TorchModel from modelscope.models.builder import MODELS +from modelscope.models.nlp.bert import BertPreTrainedModel from modelscope.models.nlp.structbert import SbertPreTrainedModel from modelscope.models.nlp.veco import \ VecoForSequenceClassification as VecoForSequenceClassificationTransform @@ -16,7 +17,10 @@ from modelscope.utils.hub import parse_label_mapping from modelscope.utils.tensor_utils import (torch_nested_detach, torch_nested_numpify) -__all__ = ['SbertForSequenceClassification', 'VecoForSequenceClassification'] +__all__ = [ + 'SbertForSequenceClassification', 'VecoForSequenceClassification', + 'BertForSequenceClassification' +] class SequenceClassificationBase(TorchModel): @@ -132,7 +136,7 @@ class SbertForSequenceClassification(SequenceClassificationBase, label2id = parse_label_mapping(model_dir) if label2id is not None and len(label2id) > 0: num_labels = len(label2id) - + cls.id2label = {id: label for label, id in label2id.items()} model_args = {} if num_labels is None else {'num_labels': num_labels} return super(SbertPreTrainedModel, SbertForSequenceClassification).from_pretrained( @@ -206,3 +210,78 @@ class VecoForSequenceClassification(TorchModel, pretrained_model_name_or_path=kwargs.get('model_dir'), model_dir=kwargs.get('model_dir'), **model_args) + + +@MODELS.register_module(Tasks.sentence_similarity, module_name=Models.bert) +@MODELS.register_module( + Tasks.sentiment_classification, module_name=Models.bert) +@MODELS.register_module(Tasks.nli, module_name=Models.bert) +@MODELS.register_module(Tasks.text_classification, module_name=Models.bert) +class BertForSequenceClassification(SequenceClassificationBase, + BertPreTrainedModel): + """Bert sequence classification model. + + Inherited from SequenceClassificationBase. 
+ """ + base_model_prefix: str = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def __init__(self, config, model_dir): + if hasattr(config, 'base_model_prefix'): + BertForSequenceClassification.base_model_prefix = config.base_model_prefix + super().__init__(config, model_dir) + + def build_base_model(self): + from .bert import BertModel + return BertModel(self.config, add_pooling_layer=True) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + labels=labels, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + @param kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels not supplied. + If num_labels is not found, the model will use the default setting (2 classes). + @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + + model_dir = kwargs.get('model_dir') + num_labels = kwargs.get('num_labels') + if num_labels is None: + label2id = parse_label_mapping(model_dir) + if label2id is not None and len(label2id) > 0: + num_labels = len(label2id) + + model_args = {} if num_labels is None else {'num_labels': num_labels} + return super(BertPreTrainedModel, + BertForSequenceClassification).from_pretrained( + pretrained_model_name_or_path=kwargs.get('model_dir'), + model_dir=kwargs.get('model_dir'), + **model_args) diff --git a/modelscope/models/nlp/task_models/__init__.py b/modelscope/models/nlp/task_models/__init__.py index 7493ba74..90f22aa1 100644 --- a/modelscope/models/nlp/task_models/__init__.py +++ b/modelscope/models/nlp/task_models/__init__.py @@ -5,6 +5,8 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .information_extraction import InformationExtractionModel + from .feature_extraction import FeatureExtractionModel + from .fill_mask import FillMaskModel from .sequence_classification import SequenceClassificationModel from .task_model import SingleBackboneTaskModelBase from .token_classification import TokenClassificationModel @@ -12,6 +14,8 @@ if TYPE_CHECKING: else: _import_structure = { 'information_extraction': ['InformationExtractionModel'], + 'feature_extraction': ['FeatureExtractionModel'], + 'fill_mask': ['FillMaskModel'], 'sequence_classification': ['SequenceClassificationModel'], 'task_model': ['SingleBackboneTaskModelBase'], 'token_classification': ['TokenClassificationModel'], diff --git a/modelscope/models/nlp/task_models/feature_extraction.py b/modelscope/models/nlp/task_models/feature_extraction.py new file mode 100644 index 00000000..069c37aa --- /dev/null +++ b/modelscope/models/nlp/task_models/feature_extraction.py @@ -0,0 +1,43 @@ +from typing import Any, Dict + +import numpy as np + +from modelscope.metainfo import TaskModels +from modelscope.models.builder import MODELS +from modelscope.models.nlp.bert import 
BertConfig
+from modelscope.models.nlp.task_models.task_model import \
+    SingleBackboneTaskModelBase
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+from modelscope.utils.hub import parse_label_mapping
+
+__all__ = ['FeatureExtractionModel']
+
+
+@MODELS.register_module(
+    Tasks.feature_extraction, module_name=TaskModels.feature_extraction)
+class FeatureExtractionModel(SingleBackboneTaskModelBase):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """initialize the feature extraction model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+        """
+        super().__init__(model_dir, *args, **kwargs)
+        if 'base_model_prefix' in kwargs:
+            self._base_model_prefix = kwargs['base_model_prefix']
+
+        self.build_backbone(self.backbone_cfg)
+
+    def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]:
+
+        # backbone do not need labels, only head need for loss compute
+        labels = input.pop(OutputKeys.LABELS, None)
+
+        outputs = super().forward(input)
+        sequence_output, pooled_output = self.extract_backbone_outputs(outputs)
+        if labels is not None:
+            input[OutputKeys.LABELS] = labels
+
+        return {OutputKeys.TEXT_EMBEDDING: sequence_output}
diff --git a/modelscope/models/nlp/task_models/fill_mask.py b/modelscope/models/nlp/task_models/fill_mask.py
new file mode 100644
index 00000000..f7ef1cc2
--- /dev/null
+++ b/modelscope/models/nlp/task_models/fill_mask.py
@@ -0,0 +1,47 @@
+from typing import Any, Dict
+
+import numpy as np
+
+from modelscope.metainfo import TaskModels
+from modelscope.models.builder import MODELS
+from modelscope.models.nlp.bert import BertConfig
+from modelscope.models.nlp.task_models.task_model import \
+    SingleBackboneTaskModelBase
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+from modelscope.utils.hub import parse_label_mapping
+
+__all__ = ['FillMaskModel']
+
+
+@MODELS.register_module(Tasks.fill_mask, module_name=TaskModels.fill_mask)
+class FillMaskModel(SingleBackboneTaskModelBase):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """initialize the fill mask model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+ """ + super().__init__(model_dir, *args, **kwargs) + if 'base_model_prefix' in kwargs: + self._base_model_prefix = kwargs['base_model_prefix'] + + self.build_backbone(self.backbone_cfg) + self.build_head(self.head_cfg) + + def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: + + # backbone do not need labels, only head need for loss compute + labels = input.pop(OutputKeys.LABELS, None) + + outputs = super().forward(input) + sequence_output, pooled_output = self.extract_backbone_outputs(outputs) + outputs = self.head.forward(sequence_output) + + if labels is not None: + input[OutputKeys.LABELS] = labels + loss = self.compute_loss(outputs, labels) + outputs.update(loss) + outputs[OutputKeys.INPUT_IDS] = input[OutputKeys.INPUT_IDS] + return outputs diff --git a/modelscope/models/nlp/task_models/information_extraction.py b/modelscope/models/nlp/task_models/information_extraction.py index 4792d07c..0a7d5a47 100644 --- a/modelscope/models/nlp/task_models/information_extraction.py +++ b/modelscope/models/nlp/task_models/information_extraction.py @@ -26,21 +26,12 @@ class InformationExtractionModel(SingleBackboneTaskModelBase): """ super().__init__(model_dir, *args, **kwargs) - backbone_cfg = self.cfg.backbone - head_cfg = self.cfg.head - self.build_backbone(backbone_cfg) - self.build_head(head_cfg) + self.build_backbone(self.backbone_cfg) + self.build_head(self.head_cfg) - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: + def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: outputs = super().forward(input) sequence_output, pooled_output = self.extract_backbone_outputs(outputs) outputs = self.head.forward(sequence_output, input['text'], input['offsets']) return {OutputKeys.SPO_LIST: outputs} - - def extract_backbone_outputs(self, outputs): - sequence_output = None - pooled_output = None - if hasattr(self.backbone, 'extract_sequence_outputs'): - sequence_output = self.backbone.extract_sequence_outputs(outputs) - return sequence_output, pooled_output diff --git a/modelscope/models/nlp/task_models/sequence_classification.py b/modelscope/models/nlp/task_models/sequence_classification.py index 43a96327..1f5e46c3 100644 --- a/modelscope/models/nlp/task_models/sequence_classification.py +++ b/modelscope/models/nlp/task_models/sequence_classification.py @@ -11,10 +11,14 @@ from modelscope.models.nlp.task_models.task_model import \ SingleBackboneTaskModelBase from modelscope.outputs import OutputKeys from modelscope.utils.constant import Tasks +from modelscope.utils.hub import parse_label_mapping __all__ = ['SequenceClassificationModel'] +@MODELS.register_module( + Tasks.sentence_similarity, module_name=TaskModels.text_classification) +@MODELS.register_module(Tasks.nli, module_name=TaskModels.text_classification) @MODELS.register_module( Tasks.sentiment_classification, module_name=TaskModels.text_classification) @MODELS.register_module( @@ -31,49 +35,36 @@ class SequenceClassificationModel(SingleBackboneTaskModelBase): if 'base_model_prefix' in kwargs: self._base_model_prefix = kwargs['base_model_prefix'] - backbone_cfg = self.cfg.backbone - head_cfg = self.cfg.head - # get the num_labels from label_mapping.json self.id2label = {} - self.label_path = os.path.join(model_dir, 'label_mapping.json') - if os.path.exists(self.label_path): - with open(self.label_path) as f: - self.label_mapping = json.load(f) - self.id2label = { - idx: name - for name, idx in self.label_mapping.items() - } - head_cfg['num_labels'] = len(self.label_mapping) + # get the 
num_labels + num_labels = kwargs.get('num_labels') + if num_labels is None: + label2id = parse_label_mapping(model_dir) + if label2id is not None and len(label2id) > 0: + num_labels = len(label2id) + self.id2label = {id: label for label, id in label2id.items()} + self.head_cfg['num_labels'] = num_labels - self.build_backbone(backbone_cfg) - self.build_head(head_cfg) + self.build_backbone(self.backbone_cfg) + self.build_head(self.head_cfg) def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: + # backbone do not need labels, only head need for loss compute + labels = input.pop(OutputKeys.LABELS, None) + outputs = super().forward(input) sequence_output, pooled_output = self.extract_backbone_outputs(outputs) outputs = self.head.forward(pooled_output) - if 'labels' in input: - loss = self.compute_loss(outputs, input['labels']) + if labels is not None: + input[OutputKeys.LABELS] = labels + loss = self.compute_loss(outputs, labels) outputs.update(loss) return outputs def extract_logits(self, outputs): return outputs[OutputKeys.LOGITS].cpu().detach() - def extract_backbone_outputs(self, outputs): - sequence_output = None - pooled_output = None - if hasattr(self.backbone, 'extract_sequence_outputs'): - sequence_output = self.backbone.extract_sequence_outputs(outputs) - if hasattr(self.backbone, 'extract_pooled_outputs'): - pooled_output = self.backbone.extract_pooled_outputs(outputs) - return sequence_output, pooled_output - - def compute_loss(self, outputs, labels): - loss = self.head.compute_loss(outputs, labels) - return loss - def postprocess(self, input, **kwargs): logits = self.extract_logits(input) probs = logits.softmax(-1).numpy() diff --git a/modelscope/models/nlp/task_models/task_model.py b/modelscope/models/nlp/task_models/task_model.py index e93dd5f6..0b43044f 100644 --- a/modelscope/models/nlp/task_models/task_model.py +++ b/modelscope/models/nlp/task_models/task_model.py @@ -74,7 +74,7 @@ class BaseTaskModel(TorchModel, ABC): def __init__(self, model_dir: str, *args, **kwargs): super().__init__(model_dir, *args, **kwargs) - self.cfg = ConfigDict(kwargs) + self.config = ConfigDict(kwargs) def __repr__(self): # only log backbone and head name @@ -397,6 +397,9 @@ class SingleBackboneTaskModelBase(BaseTaskModel): def __init__(self, model_dir: str, *args, **kwargs): super().__init__(model_dir, *args, **kwargs) + self.backbone_cfg = self.config.get('backbone', None) + assert self.backbone_cfg is not None + self.head_cfg = self.config.get('head', None) def build_backbone(self, cfg): if 'prefix' in cfg: @@ -405,9 +408,13 @@ class SingleBackboneTaskModelBase(BaseTaskModel): setattr(self, cfg['prefix'], backbone) def build_head(self, cfg): + if cfg is None: + raise ValueError( + 'Head config is missing, check if this was a backbone-only model' + ) if 'prefix' in cfg: self._head_prefix = cfg['prefix'] - head = build_head(cfg) + head = build_head(cfg, group_key=self.group_key) setattr(self, self._head_prefix, head) return head @@ -431,8 +438,18 @@ class SingleBackboneTaskModelBase(BaseTaskModel): outputs = self.backbone.forward(**input) return outputs - def compute_loss(self, outputs: Dict[str, Any], labels): - raise NotImplementedError() + def compute_loss(self, outputs, labels): + loss = self.head.compute_loss(outputs, labels) + return loss + + def extract_backbone_outputs(self, outputs): + sequence_output = None + pooled_output = None + if hasattr(self.backbone, 'extract_sequence_outputs'): + sequence_output = self.backbone.extract_sequence_outputs(outputs) + if 
hasattr(self.backbone, 'extract_pooled_outputs'): + pooled_output = self.backbone.extract_pooled_outputs(outputs) + return sequence_output, pooled_output class EncoderDecoderTaskModelBase(BaseTaskModel): @@ -453,7 +470,7 @@ class EncoderDecoderTaskModelBase(BaseTaskModel): def build_encoder(self): encoder = build_backbone( - self.cfg, + self.config, type_name=self._encoder_key_in_cfg, task_name=Tasks.backbone) setattr(self, self._encoder_prefix, encoder) @@ -461,7 +478,7 @@ class EncoderDecoderTaskModelBase(BaseTaskModel): def build_decoder(self): decoder = build_backbone( - self.cfg, + self.config, type_name=self._decoder_key_in_cfg, task_name=Tasks.backbone) setattr(self, self._decoder_prefix, decoder) diff --git a/modelscope/models/nlp/task_models/token_classification.py b/modelscope/models/nlp/task_models/token_classification.py index 5c22098f..f3930182 100644 --- a/modelscope/models/nlp/task_models/token_classification.py +++ b/modelscope/models/nlp/task_models/token_classification.py @@ -31,9 +31,6 @@ class TokenClassificationModel(SingleBackboneTaskModelBase): if 'base_model_prefix' in kwargs: self._base_model_prefix = kwargs['base_model_prefix'] - backbone_cfg = self.cfg.backbone - head_cfg = self.cfg.head - # get the num_labels num_labels = kwargs.get('num_labels') if num_labels is None: @@ -41,12 +38,12 @@ class TokenClassificationModel(SingleBackboneTaskModelBase): if label2id is not None and len(label2id) > 0: num_labels = len(label2id) self.id2label = {id: label for label, id in label2id.items()} - head_cfg['num_labels'] = num_labels + self.head_cfg['num_labels'] = num_labels - self.build_backbone(backbone_cfg) - self.build_head(head_cfg) + self.build_backbone(self.backbone_cfg) + self.build_head(self.head_cfg) - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: + def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: labels = None if OutputKeys.LABEL in input: labels = input.pop(OutputKeys.LABEL) @@ -71,10 +68,6 @@ class TokenClassificationModel(SingleBackboneTaskModelBase): sequence_output = self.backbone.extract_sequence_outputs(outputs) return sequence_output, pooled_output - def compute_loss(self, outputs, labels): - loss = self.head.compute_loss(outputs, labels) - return loss - def postprocess(self, input, **kwargs): logits = self.extract_logits(input) pred = torch.argmax(logits[0], dim=-1) diff --git a/modelscope/models/nlp/token_classification.py b/modelscope/models/nlp/token_classification.py index c3723a61..c63e8037 100644 --- a/modelscope/models/nlp/token_classification.py +++ b/modelscope/models/nlp/token_classification.py @@ -10,12 +10,13 @@ from torch import nn from modelscope.metainfo import Models from modelscope.models.base import TorchModel from modelscope.models.builder import MODELS +from modelscope.models.nlp.bert import BertPreTrainedModel +from modelscope.models.nlp.structbert import SbertPreTrainedModel from modelscope.outputs import OutputKeys from modelscope.utils.constant import Tasks from modelscope.utils.hub import parse_label_mapping from modelscope.utils.tensor_utils import (torch_nested_detach, torch_nested_numpify) -from .structbert import SbertPreTrainedModel __all__ = ['SbertForTokenClassification'] @@ -171,3 +172,49 @@ class SbertForTokenClassification(TokenClassification, SbertPreTrainedModel): pretrained_model_name_or_path=kwargs.get('model_dir'), model_dir=kwargs.get('model_dir'), **model_args) + + +@MODELS.register_module(Tasks.word_segmentation, module_name=Models.bert) 
+@MODELS.register_module(Tasks.token_classification, module_name=Models.bert) +class BertForSequenceClassification(TokenClassification, BertPreTrainedModel): + """Bert token classification model. + + Inherited from TokenClassificationBase. + """ + base_model_prefix: str = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def __init__(self, config, model_dir): + if hasattr(config, 'base_model_prefix'): + BertForSequenceClassification.base_model_prefix = config.base_model_prefix + super().__init__(config, model_dir) + + def build_base_model(self): + from .bert import BertModel + return BertModel(self.config, add_pooling_layer=True) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + labels=labels, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs) diff --git a/modelscope/outputs.py b/modelscope/outputs.py index ce9e8d07..357afd07 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -417,6 +417,22 @@ TASK_OUTPUTS = { # } Tasks.fill_mask: [OutputKeys.TEXT], + # feature extraction result for single sample + # { + # "text_embedding": [[ + # [1.08599677e-04, 1.72710388e-05, 2.95618793e-05, 1.93638436e-04], + # [6.45841064e-05, 1.15997791e-04, 5.11605394e-05, 9.87020373e-01], + # [2.66957268e-05, 4.72324500e-05, 9.74208378e-05, 4.18022355e-05] + # ], + # [ + # [2.97343540e-05, 5.81317654e-05, 5.44203431e-05, 6.28319322e-05], + # [8.24327726e-05, 4.66077945e-05, 5.32869453e-05, 4.16190960e-05], + # [3.61441926e-05, 3.38475402e-05, 3.44323053e-05, 5.70138109e-05] + # ] + # ] + # } + Tasks.feature_extraction: [OutputKeys.TEXT_EMBEDDING], + # (Deprecated) dialog intent prediction result for single sample # {'output': {'prediction': array([2.62349960e-03, 4.12110658e-03, 4.12748595e-05, 3.77560973e-05, # 1.08599677e-04, 1.72710388e-05, 2.95618793e-05, 1.93638436e-04, diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 51d50d51..4f6873b0 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -52,8 +52,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_vit_object-detection_coco'), Tasks.image_denoising: (Pipelines.image_denoise, 'damo/cv_nafnet_image-denoise_sidd'), - Tasks.text_classification: (Pipelines.sentiment_analysis, - 'damo/bert-base-sst2'), + Tasks.text_classification: + (Pipelines.sentiment_classification, + 'damo/nlp_structbert_sentiment-classification_chinese-base'), Tasks.text_generation: (Pipelines.text_generation, 'damo/nlp_palm2.0_text-generation_chinese-base'), Tasks.zero_shot_classification: @@ -80,6 +81,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.ocr_detection: (Pipelines.ocr_detection, 'damo/cv_resnet18_ocr-detection-line-level_damo'), Tasks.fill_mask: (Pipelines.fill_mask, 'damo/nlp_veco_fill-mask-large'), + Tasks.feature_extraction: (Pipelines.feature_extraction, + 'damo/pert_feature-extraction_base-test'), Tasks.action_recognition: (Pipelines.action_recognition, 'damo/cv_TAdaConv_action-recognition'), Tasks.action_detection: (Pipelines.action_detection, diff --git 
a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index a8edc21a..5267b5b2 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -11,12 +11,13 @@ if TYPE_CHECKING: from .dialog_state_tracking_pipeline import DialogStateTrackingPipeline from .document_segmentation_pipeline import DocumentSegmentationPipeline from .faq_question_answering_pipeline import FaqQuestionAnsweringPipeline + from .feature_extraction_pipeline import FeatureExtractionPipeline from .fill_mask_pipeline import FillMaskPipeline from .fill_mask_ponet_pipeline import FillMaskPonetPipeline from .information_extraction_pipeline import InformationExtractionPipeline from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline - from .pair_sentence_classification_pipeline import PairSentenceClassificationPipeline - from .single_sentence_classification_pipeline import SingleSentenceClassificationPipeline + from .passage_ranking_pipeline import PassageRankingPipeline + from .sentence_embedding_pipeline import SentenceEmbeddingPipeline from .sequence_classification_pipeline import SequenceClassificationPipeline from .summarization_pipeline import SummarizationPipeline from .text_classification_pipeline import TextClassificationPipeline @@ -27,8 +28,7 @@ if TYPE_CHECKING: from .translation_pipeline import TranslationPipeline from .word_segmentation_pipeline import WordSegmentationPipeline from .zero_shot_classification_pipeline import ZeroShotClassificationPipeline - from .passage_ranking_pipeline import PassageRankingPipeline - from .sentence_embedding_pipeline import SentenceEmbeddingPipeline + else: _import_structure = { 'conversational_text_to_sql_pipeline': @@ -41,16 +41,15 @@ else: 'dialog_state_tracking_pipeline': ['DialogStateTrackingPipeline'], 'document_segmentation_pipeline': ['DocumentSegmentationPipeline'], 'faq_question_answering_pipeline': ['FaqQuestionAnsweringPipeline'], + 'feature_extraction_pipeline': ['FeatureExtractionPipeline'], 'fill_mask_pipeline': ['FillMaskPipeline'], 'fill_mask_ponet_pipeline': ['FillMaskPoNetPipeline'], + 'information_extraction_pipeline': ['InformationExtractionPipeline'], 'named_entity_recognition_pipeline': ['NamedEntityRecognitionPipeline'], - 'information_extraction_pipeline': ['InformationExtractionPipeline'], - 'pair_sentence_classification_pipeline': - ['PairSentenceClassificationPipeline'], + 'passage_ranking_pipeline': ['PassageRankingPipeline'], + 'sentence_embedding_pipeline': ['SentenceEmbeddingPipeline'], 'sequence_classification_pipeline': ['SequenceClassificationPipeline'], - 'single_sentence_classification_pipeline': - ['SingleSentenceClassificationPipeline'], 'summarization_pipeline': ['SummarizationPipeline'], 'text_classification_pipeline': ['TextClassificationPipeline'], 'text_error_correction_pipeline': ['TextErrorCorrectionPipeline'], @@ -61,8 +60,6 @@ else: 'word_segmentation_pipeline': ['WordSegmentationPipeline'], 'zero_shot_classification_pipeline': ['ZeroShotClassificationPipeline'], - 'passage_ranking_pipeline': ['PassageRankingPipeline'], - 'sentence_embedding_pipeline': ['SentenceEmbeddingPipeline'] } import sys diff --git a/modelscope/pipelines/nlp/feature_extraction_pipeline.py b/modelscope/pipelines/nlp/feature_extraction_pipeline.py new file mode 100644 index 00000000..3af0c28d --- /dev/null +++ b/modelscope/pipelines/nlp/feature_extraction_pipeline.py @@ -0,0 +1,82 @@ +import os +from typing import Any, Dict, Optional, Union + +import torch + +from modelscope.metainfo 
import Pipelines +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import NLPPreprocessor, Preprocessor +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks + +__all__ = ['FeatureExtractionPipeline'] + + +@PIPELINES.register_module( + Tasks.feature_extraction, module_name=Pipelines.feature_extraction) +class FeatureExtractionPipeline(Pipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + first_sequence='sentence', + **kwargs): + """Use `model` and `preprocessor` to create a nlp feature extraction pipeline for prediction + + Args: + model (str or Model): Supply either a local model dir which supported feature extraction task, or a + no-head model id from the model hub, or a torch model instance. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. + first_sequence: The key to read the sentence in. + sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. + + NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' + param will have no effect. + + Example: + >>> from modelscope.pipelines import pipeline + >>> pipe_ins = pipeline('feature_extraction', model='damo/nlp_structbert_feature-extraction_english-large') + >>> input = 'Everything you love is treasure' + >>> print(pipe_ins(input)) + + + """ + model = model if isinstance(model, + Model) else Model.from_pretrained(model) + + if preprocessor is None: + preprocessor = NLPPreprocessor( + model.model_dir, + padding=kwargs.pop('padding', False), + sequence_length=kwargs.pop('sequence_length', 128)) + model.eval() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + + self.preprocessor = preprocessor + self.config = Config.from_file( + os.path.join(model.model_dir, ModelFile.CONFIGURATION)) + self.tokenizer = preprocessor.tokenizer + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return self.model(**inputs, **forward_params) + + def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]: + """process the prediction results + + Args: + inputs (Dict[str, Any]): _description_ + + Returns: + Dict[str, str]: the prediction results + """ + + return { + OutputKeys.TEXT_EMBEDDING: + inputs[OutputKeys.TEXT_EMBEDDING].tolist() + } diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py index 12f4b80f..3d515e2d 100644 --- a/modelscope/pipelines/nlp/fill_mask_pipeline.py +++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py @@ -10,7 +10,7 @@ from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import FillMaskPreprocessor, Preprocessor +from modelscope.preprocessors import NLPPreprocessor, Preprocessor from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks @@ -57,7 +57,7 @@ class FillMaskPipeline(Pipeline): model, Model) else Model.from_pretrained(model) if preprocessor is None: - preprocessor = FillMaskPreprocessor( + preprocessor = NLPPreprocessor( fill_mask_model.model_dir, 
first_sequence=first_sequence, second_sequence=None, @@ -118,7 +118,10 @@ class FillMaskPipeline(Pipeline): logits = inputs[OutputKeys.LOGITS].detach().cpu().numpy() input_ids = inputs[OutputKeys.INPUT_IDS].detach().cpu().numpy() pred_ids = np.argmax(logits, axis=-1) - model_type = self.model.config.model_type + if hasattr(self.model.config, 'backbone'): + model_type = self.model.config.backbone.type + else: + model_type = self.model.config.model_type process_type = model_type if model_type in self.mask_id else _type_map[ model_type] rst_ids = np.where(input_ids == self.mask_id[process_type], pred_ids, diff --git a/modelscope/pipelines/nlp/information_extraction_pipeline.py b/modelscope/pipelines/nlp/information_extraction_pipeline.py index 07223d07..763e941c 100644 --- a/modelscope/pipelines/nlp/information_extraction_pipeline.py +++ b/modelscope/pipelines/nlp/information_extraction_pipeline.py @@ -36,7 +36,7 @@ class InformationExtractionPipeline(Pipeline): def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: with torch.no_grad(): - return super().forward(inputs, **forward_params) + return self.model(**inputs, **forward_params) def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]: diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py index 467d7aba..7275feca 100644 --- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py +++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py @@ -9,7 +9,8 @@ from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import NERPreprocessor, Preprocessor +from modelscope.preprocessors import (Preprocessor, + TokenClassificationPreprocessor) from modelscope.utils.constant import Tasks __all__ = ['NamedEntityRecognitionPipeline'] @@ -46,7 +47,7 @@ class NamedEntityRecognitionPipeline(Pipeline): model = model if isinstance(model, Model) else Model.from_pretrained(model) if preprocessor is None: - preprocessor = NERPreprocessor( + preprocessor = TokenClassificationPreprocessor( model.model_dir, sequence_length=kwargs.pop('sequence_length', 512)) model.eval() diff --git a/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py deleted file mode 100644 index bdb75c73..00000000 --- a/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
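A side note on the fill-mask postprocess change above: the mask-token lookup now handles both the new backbone-style task models and plain transformers configs. A rough illustration of that branching, using hypothetical stand-in config objects rather than real ModelScope classes:

>>> class _Backbone:              # stand-in for a task-model backbone config
...     type = 'bert'
>>> class _TaskModelConfig:       # config carrying a nested backbone section
...     backbone = _Backbone()
>>> class _PlainConfig:           # ordinary transformers-style config
...     model_type = 'veco'
>>> def resolve_model_type(config):
...     # mirrors FillMaskPipeline.postprocess: prefer backbone.type when present
...     return config.backbone.type if hasattr(config, 'backbone') else config.model_type
>>> resolve_model_type(_TaskModelConfig())
'bert'
>>> resolve_model_type(_PlainConfig())
'veco'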
- -from typing import Union - -from modelscope.models.base import Model -from ...metainfo import Pipelines -from ...preprocessors import (PairSentenceClassificationPreprocessor, - Preprocessor) -from ...utils.constant import Tasks -from ..builder import PIPELINES -from .sequence_classification_pipeline_base import \ - SequenceClassificationPipelineBase - -__all__ = ['PairSentenceClassificationPipeline'] - - -@PIPELINES.register_module(Tasks.nli, module_name=Pipelines.nli) -@PIPELINES.register_module( - Tasks.sentence_similarity, module_name=Pipelines.sentence_similarity) -class PairSentenceClassificationPipeline(SequenceClassificationPipelineBase): - - def __init__(self, - model: Union[Model, str], - preprocessor: Preprocessor = None, - first_sequence='first_sequence', - second_sequence='second_sequence', - **kwargs): - """Use `model` and `preprocessor` to create a nlp pair sequence classification pipeline for prediction. - - Args: - model (str or Model): Supply either a local model dir which supported the sequence classification task, - or a model id from the model hub, or a torch model instance. - preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for - the model if supplied. - first_sequence: The key to read the first sentence in. - second_sequence: The key to read the second sentence in. - sequence_length: Max sequence length in the user's custom scenario. 512 will be used as a default value. - - NOTE: Inputs of type 'tuple' or 'list' are also supported. In this scenario, the 'first_sequence' and - 'second_sequence' param will have no effect. - - Example: - >>> from modelscope.pipelines import pipeline - >>> pipeline_ins = pipeline(task='nli', model='damo/nlp_structbert_nli_chinese-base') - >>> sentence1 = '四川商务职业学院和四川财经职业学院哪个好?' - >>> sentence2 = '四川商务职业学院商务管理在哪个校区?' - >>> print(pipeline_ins((sentence1, sentence2))) - >>> # Or use the dict input: - >>> print(pipeline_ins({'first_sequence': sentence1, 'second_sequence': sentence2})) - - To view other examples plese check the tests/pipelines/test_nli.py. 
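[Editor's sketch] The pair-sentence usage described in the removed docstring above is unchanged at the call site after this refactor; a minimal sketch, reusing the model id and example sentences from that docstring, of the same call now served by the consolidated SequenceClassificationPipeline:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Both input forms from the removed docstring still work; the nli/sentence-similarity
# registrations now point at SequenceClassificationPipeline instead of the deleted class.
pipeline_ins = pipeline(task=Tasks.nli, model='damo/nlp_structbert_nli_chinese-base')
sentence1 = '四川商务职业学院和四川财经职业学院哪个好？'
sentence2 = '四川商务职业学院商务管理在哪个校区？'
print(pipeline_ins((sentence1, sentence2)))
print(pipeline_ins({'first_sequence': sentence1, 'second_sequence': sentence2}))
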
- """ - if preprocessor is None: - preprocessor = PairSentenceClassificationPreprocessor( - model.model_dir if isinstance(model, Model) else model, - first_sequence=first_sequence, - second_sequence=second_sequence, - sequence_length=kwargs.pop('sequence_length', 512)) - super().__init__(model=model, preprocessor=preprocessor, **kwargs) diff --git a/modelscope/pipelines/nlp/sequence_classification_pipeline.py b/modelscope/pipelines/nlp/sequence_classification_pipeline.py index 7fe8aace..8d0e1dcd 100644 --- a/modelscope/pipelines/nlp/sequence_classification_pipeline.py +++ b/modelscope/pipelines/nlp/sequence_classification_pipeline.py @@ -1,48 +1,64 @@ from typing import Any, Dict, Union import numpy as np +import torch from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.models.nlp import BertForSequenceClassification +from modelscope.models.base import Model from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import SequenceClassificationPreprocessor +from modelscope.preprocessors import (Preprocessor, + SequenceClassificationPreprocessor) from modelscope.utils.constant import Tasks -__all__ = ['SequenceClassificationPipeline'] - @PIPELINES.register_module( Tasks.text_classification, module_name=Pipelines.sentiment_analysis) +@PIPELINES.register_module(Tasks.nli, module_name=Pipelines.nli) +@PIPELINES.register_module( + Tasks.sentence_similarity, module_name=Pipelines.sentence_similarity) +@PIPELINES.register_module( + Tasks.text_classification, module_name=Pipelines.sentiment_classification) class SequenceClassificationPipeline(Pipeline): def __init__(self, - model: Union[BertForSequenceClassification, str], - preprocessor: SequenceClassificationPreprocessor = None, + model: Union[Model, str], + preprocessor: Preprocessor = None, **kwargs): - """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction + """This is the base class for all the sequence classification sub-tasks. Args: - model (BertForSequenceClassification): a model instance - preprocessor (SequenceClassificationPreprocessor): a preprocessor instance + model (str or Model): A model instance or a model local dir or a model id in the model hub. + preprocessor (Preprocessor): a preprocessor instance, must not be None. """ - assert isinstance(model, str) or isinstance(model, BertForSequenceClassification), \ - 'model must be a single str or BertForSequenceClassification' - sc_model = model if isinstance( - model, - BertForSequenceClassification) else Model.from_pretrained(model) + assert isinstance(model, str) or isinstance(model, Model), \ + 'model must be a single str or Model' + model = model if isinstance(model, + Model) else Model.from_pretrained(model) + first_sequence = kwargs.pop('first_sequence', 'first_sequence') + second_sequence = kwargs.pop('second_sequence', None) + if preprocessor is None: preprocessor = SequenceClassificationPreprocessor( - sc_model.model_dir, - first_sequence='sentence', - second_sequence=None, + model.model_dir if isinstance(model, Model) else model, + first_sequence=first_sequence, + second_sequence=second_sequence, sequence_length=kwargs.pop('sequence_length', 512)) - super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs) - assert hasattr(self.model, 'id2label'), \ - 'id2label map should be initalizaed in init function.' 
+ assert preprocessor is not None + model.eval() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.id2label = kwargs.get('id2label') + if self.id2label is None and hasattr(self.preprocessor, 'id2label'): + self.id2label = self.preprocessor.id2label + assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ + 'as a parameter or make sure the preprocessor has the attribute.' + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return self.model(**inputs, **forward_params) def postprocess(self, inputs: Dict[str, Any], @@ -50,20 +66,18 @@ class SequenceClassificationPipeline(Pipeline): """process the prediction results Args: - inputs (Dict[str, Any]): input data dict - topk (int): return topk classification result. - + inputs (Dict[str, Any]): _description_ + topk (int): The topk probs to take Returns: Dict[str, str]: the prediction results """ - # NxC np.ndarray - probs = inputs['probs'][0] + + probs = inputs[OutputKeys.PROBABILITIES][0] num_classes = probs.shape[0] topk = min(topk, num_classes) top_indices = np.argpartition(probs, -topk)[-topk:] cls_ids = top_indices[np.argsort(probs[top_indices])] probs = probs[cls_ids].tolist() - cls_names = [self.model.id2label[cid] for cid in cls_ids] - + cls_names = [self.id2label[cid] for cid in cls_ids] return {OutputKeys.SCORES: probs, OutputKeys.LABELS: cls_names} diff --git a/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py b/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py deleted file mode 100644 index 3d8e8fea..00000000 --- a/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -from typing import Any, Dict, Union - -import numpy as np -import torch - -from modelscope.models.base import Model -from modelscope.outputs import OutputKeys -from ...preprocessors import Preprocessor -from ..base import Pipeline - - -class SequenceClassificationPipelineBase(Pipeline): - - def __init__(self, model: Union[Model, str], preprocessor: Preprocessor, - **kwargs): - """This is the base class for all the sequence classification sub-tasks. - - Args: - model (str or Model): A model instance or a model local dir or a model id in the model hub. - preprocessor (Preprocessor): a preprocessor instance, must not be None. - """ - assert isinstance(model, str) or isinstance(model, Model), \ - 'model must be a single str or Model' - model = model if isinstance(model, - Model) else Model.from_pretrained(model) - assert preprocessor is not None - model.eval() - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - self.id2label = kwargs.get('id2label') - if self.id2label is None and hasattr(self.preprocessor, 'id2label'): - self.id2label = self.preprocessor.id2label - assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ - 'as a parameter or make sure the preprocessor has the attribute.' 
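[Editor's sketch] A standalone illustration of the top-k selection used in the new postprocess above, with made-up probabilities and a hypothetical id2label map; np.argpartition returns the top-k class ids unordered, and the follow-up argsort puts them in ascending probability order:

import numpy as np

probs = np.array([0.05, 0.7, 0.1, 0.15])                                # assumed class probabilities
id2label = {0: 'neutral', 1: 'positive', 2: 'surprise', 3: 'negative'}  # hypothetical mapping
topk = min(3, probs.shape[0])
top_indices = np.argpartition(probs, -topk)[-topk:]    # unordered indices of the k largest probs
cls_ids = top_indices[np.argsort(probs[top_indices])]  # re-sorted, ascending by probability
print(probs[cls_ids].tolist())                         # -> [0.1, 0.15, 0.7]
print([id2label[i] for i in cls_ids])                  # -> ['surprise', 'negative', 'positive']
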
- - def forward(self, inputs: Dict[str, Any], - **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return self.model(**inputs, **forward_params) - - def postprocess(self, - inputs: Dict[str, Any], - topk: int = 5) -> Dict[str, str]: - """process the prediction results - - Args: - inputs (Dict[str, Any]): _description_ - topk (int): The topk probs to take - Returns: - Dict[str, str]: the prediction results - """ - - probs = inputs[OutputKeys.PROBABILITIES][0] - num_classes = probs.shape[0] - topk = min(topk, num_classes) - top_indices = np.argpartition(probs, -topk)[-topk:] - cls_ids = top_indices[np.argsort(probs[top_indices])] - probs = probs[cls_ids].tolist() - - cls_names = [self.id2label[cid] for cid in cls_ids] - return {OutputKeys.SCORES: probs, OutputKeys.LABELS: cls_names} diff --git a/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py deleted file mode 100644 index 0a2f6d25..00000000 --- a/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -from typing import Union - -from ...metainfo import Pipelines -from ...models import Model -from ...preprocessors import (Preprocessor, - SingleSentenceClassificationPreprocessor) -from ...utils.constant import Tasks -from ..builder import PIPELINES -from .sequence_classification_pipeline_base import \ - SequenceClassificationPipelineBase - -__all__ = ['SingleSentenceClassificationPipeline'] - - -@PIPELINES.register_module( - Tasks.sentiment_classification, - module_name=Pipelines.sentiment_classification) -class SingleSentenceClassificationPipeline(SequenceClassificationPipelineBase): - - def __init__(self, - model: Union[Model, str], - preprocessor: Preprocessor = None, - first_sequence='first_sequence', - **kwargs): - """Use `model` and `preprocessor` to create a nlp single sequence classification pipeline for prediction. - - Args: - model (str or Model): Supply either a local model dir which supported the sequence classification task, - or a model id from the model hub, or a torch model instance. - preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for - the model if supplied. - first_sequence: The key to read the first sentence in. - sequence_length: Max sequence length in the user's custom scenario. 512 will be used as a default value. - - NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' - param will have no effect. - - Example: - >>> from modelscope.pipelines import pipeline - >>> pipeline_ins = pipeline(task='sentiment-classification', - >>> model='damo/nlp_structbert_sentiment-classification_chinese-base') - >>> sentence1 = '启动的时候很大声音,然后就会听到1.2秒的卡察的声音,类似齿轮摩擦的声音' - >>> print(pipeline_ins(sentence1)) - >>> # Or use the dict input: - >>> print(pipeline_ins({'first_sequence': sentence1})) - - To view other examples plese check the tests/pipelines/test_sentiment-classification.py. 
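[Editor's sketch] For the single-sentence case removed here, the consolidated call path looks roughly like the following, mirroring the updated sentiment-classification test further below; the 'beta' revision is copied from that test and may not be required for other checkouts:

from modelscope.models import Model
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# The dedicated single-sentence pipeline is gone; the generic text-classification task
# now routes to SequenceClassificationPipeline.
model = Model.from_pretrained(
    'damo/nlp_structbert_sentiment-classification_chinese-base', revision='beta')
pipeline_ins = pipeline(task=Tasks.text_classification, model=model)
print(pipeline_ins('启动的时候很大声音,然后就会听到1.2秒的卡察的声音,类似齿轮摩擦的声音'))
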
- """ - if preprocessor is None: - preprocessor = SingleSentenceClassificationPreprocessor( - model.model_dir if isinstance(model, Model) else model, - first_sequence=first_sequence, - sequence_length=kwargs.pop('sequence_length', 512)) - super().__init__(model=model, preprocessor=preprocessor, **kwargs) diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py index aabf48d8..5367c1a8 100644 --- a/modelscope/pipelines/nlp/token_classification_pipeline.py +++ b/modelscope/pipelines/nlp/token_classification_pipeline.py @@ -49,7 +49,7 @@ class TokenClassificationPipeline(Pipeline): text = inputs.pop(OutputKeys.TEXT) with torch.no_grad(): return { - **self.model(inputs, **forward_params), OutputKeys.TEXT: text + **self.model(**inputs, **forward_params), OutputKeys.TEXT: text } def postprocess(self, inputs: Dict[str, Any], diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index b4be1845..90303b65 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -16,17 +16,23 @@ if TYPE_CHECKING: from .kws import WavToLists from .multi_modal import (OfaPreprocessor, MPlugPreprocessor) from .nlp import ( - Tokenize, SequenceClassificationPreprocessor, - TextGenerationPreprocessor, TokenClassificationPreprocessor, - SingleSentenceClassificationPreprocessor, - PairSentenceClassificationPreprocessor, FillMaskPreprocessor, - ZeroShotClassificationPreprocessor, NERPreprocessor, - TextErrorCorrectionPreprocessor, FaqQuestionAnsweringPreprocessor, - SequenceLabelingPreprocessor, RelationExtractionPreprocessor, - DocumentSegmentationPreprocessor, FillMaskPoNetPreprocessor, - PassageRankingPreprocessor, SentenceEmbeddingPreprocessor, + DocumentSegmentationPreprocessor, + FaqQuestionAnsweringPreprocessor, + FillMaskPoNetPreprocessor, + NLPPreprocessor, + NLPTokenizerPreprocessorBase, + PassageRankingPreprocessor, + RelationExtractionPreprocessor, + SentenceEmbeddingPreprocessor, + SequenceClassificationPreprocessor, + TokenClassificationPreprocessor, + TextErrorCorrectionPreprocessor, + TextGenerationPreprocessor, Text2TextGenerationPreprocessor, - WordSegmentationBlankSetToLabelPreprocessor) + Tokenize, + WordSegmentationBlankSetToLabelPreprocessor, + ZeroShotClassificationPreprocessor, + ) from .space import (DialogIntentPredictionPreprocessor, DialogModelingPreprocessor, DialogStateTrackingPreprocessor) @@ -49,18 +55,22 @@ else: 'kws': ['WavToLists'], 'multi_modal': ['OfaPreprocessor', 'MPlugPreprocessor'], 'nlp': [ - 'Tokenize', 'SequenceClassificationPreprocessor', - 'TextGenerationPreprocessor', 'TokenClassificationPreprocessor', - 'SingleSentenceClassificationPreprocessor', - 'PairSentenceClassificationPreprocessor', 'FillMaskPreprocessor', - 'ZeroShotClassificationPreprocessor', 'NERPreprocessor', - 'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor', - 'TextErrorCorrectionPreprocessor', - 'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor', + 'DocumentSegmentationPreprocessor', + 'FaqQuestionAnsweringPreprocessor', + 'FillMaskPoNetPreprocessor', + 'NLPPreprocessor', + 'NLPTokenizerPreprocessorBase', + 'PassageRankingPreprocessor', 'RelationExtractionPreprocessor', + 'SentenceEmbeddingPreprocessor', + 'SequenceClassificationPreprocessor', + 'TokenClassificationPreprocessor', + 'TextErrorCorrectionPreprocessor', + 'TextGenerationPreprocessor', + 'Tokenize', 'Text2TextGenerationPreprocessor', 
'WordSegmentationBlankSetToLabelPreprocessor', - 'DocumentSegmentationPreprocessor', 'FillMaskPoNetPreprocessor' + 'ZeroShotClassificationPreprocessor', ], 'space': [ 'DialogIntentPredictionPreprocessor', 'DialogModelingPreprocessor', diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index 8e75ae98..dfbb5c81 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -6,32 +6,41 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .text_error_correction import TextErrorCorrectionPreprocessor from .nlp_base import ( - Tokenize, SequenceClassificationPreprocessor, - TextGenerationPreprocessor, TokenClassificationPreprocessor, - SingleSentenceClassificationPreprocessor, - Text2TextGenerationPreprocessor, - PairSentenceClassificationPreprocessor, FillMaskPreprocessor, - ZeroShotClassificationPreprocessor, NERPreprocessor, - FaqQuestionAnsweringPreprocessor, SequenceLabelingPreprocessor, - RelationExtractionPreprocessor, DocumentSegmentationPreprocessor, - FillMaskPoNetPreprocessor, PassageRankingPreprocessor, + DocumentSegmentationPreprocessor, + FaqQuestionAnsweringPreprocessor, + FillMaskPoNetPreprocessor, + NLPPreprocessor, + NLPTokenizerPreprocessorBase, + PassageRankingPreprocessor, + RelationExtractionPreprocessor, SentenceEmbeddingPreprocessor, - WordSegmentationBlankSetToLabelPreprocessor) + SequenceClassificationPreprocessor, + TokenClassificationPreprocessor, + TextGenerationPreprocessor, + Text2TextGenerationPreprocessor, + Tokenize, + WordSegmentationBlankSetToLabelPreprocessor, + ZeroShotClassificationPreprocessor, + ) else: _import_structure = { 'nlp_base': [ - 'Tokenize', 'SequenceClassificationPreprocessor', - 'TextGenerationPreprocessor', 'TokenClassificationPreprocessor', - 'SingleSentenceClassificationPreprocessor', - 'PairSentenceClassificationPreprocessor', 'FillMaskPreprocessor', - 'ZeroShotClassificationPreprocessor', 'NERPreprocessor', - 'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor', - 'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor', + 'DocumentSegmentationPreprocessor', + 'FaqQuestionAnsweringPreprocessor', + 'FillMaskPoNetPreprocessor', + 'NLPPreprocessor', + 'NLPTokenizerPreprocessorBase', + 'PassageRankingPreprocessor', 'RelationExtractionPreprocessor', + 'SentenceEmbeddingPreprocessor', + 'SequenceClassificationPreprocessor', + 'TokenClassificationPreprocessor', + 'TextGenerationPreprocessor', + 'Tokenize', 'Text2TextGenerationPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', - 'DocumentSegmentationPreprocessor', 'FillMaskPoNetPreprocessor' + 'ZeroShotClassificationPreprocessor', ], 'text_error_correction': [ 'TextErrorCorrectionPreprocessor', diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index d6325eed..6b559de9 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -2,14 +2,13 @@ import os.path as osp import re -import uuid from typing import Any, Dict, Iterable, Optional, Tuple, Union import numpy as np -from transformers import AutoTokenizer, BertTokenizerFast +import torch +from transformers import AutoTokenizer from modelscope.metainfo import Models, Preprocessors -from modelscope.models.nlp.structbert import SbertTokenizerFast from modelscope.outputs import OutputKeys from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS @@ -23,24 
+22,21 @@ from modelscope.utils.type_assert import type_assert logger = get_logger() __all__ = [ - 'Tokenize', - 'SequenceClassificationPreprocessor', - 'TextGenerationPreprocessor', - 'TokenClassificationPreprocessor', - 'PairSentenceClassificationPreprocessor', - 'Text2TextGenerationPreprocessor', - 'SingleSentenceClassificationPreprocessor', - 'FillMaskPreprocessor', - 'ZeroShotClassificationPreprocessor', - 'NERPreprocessor', - 'SentenceEmbeddingPreprocessor', - 'PassageRankingPreprocessor', - 'FaqQuestionAnsweringPreprocessor', - 'SequenceLabelingPreprocessor', - 'RelationExtractionPreprocessor', 'DocumentSegmentationPreprocessor', + 'FaqQuestionAnsweringPreprocessor', + 'NLPPreprocessor', 'FillMaskPoNetPreprocessor', + 'NLPTokenizerPreprocessorBase', + 'PassageRankingPreprocessor', + 'RelationExtractionPreprocessor', + 'SentenceEmbeddingPreprocessor', + 'SequenceClassificationPreprocessor', + 'TokenClassificationPreprocessor', + 'Text2TextGenerationPreprocessor', + 'TextGenerationPreprocessor', + 'Tokenize', 'WordSegmentationBlankSetToLabelPreprocessor', + 'ZeroShotClassificationPreprocessor', ] @@ -48,85 +44,19 @@ __all__ = [ class Tokenize(Preprocessor): def __init__(self, tokenizer_name) -> None: - self._tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) def __call__(self, data: Union[str, Dict[str, Any]]) -> Dict[str, Any]: if isinstance(data, str): data = {InputFields.text: data} - token_dict = self._tokenizer(data[InputFields.text]) + token_dict = self.tokenizer(data[InputFields.text]) data.update(token_dict) return data -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer) -class SequenceClassificationPreprocessor(Preprocessor): - - def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data - - Args: - model_dir (str): model path - """ - - super().__init__(*args, **kwargs) - - from easynlp.modelzoo import AutoTokenizer - self.model_dir: str = model_dir - self.first_sequence: str = kwargs.pop('first_sequence', - 'first_sequence') - self.second_sequence = kwargs.pop('second_sequence', 'second_sequence') - self.sequence_length = kwargs.pop('sequence_length', 128) - - self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir) - print(f'this is the tokenzier {self.tokenizer}') - self.label2id = parse_label_mapping(self.model_dir) - - @type_assert(object, (str, tuple, Dict)) - def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]: - feature = super().__call__(data) - if isinstance(data, str): - new_data = {self.first_sequence: data} - elif isinstance(data, tuple): - sentence1, sentence2 = data - new_data = { - self.first_sequence: sentence1, - self.second_sequence: sentence2 - } - else: - new_data = data - - # preprocess the data for the model input - - rst = { - 'id': [], - 'input_ids': [], - 'attention_mask': [], - 'token_type_ids': [], - } - - max_seq_length = self.sequence_length - - text_a = new_data[self.first_sequence] - text_b = new_data.get(self.second_sequence, None) - - feature = self.tokenizer( - text_a, - text_b, - padding='max_length', - truncation=True, - max_length=max_seq_length) - - rst['id'].append(new_data.get('id', str(uuid.uuid4()))) - rst['input_ids'].append(feature['input_ids']) - rst['attention_mask'].append(feature['attention_mask']) - rst['token_type_ids'].append(feature['token_type_ids']) - return rst - - class NLPTokenizerPreprocessorBase(Preprocessor): - def __init__(self, model_dir: str, pair: bool, 
mode: str, **kwargs): + def __init__(self, model_dir: str, mode: str, **kwargs): """The NLP tokenizer preprocessor base class. Any nlp preprocessor which uses the hf tokenizer can inherit from this class. @@ -138,7 +68,6 @@ class NLPTokenizerPreprocessorBase(Preprocessor): label: The label key label2id: An optional label2id mapping, the class will try to call utils.parse_label_mapping if this mapping is not supplied. - pair (bool): Pair sentence input or single sentence input. mode: Run this preprocessor in either 'train'/'eval'/'inference' mode kwargs: These kwargs will be directly fed into the tokenizer. """ @@ -148,7 +77,8 @@ class NLPTokenizerPreprocessorBase(Preprocessor): self.first_sequence: str = kwargs.pop('first_sequence', 'first_sequence') self.second_sequence = kwargs.pop('second_sequence', 'second_sequence') - self.pair = pair + self.sequence_length = kwargs.pop('sequence_length', 128) + self._mode = mode self.label = kwargs.pop('label', OutputKeys.LABEL) self.label2id = None @@ -158,6 +88,7 @@ class NLPTokenizerPreprocessorBase(Preprocessor): self.label2id = parse_label_mapping(self.model_dir) self.tokenize_kwargs = kwargs + self.tokenizer = self.build_tokenizer(model_dir) @property @@ -179,20 +110,38 @@ class NLPTokenizerPreprocessorBase(Preprocessor): @param model_dir: The local model dir. @return: The initialized tokenizer. """ - + self.is_transformer_based_model = 'lstm' not in model_dir + # fast version lead to parallel inference failed model_type = get_model_type(model_dir) if model_type in (Models.structbert, Models.gpt3, Models.palm, Models.plug): - from modelscope.models.nlp.structbert import SbertTokenizer - return SbertTokenizer.from_pretrained(model_dir, use_fast=False) + from modelscope.models.nlp.structbert import SbertTokenizer, SbertTokenizerFast + return SbertTokenizer.from_pretrained( + model_dir + ) if self._mode == ModeKeys.INFERENCE else SbertTokenizerFast.from_pretrained( + model_dir) elif model_type == Models.veco: - from modelscope.models.nlp.veco import VecoTokenizer - return VecoTokenizer.from_pretrained(model_dir) + from modelscope.models.nlp.veco import VecoTokenizer, VecoTokenizerFast + return VecoTokenizer.from_pretrained( + model_dir + ) if self._mode == ModeKeys.INFERENCE else VecoTokenizerFast.from_pretrained( + model_dir) elif model_type == Models.deberta_v2: - from modelscope.models.nlp.deberta_v2 import DebertaV2Tokenizer - return DebertaV2Tokenizer.from_pretrained(model_dir) + from modelscope.models.nlp.deberta_v2 import DebertaV2Tokenizer, DebertaV2TokenizerFast + return DebertaV2Tokenizer.from_pretrained( + model_dir + ) if self._mode == ModeKeys.INFERENCE else DebertaV2TokenizerFast.from_pretrained( + model_dir) + elif not self.is_transformer_based_model: + from transformers import BertTokenizer, BertTokenizerFast + return BertTokenizer.from_pretrained( + model_dir + ) if self._mode == ModeKeys.INFERENCE else BertTokenizerFast.from_pretrained( + model_dir) else: - return AutoTokenizer.from_pretrained(model_dir, use_fast=False) + return AutoTokenizer.from_pretrained( + model_dir, + use_fast=False if self._mode == ModeKeys.INFERENCE else True) def __call__(self, data: Union[str, Tuple, Dict]) -> Dict[str, Any]: """process the raw input data @@ -239,7 +188,7 @@ class NLPTokenizerPreprocessorBase(Preprocessor): if len(data) == 3: text_a, text_b, labels = data elif len(data) == 2: - if self.pair: + if self._mode == ModeKeys.INFERENCE: text_a, text_b = data else: text_a, labels = data @@ -277,6 +226,22 @@ class 
NLPTokenizerPreprocessorBase(Preprocessor): output[OutputKeys.LABELS] = labels +@PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.fill_mask) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.feature_extraction) +class NLPPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in MLM task. + """ + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + True) + super().__init__(model_dir, mode=mode, **kwargs) + + @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.passage_ranking) class PassageRankingPreprocessor(NLPTokenizerPreprocessorBase): @@ -337,22 +302,12 @@ class PassageRankingPreprocessor(NLPTokenizerPreprocessorBase): Fields.nlp, module_name=Preprocessors.nli_tokenizer) @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.sen_sim_tokenizer) -class PairSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in pair sentence classification. - """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get( - 'padding', False if mode == ModeKeys.INFERENCE else 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, pair=True, mode=mode, **kwargs) - - +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer) @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer) -class SingleSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in single sentence classification. +class SequenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in sequence classification. 
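[Editor's sketch] With the `pair` flag removed, whether an input is a single sentence or a sentence pair is inferred from the data itself at inference time (a 2-tuple becomes text_a/text_b, a dict is read via first_sequence/second_sequence); a small sketch with the model directory as a placeholder:

from modelscope.preprocessors import SequenceClassificationPreprocessor

# '/path/to/model_dir' is a placeholder for a local StructBERT-style checkout.
preprocessor = SequenceClassificationPreprocessor('/path/to/model_dir')
single = preprocessor('启动的时候很大声音,然后就会听到1.2秒的卡察的声音,类似齿轮摩擦的声音')
pair = preprocessor(('四川商务职业学院和四川财经职业学院哪个好？',
                     '四川商务职业学院商务管理在哪个校区？'))  # a 2-tuple is treated as a sentence pair
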
""" def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): @@ -360,7 +315,7 @@ class SingleSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): kwargs['padding'] = kwargs.get( 'padding', False if mode == ModeKeys.INFERENCE else 'max_length') kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, pair=False, mode=mode, **kwargs) + super().__init__(model_dir, mode=mode, **kwargs) @PREPROCESSORS.register_module( @@ -421,7 +376,7 @@ class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase): model_dir (str): model path """ self.sequence_length = kwargs.pop('sequence_length', 512) - super().__init__(model_dir, pair=False, mode=mode, **kwargs) + super().__init__(model_dir, mode=mode, **kwargs) def __call__(self, data: Union[str, Dict], hypothesis_template: str, candidate_labels: list) -> Dict[str, Any]: @@ -496,14 +451,12 @@ class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): tokenizer=None, mode=ModeKeys.INFERENCE, **kwargs): - self.tokenizer = self.build_tokenizer( - model_dir) if tokenizer is None else tokenizer kwargs['truncation'] = kwargs.get('truncation', True) kwargs['padding'] = kwargs.get('padding', 'max_length') kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', False) kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, pair=False, mode=mode, **kwargs) + super().__init__(model_dir, mode=mode, **kwargs) @staticmethod def get_roberta_tokenizer_dir(model_dir: str) -> Optional[str]: @@ -541,20 +494,6 @@ class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): } -@PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.fill_mask) -class FillMaskPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in MLM task. - """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', - True) - super().__init__(model_dir, pair=False, mode=mode, **kwargs) - - @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.word_segment_text_to_label_preprocessor) @@ -592,21 +531,40 @@ class WordSegmentationBlankSetToLabelPreprocessor(Preprocessor): } +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.ner_tokenizer) @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.token_cls_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sequence_labeling_tokenizer) class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in normal token classification task. + """The tokenizer preprocessor used in normal NER task. 
""" def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + """ kwargs['truncation'] = kwargs.get('truncation', True) kwargs['padding'] = kwargs.get( 'padding', False if mode == ModeKeys.INFERENCE else 'max_length') kwargs['max_length'] = kwargs.pop('sequence_length', 128) self.label_all_tokens = kwargs.pop('label_all_tokens', False) - super().__init__(model_dir, pair=False, mode=mode, **kwargs) + super().__init__(model_dir, mode=mode, **kwargs) - def __call__(self, data: Union[str, Dict]) -> Dict[str, Any]: + if 'is_split_into_words' in kwargs: + self.is_split_into_words = kwargs.pop('is_split_into_words') + else: + self.is_split_into_words = self.tokenizer.init_kwargs.get( + 'is_split_into_words', False) + if 'label2id' in kwargs: + kwargs.pop('label2id') + self.tokenize_kwargs = kwargs + + @type_assert(object, str) + def __call__(self, data: str) -> Dict[str, Any]: """process the raw input data Args: @@ -618,23 +576,84 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): Dict[str, Any]: the preprocessed data """ - text_a = None + # preprocess the data for the model input + text = None labels_list = None if isinstance(data, str): - text_a = data + text = data elif isinstance(data, dict): - text_a = data.get(self.first_sequence) + text = data.get(self.first_sequence) labels_list = data.get(self.label) - if isinstance(text_a, str): - text_a = text_a.replace(' ', '').strip() + input_ids = [] + label_mask = [] + offset_mapping = [] + if self.is_split_into_words: + for offset, token in enumerate(list(data)): + subtoken_ids = self.tokenizer.encode( + token, add_special_tokens=False) + if len(subtoken_ids) == 0: + subtoken_ids = [self.tokenizer.unk_token_id] + input_ids.extend(subtoken_ids) + label_mask.extend([1] + [0] * (len(subtoken_ids) - 1)) + offset_mapping.extend([(offset, offset + 1)]) + else: + if self.tokenizer.is_fast: + encodings = self.tokenizer( + text, + add_special_tokens=False, + return_offsets_mapping=True, + **self.tokenize_kwargs) + input_ids = encodings['input_ids'] + word_ids = encodings.word_ids() + for i in range(len(word_ids)): + if word_ids[i] is None: + label_mask.append(0) + elif word_ids[i] == word_ids[i - 1]: + label_mask.append(0) + offset_mapping[-1] = ( + offset_mapping[-1][0], + encodings['offset_mapping'][i][1]) + else: + label_mask.append(1) + offset_mapping.append(encodings['offset_mapping'][i]) + else: + encodings = self.tokenizer( + text, add_special_tokens=False, **self.tokenize_kwargs) + input_ids = encodings['input_ids'] + label_mask, offset_mapping = self.get_label_mask_and_offset_mapping( + text) - tokenized_inputs = self.tokenizer( - [t for t in text_a], - return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, - is_split_into_words=True, - **self.tokenize_kwargs) + if len(input_ids) >= self.sequence_length - 2: + input_ids = input_ids[:self.sequence_length - 2] + label_mask = label_mask[:self.sequence_length - 2] + input_ids = [self.tokenizer.cls_token_id + ] + input_ids + [self.tokenizer.sep_token_id] + label_mask = [0] + label_mask + [0] + attention_mask = [1] * len(input_ids) + offset_mapping = offset_mapping[:sum(label_mask)] + if not self.is_transformer_based_model: + input_ids = input_ids[1:-1] + attention_mask = attention_mask[1:-1] + label_mask = label_mask[1:-1] + + if self._mode == ModeKeys.INFERENCE: + input_ids = torch.tensor(input_ids).unsqueeze(0) + attention_mask = torch.tensor(attention_mask).unsqueeze(0) + label_mask = 
torch.tensor( + label_mask, dtype=torch.bool).unsqueeze(0) + + # the token classification + output = { + 'text': text, + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'label_mask': label_mask, + 'offset_mapping': offset_mapping + } + + # align the labels with tokenized text if labels_list is not None: assert self.label2id is not None # Map that sends B-Xxx label to its I-Xxx counterpart @@ -653,7 +672,6 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): b_to_i_label.append(idx) label_row = [self.label2id[lb] for lb in labels_list] - word_ids = tokenized_inputs.word_ids() previous_word_idx = None label_ids = [] for word_idx in word_ids: @@ -668,229 +686,66 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): label_ids.append(-100) previous_word_idx = word_idx labels = label_ids - tokenized_inputs['labels'] = labels - # new code end + output['labels'] = labels + return output - if self._mode == ModeKeys.INFERENCE: - tokenized_inputs[OutputKeys.TEXT] = text_a - return tokenized_inputs + def get_tokenizer_class(self): + tokenizer_class = self.tokenizer.__class__.__name__ + if tokenizer_class.endswith( + 'Fast') and tokenizer_class != 'PreTrainedTokenizerFast': + tokenizer_class = tokenizer_class[:-4] + return tokenizer_class - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.ner_tokenizer) -class NERPreprocessor(Preprocessor): - """The tokenizer preprocessor used in normal NER task. - - NOTE: This preprocessor may be merged with the TokenClassificationPreprocessor in the next edition. - """ - - def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data - - Args: - model_dir (str): model path - """ - - super().__init__(*args, **kwargs) - - self.model_dir: str = model_dir - self.sequence_length = kwargs.pop('sequence_length', 512) - self.is_transformer_based_model = 'lstm' not in model_dir - if self.is_transformer_based_model: - self.tokenizer = AutoTokenizer.from_pretrained( - model_dir, use_fast=True) - else: - self.tokenizer = BertTokenizerFast.from_pretrained( - model_dir, use_fast=True) - self.is_split_into_words = self.tokenizer.init_kwargs.get( - 'is_split_into_words', False) - - @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: - """process the raw input data - - Args: - data (str): a sentence - Example: - 'you are so handsome.' 
- - Returns: - Dict[str, Any]: the preprocessed data - """ - - # preprocess the data for the model input - text = data - if self.is_split_into_words: - input_ids = [] - label_mask = [] - offset_mapping = [] - for offset, token in enumerate(list(data)): - subtoken_ids = self.tokenizer.encode( - token, add_special_tokens=False) - if len(subtoken_ids) == 0: - subtoken_ids = [self.tokenizer.unk_token_id] - input_ids.extend(subtoken_ids) - label_mask.extend([1] + [0] * (len(subtoken_ids) - 1)) - offset_mapping.extend([(offset, offset + 1)] - + [(offset + 1, offset + 1)] - * (len(subtoken_ids) - 1)) - if len(input_ids) >= self.sequence_length - 2: - input_ids = input_ids[:self.sequence_length - 2] - label_mask = label_mask[:self.sequence_length - 2] - offset_mapping = offset_mapping[:self.sequence_length - 2] - input_ids = [self.tokenizer.cls_token_id - ] + input_ids + [self.tokenizer.sep_token_id] - label_mask = [0] + label_mask + [0] - attention_mask = [1] * len(input_ids) - else: - encodings = self.tokenizer( - text, - add_special_tokens=True, - padding=True, - truncation=True, - max_length=self.sequence_length, - return_offsets_mapping=True) - input_ids = encodings['input_ids'] - attention_mask = encodings['attention_mask'] - word_ids = encodings.word_ids() - label_mask = [] - offset_mapping = [] - for i in range(len(word_ids)): - if word_ids[i] is None: - label_mask.append(0) - elif word_ids[i] == word_ids[i - 1]: - label_mask.append(0) - offset_mapping[-1] = (offset_mapping[-1][0], - encodings['offset_mapping'][i][1]) + def get_label_mask_and_offset_mapping(self, text): + label_mask = [] + offset_mapping = [] + tokens = self.tokenizer.tokenize(text) + offset = 0 + if self.get_tokenizer_class() == 'BertTokenizer': + for token in tokens: + is_start = (token[:2] != '##') + if is_start: + label_mask.append(True) else: - label_mask.append(1) - offset_mapping.append(encodings['offset_mapping'][i]) - - if not self.is_transformer_based_model: - input_ids = input_ids[1:-1] - attention_mask = attention_mask[1:-1] - label_mask = label_mask[1:-1] - return { - 'text': text, - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'label_mask': label_mask, - 'offset_mapping': offset_mapping - } - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sequence_labeling_tokenizer) -class SequenceLabelingPreprocessor(Preprocessor): - """The tokenizer preprocessor used in normal NER task. - - NOTE: This preprocessor may be merged with the TokenClassificationPreprocessor in the next edition. - """ - - def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path - - Args: - model_dir (str): model path - """ - - super().__init__(*args, **kwargs) - - self.model_dir: str = model_dir - self.sequence_length = kwargs.pop('sequence_length', 512) - - if 'lstm' in model_dir or 'gcnn' in model_dir: - self.tokenizer = BertTokenizerFast.from_pretrained( - model_dir, use_fast=False) - elif 'structbert' in model_dir: - self.tokenizer = SbertTokenizerFast.from_pretrained( - model_dir, use_fast=False) - else: - self.tokenizer = AutoTokenizer.from_pretrained( - model_dir, use_fast=False) - self.is_split_into_words = self.tokenizer.init_kwargs.get( - 'is_split_into_words', False) - - @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: - """process the raw input data - - Args: - data (str): a sentence - Example: - 'you are so handsome.' 
- - Returns: - Dict[str, Any]: the preprocessed data - """ - - # preprocess the data for the model input - text = data - if self.is_split_into_words: - input_ids = [] - label_mask = [] - offset_mapping = [] - for offset, token in enumerate(list(data)): - subtoken_ids = self.tokenizer.encode( - token, add_special_tokens=False) - if len(subtoken_ids) == 0: - subtoken_ids = [self.tokenizer.unk_token_id] - input_ids.extend(subtoken_ids) - label_mask.extend([1] + [0] * (len(subtoken_ids) - 1)) - offset_mapping.extend([(offset, offset + 1)] - + [(offset + 1, offset + 1)] - * (len(subtoken_ids) - 1)) - if len(input_ids) >= self.sequence_length - 2: - input_ids = input_ids[:self.sequence_length - 2] - label_mask = label_mask[:self.sequence_length - 2] - offset_mapping = offset_mapping[:self.sequence_length - 2] - input_ids = [self.tokenizer.cls_token_id - ] + input_ids + [self.tokenizer.sep_token_id] - label_mask = [0] + label_mask + [0] - attention_mask = [1] * len(input_ids) - else: - encodings = self.tokenizer( - text, - add_special_tokens=True, - padding=True, - truncation=True, - max_length=self.sequence_length, - return_offsets_mapping=True) - input_ids = encodings['input_ids'] - attention_mask = encodings['attention_mask'] - word_ids = encodings.word_ids() - label_mask = [] - offset_mapping = [] - for i in range(len(word_ids)): - if word_ids[i] is None: - label_mask.append(0) - elif word_ids[i] == word_ids[i - 1]: - label_mask.append(0) - offset_mapping[-1] = (offset_mapping[-1][0], - encodings['offset_mapping'][i][1]) + token = token[2:] + label_mask.append(False) + start = offset + text[offset:].index(token) + end = start + len(token) + if is_start: + offset_mapping.append((start, end)) else: - label_mask.append(1) - offset_mapping.append(encodings['offset_mapping'][i]) + offset_mapping[-1] = (offset_mapping[-1][0], end) + offset = end + elif self.get_tokenizer_class() == 'XLMRobertaTokenizer': + last_is_blank = False + for token in tokens: + is_start = (token[0] == '▁') + if is_start: + token = token[1:] + label_mask.append(True) + if len(token) == 0: + last_is_blank = True + continue + else: + label_mask.append(False) + start = offset + text[offset:].index(token) + end = start + len(token) + if last_is_blank or is_start: + offset_mapping.append((start, end)) + else: + offset_mapping[-1] = (offset_mapping[-1][0], end) + offset = end + last_is_blank = False + else: + raise NotImplementedError - if not self.is_transformer_based_model: - input_ids = input_ids[1:-1] - attention_mask = attention_mask[1:-1] - label_mask = label_mask[1:-1] - return { - 'text': text, - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'label_mask': label_mask, - 'offset_mapping': offset_mapping - } + return label_mask, offset_mapping @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.re_tokenizer) class RelationExtractionPreprocessor(Preprocessor): - """The tokenizer preprocessor used in normal RE task. - - NOTE: This preprocessor may be merged with the TokenClassificationPreprocessor in the next edition. + """The relation extraction preprocessor used in normal RE task. 
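[Editor's sketch] A worked, dependency-free illustration of the BertTokenizer branch of get_label_mask_and_offset_mapping above; the WordPiece output is hand-written rather than produced by a real tokenizer, and subword pieces ('##') are folded back into the span of their leading token:

text = 'you are so handsome.'
tokens = ['you', 'are', 'so', 'hand', '##some', '.']  # assumed tokenizer output
label_mask, offset_mapping, offset = [], [], 0
for token in tokens:
    is_start = (token[:2] != '##')
    if is_start:
        label_mask.append(True)
    else:
        token = token[2:]
        label_mask.append(False)
    start = offset + text[offset:].index(token)
    end = start + len(token)
    if is_start:
        offset_mapping.append((start, end))
    else:
        # extend the previous span to cover this subword piece
        offset_mapping[-1] = (offset_mapping[-1][0], end)
    offset = end
print(label_mask)      # -> [True, True, True, True, False, True]
print(offset_mapping)  # -> [(0, 3), (4, 7), (8, 10), (11, 19), (19, 20)]
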
""" def __init__(self, model_dir: str, *args, **kwargs): @@ -937,7 +792,7 @@ class FaqQuestionAnsweringPreprocessor(Preprocessor): def __init__(self, model_dir: str, *args, **kwargs): super(FaqQuestionAnsweringPreprocessor, self).__init__( - model_dir, pair=False, mode=ModeKeys.INFERENCE, **kwargs) + model_dir, mode=ModeKeys.INFERENCE, **kwargs) import os from transformers import BertTokenizer @@ -1026,7 +881,7 @@ class DocumentSegmentationPreprocessor(Preprocessor): """ super().__init__(*args, **kwargs) - + from transformers import BertTokenizerFast self.tokenizer = BertTokenizerFast.from_pretrained( model_dir, use_fast=True, diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 75add1d9..b19c0fce 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -115,6 +115,7 @@ class NLPTasks(object): conversational_text_to_sql = 'conversational-text-to-sql' information_extraction = 'information-extraction' document_segmentation = 'document-segmentation' + feature_extraction = 'feature-extraction' class AudioTasks(object): diff --git a/modelscope/utils/registry.py b/modelscope/utils/registry.py index 3cf88114..7a9c79e2 100644 --- a/modelscope/utils/registry.py +++ b/modelscope/utils/registry.py @@ -74,7 +74,6 @@ class Registry(object): raise KeyError(f'{module_name} is already registered in ' f'{self._name}[{group_key}]') self._modules[group_key][module_name] = module_cls - module_cls.group_key = group_key def register_module(self, group_key: str = default_group, @@ -196,6 +195,7 @@ def build_from_cfg(cfg, if obj_cls is None: raise KeyError(f'{obj_type} is not in the {registry.name}' f' registry group {group_key}') + obj_cls.group_key = group_key elif inspect.isclass(obj_type) or inspect.isfunction(obj_type): obj_cls = obj_type else: diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py index 762530f4..91a3b5c5 100644 --- a/tests/msdatasets/test_ms_dataset.py +++ b/tests/msdatasets/test_ms_dataset.py @@ -75,7 +75,8 @@ class MsDatasetTest(unittest.TestCase): preprocessor = SequenceClassificationPreprocessor( nlp_model.model_dir, first_sequence='premise', - second_sequence=None) + second_sequence=None, + padding='max_length') ms_ds_train = MsDataset.load( 'xcopa', subset_name='translation-et', diff --git a/tests/pipelines/test_deberta_tasks.py b/tests/pipelines/test_deberta_tasks.py index 4f3206cd..549d2cb3 100644 --- a/tests/pipelines/test_deberta_tasks.py +++ b/tests/pipelines/test_deberta_tasks.py @@ -6,11 +6,9 @@ import torch from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.models.nlp import DebertaV2ForMaskedLM -from modelscope.models.nlp.deberta_v2 import (DebertaV2Tokenizer, - DebertaV2TokenizerFast) from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import FillMaskPipeline -from modelscope.preprocessors import FillMaskPreprocessor +from modelscope.preprocessors import NLPPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level @@ -24,7 +22,7 @@ class DeBERTaV2TaskTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): model_dir = snapshot_download(self.model_id_deberta) - preprocessor = FillMaskPreprocessor( + preprocessor = NLPPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) model = DebertaV2ForMaskedLM.from_pretrained(model_dir) pipeline1 = 
FillMaskPipeline(model, preprocessor) @@ -40,7 +38,7 @@ class DeBERTaV2TaskTest(unittest.TestCase): # sbert print(self.model_id_deberta) model = Model.from_pretrained(self.model_id_deberta) - preprocessor = FillMaskPreprocessor( + preprocessor = NLPPreprocessor( model.model_dir, first_sequence='sentence', second_sequence=None) pipeline_ins = pipeline( task=Tasks.fill_mask, model=model, preprocessor=preprocessor) diff --git a/tests/pipelines/test_feature_extraction.py b/tests/pipelines/test_feature_extraction.py new file mode 100644 index 00000000..39291e76 --- /dev/null +++ b/tests/pipelines/test_feature_extraction.py @@ -0,0 +1,67 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +import numpy as np + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.nlp import FeatureExtractionModel +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.nlp import FeatureExtractionPipeline +from modelscope.preprocessors import NLPPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class FeatureExtractionTaskModelTest(unittest.TestCase, + DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.feature_extraction + self.model_id = 'damo/pert_feature-extraction_base-test' + + sentence1 = '测试embedding' + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_direct_file_download(self): + cache_path = snapshot_download(self.model_id) + tokenizer = NLPPreprocessor(cache_path, padding=False) + model = FeatureExtractionModel.from_pretrained(self.model_id) + pipeline1 = FeatureExtractionPipeline(model, preprocessor=tokenizer) + pipeline2 = pipeline( + Tasks.feature_extraction, model=model, preprocessor=tokenizer) + result = pipeline1(input=self.sentence1) + + print(f'sentence1: {self.sentence1}\n' + f'pipeline1:{np.shape(result[OutputKeys.TEXT_EMBEDDING])}') + result = pipeline2(input=self.sentence1) + print(f'sentence1: {self.sentence1}\n' + f'pipeline1: {np.shape(result[OutputKeys.TEXT_EMBEDDING])}') + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + tokenizer = NLPPreprocessor(model.model_dir, padding=False) + pipeline_ins = pipeline( + task=Tasks.feature_extraction, model=model, preprocessor=tokenizer) + result = pipeline_ins(input=self.sentence1) + print(np.shape(result[OutputKeys.TEXT_EMBEDDING])) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.feature_extraction, model=self.model_id) + result = pipeline_ins(input=self.sentence1) + print(np.shape(result[OutputKeys.TEXT_EMBEDDING])) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_default_model(self): + pipeline_ins = pipeline(task=Tasks.feature_extraction) + result = pipeline_ins(input=self.sentence1) + print(np.shape(result[OutputKeys.TEXT_EMBEDDING])) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py index cec8966f..0e5e242b 100644 --- a/tests/pipelines/test_fill_mask.py +++ b/tests/pipelines/test_fill_mask.py @@ -1,13 +1,15 @@ # Copyright (c) Alibaba, Inc. 
and its affiliates. import unittest +from regex import R + from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.models.nlp import (BertForMaskedLM, StructBertForMaskedLM, VecoForMaskedLM) from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import FillMaskPipeline -from modelscope.preprocessors import FillMaskPreprocessor +from modelscope.preprocessors import NLPPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.regress_test_utils import MsRegressTool @@ -51,7 +53,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): # sbert for language in ['zh']: model_dir = snapshot_download(self.model_id_sbert[language]) - preprocessor = FillMaskPreprocessor( + preprocessor = NLPPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) model = StructBertForMaskedLM.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) @@ -66,7 +68,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): # veco model_dir = snapshot_download(self.model_id_veco) - preprocessor = FillMaskPreprocessor( + preprocessor = NLPPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) model = VecoForMaskedLM.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) @@ -80,13 +82,28 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n' ) + # bert + language = 'zh' + model_dir = snapshot_download(self.model_id_bert, revision='beta') + preprocessor = NLPPreprocessor( + model_dir, first_sequence='sentence', second_sequence=None) + model = Model.from_pretrained(model_dir) + pipeline1 = FillMaskPipeline(model, preprocessor) + pipeline2 = pipeline( + Tasks.fill_mask, model=model, preprocessor=preprocessor) + ori_text = self.ori_texts[language] + test_input = self.test_inputs[language] + print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline1: ' + f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): + # sbert for language in ['zh']: print(self.model_id_sbert[language]) model = Model.from_pretrained(self.model_id_sbert[language]) - preprocessor = FillMaskPreprocessor( + preprocessor = NLPPreprocessor( model.model_dir, first_sequence='sentence', second_sequence=None) @@ -100,7 +117,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): # veco model = Model.from_pretrained(self.model_id_veco) - preprocessor = FillMaskPreprocessor( + preprocessor = NLPPreprocessor( model.model_dir, first_sequence='sentence', second_sequence=None) pipeline_ins = pipeline( Tasks.fill_mask, model=model, preprocessor=preprocessor) @@ -113,6 +130,18 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: ' f'{pipeline_ins(test_input)}\n') + # bert + language = 'zh' + model = Model.from_pretrained(self.model_id_bert, revision='beta') + preprocessor = NLPPreprocessor( + model.model_dir, first_sequence='sentence', second_sequence=None) + pipeline_ins = pipeline( + Tasks.fill_mask, model=model, preprocessor=preprocessor) + pipeline_ins.model, f'fill_mask_bert_{language}' + print( + f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: ' + 
f'{pipeline_ins(self.test_inputs[language])}\n') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): # veco @@ -131,6 +160,16 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: ' f'{pipeline_ins(self.test_inputs[language])}\n') + # Bert + language = 'zh' + pipeline_ins = pipeline( + task=Tasks.fill_mask, + model=self.model_id_bert, + model_revision='beta') + print( + f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: ' + f'{pipeline_ins(self.test_inputs[language])}\n') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): pipeline_ins = pipeline(task=Tasks.fill_mask) diff --git a/tests/pipelines/test_named_entity_recognition.py b/tests/pipelines/test_named_entity_recognition.py index 9fae2d09..3658cf3f 100644 --- a/tests/pipelines/test_named_entity_recognition.py +++ b/tests/pipelines/test_named_entity_recognition.py @@ -7,7 +7,7 @@ from modelscope.models.nlp import (LSTMCRFForNamedEntityRecognition, TransformerCRFForNamedEntityRecognition) from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import NamedEntityRecognitionPipeline -from modelscope.preprocessors import NERPreprocessor +from modelscope.preprocessors import TokenClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level @@ -26,7 +26,7 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_tcrf_by_direct_model_download(self): cache_path = snapshot_download(self.tcrf_model_id) - tokenizer = NERPreprocessor(cache_path) + tokenizer = TokenClassificationPreprocessor(cache_path) model = TransformerCRFForNamedEntityRecognition( cache_path, tokenizer=tokenizer) pipeline1 = NamedEntityRecognitionPipeline( @@ -43,7 +43,7 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_lcrf_by_direct_model_download(self): cache_path = snapshot_download(self.lcrf_model_id) - tokenizer = NERPreprocessor(cache_path) + tokenizer = TokenClassificationPreprocessor(cache_path) model = LSTMCRFForNamedEntityRecognition( cache_path, tokenizer=tokenizer) pipeline1 = NamedEntityRecognitionPipeline( @@ -60,7 +60,7 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_tcrf_with_model_from_modelhub(self): model = Model.from_pretrained(self.tcrf_model_id) - tokenizer = NERPreprocessor(model.model_dir) + tokenizer = TokenClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.named_entity_recognition, model=model, @@ -70,7 +70,7 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_lcrf_with_model_from_modelhub(self): model = Model.from_pretrained(self.lcrf_model_id) - tokenizer = NERPreprocessor(model.model_dir) + tokenizer = TokenClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.named_entity_recognition, model=model, diff --git a/tests/pipelines/test_nli.py 
b/tests/pipelines/test_nli.py index a53ac3b3..db4b9912 100644 --- a/tests/pipelines/test_nli.py +++ b/tests/pipelines/test_nli.py @@ -5,8 +5,8 @@ from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import PairSentenceClassificationPipeline -from modelscope.preprocessors import PairSentenceClassificationPreprocessor +from modelscope.pipelines.nlp import SequenceClassificationPipeline +from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.regress_test_utils import MsRegressTool @@ -26,9 +26,9 @@ class NLITest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_file_download(self): cache_path = snapshot_download(self.model_id) - tokenizer = PairSentenceClassificationPreprocessor(cache_path) + tokenizer = SequenceClassificationPreprocessor(cache_path) model = SbertForSequenceClassification.from_pretrained(cache_path) - pipeline1 = PairSentenceClassificationPipeline( + pipeline1 = SequenceClassificationPipeline( model, preprocessor=tokenizer) pipeline2 = pipeline(Tasks.nli, model=model, preprocessor=tokenizer) print(f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n' @@ -40,7 +40,7 @@ class NLITest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = PairSentenceClassificationPreprocessor(model.model_dir) + tokenizer = SequenceClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.nli, model=model, preprocessor=tokenizer) print(pipeline_ins(input=(self.sentence1, self.sentence2))) diff --git a/tests/pipelines/test_sentence_similarity.py b/tests/pipelines/test_sentence_similarity.py index 4079455d..288d38c7 100644 --- a/tests/pipelines/test_sentence_similarity.py +++ b/tests/pipelines/test_sentence_similarity.py @@ -5,8 +5,8 @@ from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import PairSentenceClassificationPipeline -from modelscope.preprocessors import PairSentenceClassificationPreprocessor +from modelscope.pipelines.nlp import SequenceClassificationPipeline +from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.regress_test_utils import MsRegressTool @@ -26,9 +26,9 @@ class SentenceSimilarityTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run(self): cache_path = snapshot_download(self.model_id) - tokenizer = PairSentenceClassificationPreprocessor(cache_path) + tokenizer = SequenceClassificationPreprocessor(cache_path) model = SbertForSequenceClassification.from_pretrained(cache_path) - pipeline1 = PairSentenceClassificationPipeline( + pipeline1 = SequenceClassificationPipeline( model, preprocessor=tokenizer) pipeline2 = pipeline( Tasks.sentence_similarity, 
model=model, preprocessor=tokenizer) @@ -43,7 +43,7 @@ class SentenceSimilarityTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = PairSentenceClassificationPreprocessor(model.model_dir) + tokenizer = SequenceClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.sentence_similarity, model=model, diff --git a/tests/pipelines/test_sentiment_classification.py b/tests/pipelines/test_sentiment_classification.py index 3db9971a..d0b1b40f 100644 --- a/tests/pipelines/test_sentiment_classification.py +++ b/tests/pipelines/test_sentiment_classification.py @@ -6,8 +6,8 @@ from modelscope.models import Model from modelscope.models.nlp.task_models.sequence_classification import \ SequenceClassificationModel from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import SingleSentenceClassificationPipeline -from modelscope.preprocessors import SingleSentenceClassificationPreprocessor +from modelscope.pipelines.nlp import SequenceClassificationPipeline +from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level @@ -17,23 +17,21 @@ class SentimentClassificationTaskModelTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: - self.task = Tasks.sentiment_classification + self.task = Tasks.text_classification self.model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base' sentence1 = '启动的时候很大声音,然后就会听到1.2秒的卡察的声音,类似齿轮摩擦的声音' @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_file_download(self): - cache_path = snapshot_download(self.model_id) - tokenizer = SingleSentenceClassificationPreprocessor(cache_path) + cache_path = snapshot_download(self.model_id, revision='beta') + tokenizer = SequenceClassificationPreprocessor(cache_path) model = SequenceClassificationModel.from_pretrained( - self.model_id, num_labels=2) - pipeline1 = SingleSentenceClassificationPipeline( + self.model_id, num_labels=2, revision='beta') + pipeline1 = SequenceClassificationPipeline( model, preprocessor=tokenizer) pipeline2 = pipeline( - Tasks.sentiment_classification, - model=model, - preprocessor=tokenizer) + Tasks.text_classification, model=model, preprocessor=tokenizer) print(f'sentence1: {self.sentence1}\n' f'pipeline1:{pipeline1(input=self.sentence1)}') print(f'sentence1: {self.sentence1}\n' @@ -41,10 +39,10 @@ class SentimentClassificationTaskModelTest(unittest.TestCase, @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): - model = Model.from_pretrained(self.model_id) - tokenizer = SingleSentenceClassificationPreprocessor(model.model_dir) + model = Model.from_pretrained(self.model_id, revision='beta') + tokenizer = SequenceClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( - task=Tasks.sentiment_classification, + task=Tasks.text_classification, model=model, preprocessor=tokenizer) print(pipeline_ins(input=self.sentence1)) @@ -54,14 +52,17 @@ class SentimentClassificationTaskModelTest(unittest.TestCase, @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): pipeline_ins = pipeline( - task=Tasks.sentiment_classification, 
model=self.model_id) + task=Tasks.text_classification, + model=self.model_id, + model_revision='beta') print(pipeline_ins(input=self.sentence1)) self.assertTrue( isinstance(pipeline_ins.model, SequenceClassificationModel)) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_default_model(self): - pipeline_ins = pipeline(task=Tasks.sentiment_classification) + pipeline_ins = pipeline( + task=Tasks.text_classification, model_revision='beta') print(pipeline_ins(input=self.sentence1)) self.assertTrue( isinstance(pipeline_ins.model, SequenceClassificationModel)) diff --git a/tests/pipelines/test_text_classification.py b/tests/pipelines/test_text_classification.py index 71b9f3e2..39dbac99 100644 --- a/tests/pipelines/test_text_classification.py +++ b/tests/pipelines/test_text_classification.py @@ -12,6 +12,7 @@ from modelscope.utils.test_utils import test_level class SequenceClassificationTest(unittest.TestCase, DemoCompatibilityCheck): + sentence1 = 'i like this wonderful place' def setUp(self) -> None: self.model_id = 'damo/bert-base-sst2' @@ -46,7 +47,8 @@ class SequenceClassificationTest(unittest.TestCase, DemoCompatibilityCheck): task=Tasks.text_classification, model=model, preprocessor=preprocessor) - self.predict(pipeline_ins) + print(f'sentence1: {self.sentence1}\n' + f'pipeline1:{pipeline_ins(input=self.sentence1)}') # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') @unittest.skip('nlp model does not support tensor input, skipped') diff --git a/tests/preprocessors/test_nlp.py b/tests/preprocessors/test_nlp.py index 4271e201..f9f4d93f 100644 --- a/tests/preprocessors/test_nlp.py +++ b/tests/preprocessors/test_nlp.py @@ -32,6 +32,82 @@ class NLPPreprocessorTest(unittest.TestCase): output['attention_mask'], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) + def test_token_classification_tokenize(self): + with self.subTest(tokenizer_type='bert'): + cfg = dict( + type='token-cls-tokenizer', + model_dir='bert-base-cased', + label2id={ + 'O': 0, + 'B': 1, + 'I': 2 + }) + preprocessor = build_preprocessor(cfg, Fields.nlp) + input = 'Do not meddle in the affairs of wizards, ' \ + 'for they are subtle and quick to anger.' + output = preprocessor(input) + self.assertTrue(InputFields.text in output) + self.assertEqual(output['input_ids'].tolist()[0], [ + 101, 2091, 1136, 1143, 13002, 1107, 1103, 5707, 1104, 16678, + 1116, 117, 1111, 1152, 1132, 11515, 1105, 3613, 1106, 4470, + 119, 102 + ]) + self.assertEqual(output['attention_mask'].tolist()[0], [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1 + ]) + self.assertEqual(output['label_mask'].tolist()[0], [ + False, True, True, True, False, True, True, True, True, True, + False, True, True, True, True, True, True, True, True, True, + True, False + ]) + self.assertEqual(output['offset_mapping'], [(0, 2), (3, 6), + (7, 13), (14, 16), + (17, 20), (21, 28), + (29, 31), (32, 39), + (39, 40), (41, 44), + (45, 49), (50, 53), + (54, 60), (61, 64), + (65, 70), (71, 73), + (74, 79), (79, 80)]) + + with self.subTest(tokenizer_type='roberta'): + cfg = dict( + type='token-cls-tokenizer', + model_dir='xlm-roberta-base', + label2id={ + 'O': 0, + 'B': 1, + 'I': 2 + }) + preprocessor = build_preprocessor(cfg, Fields.nlp) + input = 'Do not meddle in the affairs of wizards, ' \ + 'for they are subtle and quick to anger.' 
+ output = preprocessor(input) + self.assertTrue(InputFields.text in output) + self.assertEqual(output['input_ids'].tolist()[0], [ + 0, 984, 959, 128, 19298, 23, 70, 103086, 7, 111, 6, 44239, + 99397, 4, 100, 1836, 621, 1614, 17991, 136, 63773, 47, 348, 56, + 5, 2 + ]) + self.assertEqual(output['attention_mask'].tolist()[0], [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1 + ]) + self.assertEqual(output['label_mask'].tolist()[0], [ + False, True, True, True, False, True, True, True, False, True, + True, False, False, False, True, True, True, True, False, True, + True, True, True, False, False, False + ]) + self.assertEqual(output['offset_mapping'], [(0, 2), (3, 6), + (7, 13), (14, 16), + (17, 20), (21, 28), + (29, 31), (32, 40), + (41, 44), (45, 49), + (50, 53), (54, 60), + (61, 64), (65, 70), + (71, 73), (74, 80)]) + if __name__ == '__main__': unittest.main() diff --git a/tests/utils/test_ast.py b/tests/utils/test_ast.py index de99a7b8..9a8ab828 100644 --- a/tests/utils/test_ast.py +++ b/tests/utils/test_ast.py @@ -30,7 +30,7 @@ class AstScaningTest(unittest.TestCase): def test_ast_scaning_class(self): astScaner = AstScaning() pipeline_file = os.path.join(MODELSCOPE_PATH, 'pipelines', 'nlp', - 'sequence_classification_pipeline.py') + 'text_generation_pipeline.py') output = astScaner.generate_ast(pipeline_file) self.assertTrue(output['imports'] is not None) self.assertTrue(output['from_imports'] is not None) @@ -40,14 +40,12 @@ class AstScaningTest(unittest.TestCase): self.assertIsInstance(imports, dict) self.assertIsInstance(from_imports, dict) self.assertIsInstance(decorators, list) - self.assertListEqual( - list(set(imports.keys()) - set(['typing', 'numpy'])), []) - self.assertEqual(len(from_imports.keys()), 9) + self.assertListEqual(list(set(imports.keys()) - set(['torch'])), []) + self.assertEqual(len(from_imports.keys()), 7) self.assertTrue(from_imports['modelscope.metainfo'] is not None) self.assertEqual(from_imports['modelscope.metainfo'], ['Pipelines']) - self.assertEqual( - decorators, - [('PIPELINES', 'text-classification', 'sentiment-analysis')]) + self.assertEqual(decorators, + [('PIPELINES', 'text-generation', 'text-generation')]) def test_files_scaning_method(self): fileScaner = FilesAstScaning() From 91231b3c157ac875f67e2bbd420a8810da0c0e36 Mon Sep 17 00:00:00 2001 From: ly261666 Date: Tue, 27 Sep 2022 23:09:13 +0800 Subject: [PATCH 18/23] [to #42322933]add copyright on mogface,retinaface,mtcnn,ulfd pipeline Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10266086 --- modelscope/pipelines/cv/mog_face_detection_pipeline.py | 1 + modelscope/pipelines/cv/mtcnn_face_detection_pipeline.py | 1 + modelscope/pipelines/cv/retina_face_detection_pipeline.py | 1 + modelscope/pipelines/cv/ulfd_face_detection_pipeline.py | 1 + 4 files changed, 4 insertions(+) diff --git a/modelscope/pipelines/cv/mog_face_detection_pipeline.py b/modelscope/pipelines/cv/mog_face_detection_pipeline.py index 8797ad12..124b605b 100644 --- a/modelscope/pipelines/cv/mog_face_detection_pipeline.py +++ b/modelscope/pipelines/cv/mog_face_detection_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/cv/mtcnn_face_detection_pipeline.py b/modelscope/pipelines/cv/mtcnn_face_detection_pipeline.py index 57bf9920..bda46a70 100644 --- a/modelscope/pipelines/cv/mtcnn_face_detection_pipeline.py +++ b/modelscope/pipelines/cv/mtcnn_face_detection_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/cv/retina_face_detection_pipeline.py b/modelscope/pipelines/cv/retina_face_detection_pipeline.py index b8c64405..40f2336a 100644 --- a/modelscope/pipelines/cv/retina_face_detection_pipeline.py +++ b/modelscope/pipelines/cv/retina_face_detection_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/cv/ulfd_face_detection_pipeline.py b/modelscope/pipelines/cv/ulfd_face_detection_pipeline.py index 1263082b..e9901d64 100644 --- a/modelscope/pipelines/cv/ulfd_face_detection_pipeline.py +++ b/modelscope/pipelines/cv/ulfd_face_detection_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os.path as osp from typing import Any, Dict From 3d41d6d6208edfcdb7cf7c00c571e0579405cde7 Mon Sep 17 00:00:00 2001 From: "tianchu.gtc" Date: Tue, 27 Sep 2022 23:22:46 +0800 Subject: [PATCH 19/23] [to #42322933] fix seg4demo Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10189886 --- .../image_panoptic_segmentation/panseg_model.py | 3 +-- .../pan_merge/__init__.py | 1 + .../pan_merge/maskformer_semantic_head.py | 1 + .../semantic_seg_model.py | 1 + .../vit_adapter/__init__.py | 2 ++ .../vit_adapter/models/__init__.py | 2 ++ .../vit_adapter/models/backbone/__init__.py | 2 ++ .../models/backbone/adapter_modules.py | 17 ++++++++--------- .../models/backbone/base/__init__.py | 2 ++ .../vit_adapter/models/backbone/base/beit.py | 6 ++---- .../vit_adapter/models/backbone/beit_adapter.py | 13 ++++++------- .../vit_adapter/models/decode_heads/__init__.py | 2 ++ .../models/decode_heads/base_decode_head.py | 5 ++--- .../decode_heads/mask2former_head_from_mmseg.py | 5 ++--- .../vit_adapter/models/segmentors/__init__.py | 2 ++ .../models/segmentors/base_segmentor.py | 5 ++--- .../segmentors/encoder_decoder_mask2former.py | 5 ++--- .../vit_adapter/utils/__init__.py | 2 ++ .../vit_adapter/utils/builder.py | 5 ++--- .../vit_adapter/utils/seg_func.py | 5 ++--- .../cv/image_panoptic_segmentation_pipeline.py | 16 +++++++--------- .../cv/image_semantic_segmentation_pipeline.py | 17 ++++++----------- 22 files changed, 59 insertions(+), 60 deletions(-) diff --git a/modelscope/models/cv/image_panoptic_segmentation/panseg_model.py b/modelscope/models/cv/image_panoptic_segmentation/panseg_model.py index f9022f90..f44c01e8 100644 --- a/modelscope/models/cv/image_panoptic_segmentation/panseg_model.py +++ b/modelscope/models/cv/image_panoptic_segmentation/panseg_model.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
import os.path as osp import torch @@ -49,6 +50,4 @@ class SwinLPanopticSegmentation(TorchModel): return results def forward(self, Inputs): - import pdb - pdb.set_trace() return self.model(**Inputs) diff --git a/modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py b/modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py index 2a75f318..6a31a308 100644 --- a/modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py +++ b/modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py @@ -1 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from .maskformer_semantic_head import MaskFormerSemanticHead diff --git a/modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py b/modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py index 6769ebaf..2f3364d0 100644 --- a/modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py +++ b/modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import torch import torch.nn.functional as F from mmdet.models.builder import HEADS diff --git a/modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py b/modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py index 60acf28f..2b38ebad 100644 --- a/modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py +++ b/modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os.path as osp import numpy as np diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py index 82eec1c6..3b9a301c 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py @@ -1,3 +1,5 @@ +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git from .models import backbone, decode_heads, segmentors from .utils import (ResizeToMultiple, add_prefix, build_pixel_sampler, seg_resize) diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py index ae5c5acf..791dd26f 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py @@ -1,3 +1,5 @@ +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git from .backbone import BASEBEiT, BEiTAdapter from .decode_heads import Mask2FormerHeadFromMMSeg from .segmentors import EncoderDecoderMask2Former diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py index ab4258c1..7abd0ef1 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py @@ -1,3 +1,5 @@ +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at 
https://github.com/czczup/ViT-Adapter.git from .base import BASEBEiT from .beit_adapter import BEiTAdapter diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py index 03080342..cf30cca0 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py @@ -1,6 +1,5 @@ -# The implementation refers to the VitAdapter -# available at -# https://github.com/czczup/ViT-Adapter.git +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git import logging from functools import partial @@ -417,7 +416,7 @@ class SpatialPriorModule(nn.Module): self.stem = nn.Sequential(*[ nn.Conv2d( 3, inplanes, kernel_size=3, stride=2, padding=1, bias=False), - nn.SyncBatchNorm(inplanes), + nn.BatchNorm2d(inplanes), nn.ReLU(inplace=True), nn.Conv2d( inplanes, @@ -426,7 +425,7 @@ class SpatialPriorModule(nn.Module): stride=1, padding=1, bias=False), - nn.SyncBatchNorm(inplanes), + nn.BatchNorm2d(inplanes), nn.ReLU(inplace=True), nn.Conv2d( inplanes, @@ -435,7 +434,7 @@ class SpatialPriorModule(nn.Module): stride=1, padding=1, bias=False), - nn.SyncBatchNorm(inplanes), + nn.BatchNorm2d(inplanes), nn.ReLU(inplace=True), nn.MaxPool2d(kernel_size=3, stride=2, padding=1) ]) @@ -447,7 +446,7 @@ class SpatialPriorModule(nn.Module): stride=2, padding=1, bias=False), - nn.SyncBatchNorm(2 * inplanes), + nn.BatchNorm2d(2 * inplanes), nn.ReLU(inplace=True) ]) self.conv3 = nn.Sequential(*[ @@ -458,7 +457,7 @@ class SpatialPriorModule(nn.Module): stride=2, padding=1, bias=False), - nn.SyncBatchNorm(4 * inplanes), + nn.BatchNorm2d(4 * inplanes), nn.ReLU(inplace=True) ]) self.conv4 = nn.Sequential(*[ @@ -469,7 +468,7 @@ class SpatialPriorModule(nn.Module): stride=2, padding=1, bias=False), - nn.SyncBatchNorm(4 * inplanes), + nn.BatchNorm2d(4 * inplanes), nn.ReLU(inplace=True) ]) self.fc1 = nn.Conv2d( diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py index 40b0fa89..5b33031f 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py @@ -1,3 +1,5 @@ +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git from .beit import BASEBEiT __all__ = ['BASEBEiT'] diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py index a5811fb9..62f873ec 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py @@ -1,7 +1,5 @@ -# BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254) -# Github source: https://github.com/microsoft/unilm/tree/master/beit -# This implementation refers to -# https://github.com/czczup/ViT-Adapter.git +# The implementation is adopted from VitAdapter, +# made publicly 
available under the Apache License at https://github.com/czczup/ViT-Adapter.git import math from functools import partial diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py index 02a4968e..182fc0c1 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py @@ -1,6 +1,5 @@ -# The implementation refers to the VitAdapter -# available at -# https://github.com/czczup/ViT-Adapter.git +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git import logging import math @@ -69,10 +68,10 @@ class BEiTAdapter(BASEBEiT): ]) self.up = nn.ConvTranspose2d(embed_dim, embed_dim, 2, 2) - self.norm1 = nn.SyncBatchNorm(embed_dim) - self.norm2 = nn.SyncBatchNorm(embed_dim) - self.norm3 = nn.SyncBatchNorm(embed_dim) - self.norm4 = nn.SyncBatchNorm(embed_dim) + self.norm1 = nn.BatchNorm2d(embed_dim) + self.norm2 = nn.BatchNorm2d(embed_dim) + self.norm3 = nn.BatchNorm2d(embed_dim) + self.norm4 = nn.BatchNorm2d(embed_dim) self.up.apply(self._init_weights) self.spm.apply(self._init_weights) diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py index 9367806f..12bf2a21 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py @@ -1,3 +1,5 @@ +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git from .mask2former_head_from_mmseg import Mask2FormerHeadFromMMSeg __all__ = ['Mask2FormerHeadFromMMSeg'] diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py index 36660520..ae7a0416 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py @@ -1,6 +1,5 @@ -# The implementation refers to the VitAdapter -# available at -# https://github.com/czczup/ViT-Adapter.git +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git from abc import ABCMeta, abstractmethod import torch diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py index ad8b1586..c0681d2b 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py @@ -1,6 +1,5 @@ -# The implementation refers to the VitAdapter -# available at -# https://github.com/czczup/ViT-Adapter.git +# The implementation is adopted from VitAdapter, +# made publicly 
available under the Apache License at https://github.com/czczup/ViT-Adapter.git import copy diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py index 1f2c8b04..18bbce0d 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py @@ -1,3 +1,5 @@ +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git from .encoder_decoder_mask2former import EncoderDecoderMask2Former __all__ = ['EncoderDecoderMask2Former'] diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py index 8bd8fa3f..311352c2 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py @@ -1,6 +1,5 @@ -# The implementation refers to the VitAdapter -# available at -# https://github.com/czczup/ViT-Adapter.git +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git import warnings from abc import ABCMeta, abstractmethod from collections import OrderedDict diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py index 9287e8aa..50492374 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py @@ -1,6 +1,5 @@ -# The implementation refers to the VitAdapter -# available at -# https://github.com/czczup/ViT-Adapter.git +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git import torch import torch.nn as nn import torch.nn.functional as F diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py index dec8a5f2..9c4d5c4c 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py @@ -1,3 +1,5 @@ +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git from .builder import build_pixel_sampler from .data_process_func import ResizeToMultiple from .seg_func import add_prefix, seg_resize diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py index 63d77fea..0603ef94 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py @@ -1,6 +1,5 @@ -# The implementation refers to the VitAdapter -# available at 
-# https://github.com/czczup/ViT-Adapter.git +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git from mmcv.utils import Registry, build_from_cfg PIXEL_SAMPLERS = Registry('pixel sampler') diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py index fba46b81..db564cca 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py @@ -1,6 +1,5 @@ -# The implementation refers to the VitAdapter -# available at -# https://github.com/czczup/ViT-Adapter.git +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git import warnings diff --git a/modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py b/modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py index 9ffc2b03..b96e709c 100644 --- a/modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py +++ b/modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py @@ -4,11 +4,13 @@ from typing import Any, Dict, Union import cv2 import numpy as np import PIL +import torch from modelscope.metainfo import Pipelines from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Pipeline from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import load_image from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger @@ -39,28 +41,24 @@ class ImagePanopticSegmentationPipeline(Pipeline): # build the data pipeline if isinstance(input, str): - # input is str, file names, pipeline loadimagefromfile - # collect data - data = dict(img_info=dict(filename=input), img_prefix=None) + cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam' + img = np.array(load_image(input)) + img = img[:, :, ::-1] # convert to bgr elif isinstance(input, PIL.Image.Image): cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam' img = np.array(input.convert('RGB')) - # collect data - data = dict(img=img) elif isinstance(input, np.ndarray): cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam' if len(input.shape) == 2: img = cv2.cvtColor(input, cv2.COLOR_GRAY2BGR) else: img = input - img = img[:, :, ::-1] # in rgb order - # collect data - data = dict(img=img) - else: raise TypeError(f'input should be either str, PIL.Image,' f' np.array, but got {type(input)}') + # collect data + data = dict(img=img) cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline) test_pipeline = Compose(cfg.data.test.pipeline) diff --git a/modelscope/pipelines/cv/image_semantic_segmentation_pipeline.py b/modelscope/pipelines/cv/image_semantic_segmentation_pipeline.py index e3e1fd6b..023d9712 100644 --- a/modelscope/pipelines/cv/image_semantic_segmentation_pipeline.py +++ b/modelscope/pipelines/cv/image_semantic_segmentation_pipeline.py @@ -10,6 +10,7 @@ from modelscope.metainfo import Pipelines from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Model, Pipeline from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import load_image from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger @@ -40,28 +41,24 @@ class ImageSemanticSegmentationPipeline(Pipeline): # build the data pipeline 
if isinstance(input, str): - # input is str, file names, pipeline loadimagefromfile - # collect data - data = dict(img_info=dict(filename=input), img_prefix=None) + cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam' + img = np.array(load_image(input)) + img = img[:, :, ::-1] # convert to bgr elif isinstance(input, PIL.Image.Image): # BGR cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam' img = np.array(input)[:, :, ::-1] - # collect data - data = dict(img=img) elif isinstance(input, np.ndarray): cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam' if len(input.shape) == 2: img = cv2.cvtColor(input, cv2.COLOR_GRAY2BGR) else: img = input - # collect data - data = dict(img=img) - else: raise TypeError(f'input should be either str, PIL.Image,' f' np.array, but got {type(input)}') - # data = dict(img=input) + # collect data + data = dict(img=img) cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline) test_pipeline = Compose(cfg.data.test.pipeline) @@ -80,11 +77,9 @@ class ImageSemanticSegmentationPipeline(Pipeline): def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: results = self.model.inference(input) - return results def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - results = self.model.postprocess(inputs) outputs = { OutputKeys.MASKS: results[OutputKeys.MASKS], From a3598f8d8c09ced380c9393d5c5208ef65aa13dd Mon Sep 17 00:00:00 2001 From: "hemu.zp" Date: Tue, 27 Sep 2022 23:24:58 +0800 Subject: [PATCH 20/23] [to #42322933] Fix rouge metrics for chinese text MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修复 TextGenerationMetric 中 Rouge 指标计算中文时结果不正确的问题 为文本生成添加 BLEU 指标 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10254323 --- modelscope/metrics/builder.py | 4 ++ modelscope/metrics/text_generation_metric.py | 62 +++++++++++++++----- 2 files changed, 51 insertions(+), 15 deletions(-) diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py index 800e3508..9e875cc4 100644 --- a/modelscope/metrics/builder.py +++ b/modelscope/metrics/builder.py @@ -18,6 +18,10 @@ class MetricKeys(object): SSIM = 'ssim' AVERAGE_LOSS = 'avg_loss' FScore = 'fscore' + BLEU_1 = 'bleu-1' + BLEU_4 = 'bleu-4' + ROUGE_1 = 'rouge-1' + ROUGE_L = 'rouge-l' task_default_metrics = { diff --git a/modelscope/metrics/text_generation_metric.py b/modelscope/metrics/text_generation_metric.py index f154281d..90b80425 100644 --- a/modelscope/metrics/text_generation_metric.py +++ b/modelscope/metrics/text_generation_metric.py @@ -1,11 +1,14 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-from typing import Dict +from typing import Dict, Iterable, List + +from nltk.translate.bleu_score import sentence_bleu +from rouge import Rouge from modelscope.metainfo import Metrics +from modelscope.metrics.base import Metric +from modelscope.metrics.builder import METRICS, MetricKeys from modelscope.utils.registry import default_group -from .base import Metric -from .builder import METRICS, MetricKeys @METRICS.register_module( @@ -17,20 +20,49 @@ class TextGenerationMetric(Metric): """ def __init__(self): - self.preds = [] - self.tgts = [] - from rouge_score import rouge_scorer - self.scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True) + self.preds: List[str] = [] + self.tgts: List[str] = [] + self.rouge = Rouge() - def add(self, outputs: Dict, inputs: Dict): + @staticmethod + def is_chinese_char(char: str): + # the length of char must be 1 + return '\u4e00' <= char <= '\u9fa5' + + # add space for each chinese char + def rebuild_str(self, string: str): + return ' '.join(''.join([ + f' {char} ' if self.is_chinese_char(char) else char + for char in string + ]).split()) + + def add(self, outputs: Dict[str, List[str]], inputs: Dict = None): ground_truths = outputs['tgts'] eval_results = outputs['preds'] - self.preds.extend(eval_results) - self.tgts.extend(ground_truths) + for truth in ground_truths: + self.tgts.append(self.rebuild_str(truth)) + for result in eval_results: + self.preds.append(self.rebuild_str(result)) def evaluate(self): - scores = [ - self.scorer.score(pred, tgt)['rougeL'].fmeasure - for pred, tgt in zip(self.preds, self.tgts) - ] - return {MetricKeys.F1: sum(scores) / len(scores)} + + def mean(iter: Iterable) -> float: + return sum(iter) / len(self.preds) + + rouge_scores = self.rouge.get_scores(hyps=self.preds, refs=self.tgts) + rouge_1 = mean(map(lambda score: score['rouge-1']['f'], rouge_scores)) + rouge_l = mean(map(lambda score: score['rouge-l']['f'], rouge_scores)) + pred_split = tuple(pred.split(' ') for pred in self.preds) + tgt_split = tuple(tgt.split(' ') for tgt in self.tgts) + bleu_1 = mean( + sentence_bleu([tgt], pred, weights=(1, 0, 0, 0)) + for pred, tgt in zip(pred_split, tgt_split)) + bleu_4 = mean( + sentence_bleu([tgt], pred) + for pred, tgt in zip(pred_split, tgt_split)) + return { + MetricKeys.ROUGE_1: rouge_1, + MetricKeys.ROUGE_L: rouge_l, + MetricKeys.BLEU_1: bleu_1, + MetricKeys.BLEU_4: bleu_4 + } From 11b33164c33cc3fae3a195037a278c3cb87484a6 Mon Sep 17 00:00:00 2001 From: Yingda Chen Date: Wed, 28 Sep 2022 09:26:44 +0800 Subject: [PATCH 21/23] [to #42322933] disable t5 test temporarily --- tests/pipelines/test_text2text_generation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/pipelines/test_text2text_generation.py b/tests/pipelines/test_text2text_generation.py index 04cecf93..a39562f5 100644 --- a/tests/pipelines/test_text2text_generation.py +++ b/tests/pipelines/test_text2text_generation.py @@ -30,7 +30,7 @@ class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): f'pipeline1: {pipeline1(self.input)}\npipeline2: {pipeline2(self.input)}' ) - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_pipeline_with_model_instance(self): model = Model.from_pretrained(self.model_id) preprocessor = Text2TextGenerationPreprocessor(model.model_dir) @@ -40,7 +40,7 @@ class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): preprocessor=preprocessor) 
print(pipeline_ins(self.input)) - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_pipeline_with_model_id(self): pipeline_ins = pipeline( task=Tasks.text2text_generation, model=self.model_id) From c51b74c2ea6f2c736955a34599a745b2cd0d02a3 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Wed, 28 Sep 2022 13:36:09 +0800 Subject: [PATCH 22/23] [to #45220645]fix: fix ffmpeg mp4 encoder bug Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10284398 * [to #45220645]fix: fix ffmpeg mp4 encoder bug --- docker/Dockerfile.ubuntu | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu index e0bfa908..a9a409b5 100644 --- a/docker/Dockerfile.ubuntu +++ b/docker/Dockerfile.ubuntu @@ -34,7 +34,8 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${a cp /tmp/resources/conda.tuna ~/.condarc && \ source /root/.bashrc && \ conda install --yes python==${PYTHON_VERSION} && \ - pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple + pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \ + pip config set install.trusted-host pypi.tuna.tsinghua.edu.cn ARG USE_GPU=True @@ -42,15 +43,15 @@ ARG USE_GPU=True ARG TORCH_VERSION=1.12.0 ARG CUDATOOLKIT_VERSION=11.3 RUN if [ "$USE_GPU" = "True" ] ; then \ - conda install --yes pytorch==$TORCH_VERSION torchvision torchaudio cudatoolkit=$CUDATOOLKIT_VERSION -c pytorch && conda clean --yes --all; \ + pip install --no-cache-dir torch==$TORCH_VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113; \ else \ - conda install pytorch==$TORCH_VERSION torchvision torchaudio cpuonly -c pytorch; \ + pip install --no-cache-dir torch==$TORCH_VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu; \ fi # install tensorflow ARG TENSORFLOW_VERSION=1.15.5 RUN if [ "$USE_GPU" = "True" ] ; then \ - pip install --no-cache-dir --use-deprecated=legacy-resolver tensorflow==$TENSORFLOW_VERSION -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html; \ + pip install --no-cache-dir tensorflow==$TENSORFLOW_VERSION -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html; \ else \ pip install --no-cache-dir tensorflow==$TENSORFLOW_VERSION; \ fi @@ -75,9 +76,7 @@ RUN pip install --no-cache-dir --upgrade pip && \ ENV SHELL=/bin/bash # install special package -RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 datasets==2.1.0 ipykernel && \ - pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \ - pip config set install.trusted-host pypi.tuna.tsinghua.edu.cn +RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 datasets==2.1.0 numpy==1.18.5 ipykernel fairseq RUN if [ "$USE_GPU" = "True" ] ; then \ pip install --no-cache-dir dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html; \ From 0e52a20d2889bca5c0f8165d3013bd46de4afccc Mon Sep 17 00:00:00 2001 From: "chaojie.mcj" Date: Wed, 28 Sep 2022 14:30:37 +0800 Subject: [PATCH 23/23] [to #42322933]update license MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 以下算法进行了header变更: modelscope.models.cv.cmdssl_video_embedding modelscope.models.cv.action_recognition modelscope.models.cv.animal_recognition modelscope.models.multi_modal.multi_stage_diffusion modelscope.models.multi_modal.gemm 
modelscope.pipelines.cv.live_category_pipeline modelscope.pipelines.cv.video_category_pipeline modelscope.models.cv.image_to_image_translation modelscope.models.cv.image_to_image_generation modelscope.models.cv.video_inpainting modelscope.models.multi_modal.diffusion modelscope.models.multi_modal.team modelscope.models.cv.shop_segmentation modelscope.models.cv.text_driven_segmentation modelscope.models.cv.action_recognition modelscope.models.cv.face_emotion modelscope.models.cv.hand_static Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10268474 --- .../models/cv/action_recognition/models.py | 3 +++ modelscope/models/cv/action_recognition/s3dg.py | 3 +++ .../cv/action_recognition/tada_convnext.py | 4 ++++ .../models/cv/animal_recognition/resnet.py | 3 +++ .../models/cv/animal_recognition/splat.py | 3 +++ .../cv/cmdssl_video_embedding/__init__.py | 3 ++- .../models/cv/cmdssl_video_embedding/c3d.py | 8 ++++++++ .../cv/cmdssl_video_embedding/resnet2p1d.py | 8 ++++++++ .../cv/cmdssl_video_embedding/resnet3d.py | 8 ++++++++ .../models/cv/shop_segmentation/common.py | 14 ++++++-------- .../models/cv/shop_segmentation/head_fpn.py | 14 ++++++-------- .../models/cv/shop_segmentation/models.py | 14 ++++++-------- .../models/cv/shop_segmentation/neck_fpn.py | 14 ++++++-------- .../cv/shop_segmentation/shop_seg_base.py | 14 ++++++-------- .../cv/shop_segmentation/shop_seg_model.py | 2 ++ modelscope/models/cv/shop_segmentation/utils.py | 7 +++---- .../cv/text_driven_segmentation/__init__.py | 1 + .../models/cv/text_driven_segmentation/clip.py | 7 +++---- .../cv/text_driven_segmentation/lseg_base.py | 6 ++---- .../cv/text_driven_segmentation/lseg_blocks.py | 6 ++---- .../cv/text_driven_segmentation/lseg_model.py | 2 ++ .../cv/text_driven_segmentation/lseg_net.py | 6 ++---- .../cv/text_driven_segmentation/lseg_vit.py | 6 ++---- .../models/cv/text_driven_segmentation/model.py | 6 ++---- .../simple_tokenizer.py | 7 +++---- .../models/multi_modal/diffusion/diffusion.py | 3 +++ .../models/multi_modal/diffusion/model.py | 1 + .../multi_modal/diffusion/unet_generator.py | 3 +++ .../diffusion/unet_upsampler_1024.py | 3 +++ .../multi_modal/diffusion/unet_upsampler_256.py | 3 +++ modelscope/models/multi_modal/gemm/gemm_base.py | 17 +++++++++++------ .../models/multi_modal/gemm/gemm_model.py | 2 ++ modelscope/models/multi_modal/gemm/tokenizer.py | 12 ++++++++---- modelscope/models/multi_modal/mmr/__init__.py | 2 ++ .../mmr/dataloaders/rawvideo_util.py | 3 +++ .../models/multi_modal/mmr/models/__init__.py | 2 ++ .../mmr/models/clip_for_mm_video_embedding.py | 3 +++ .../mmr/models/dynamic_inverted_softmax.py | 3 +++ .../models/multi_modal/mmr/models/modeling.py | 2 ++ .../multi_modal/mmr/models/module_clip.py | 3 ++- .../multi_modal/mmr/models/module_cross.py | 3 +++ .../multi_modal/mmr/models/tokenization_clip.py | 3 +++ .../multi_modal/multi_stage_diffusion/clip.py | 3 ++- .../multi_stage_diffusion/decoder.py | 2 +- .../multi_stage_diffusion/gaussian_diffusion.py | 5 +++-- .../multi_modal/multi_stage_diffusion/model.py | 2 +- .../multi_modal/multi_stage_diffusion/prior.py | 2 +- .../multi_stage_diffusion/tokenizer.py | 3 ++- .../multi_stage_diffusion/upsampler.py | 2 +- .../multi_modal/multi_stage_diffusion/xglm.py | 5 +++-- .../models/multi_modal/team/team_model.py | 1 + modelscope/models/multi_modal/team/utils.py | 11 +++++++---- .../pipelines/cv/animal_recognition_pipeline.py | 1 + .../cv/cmdssl_video_embedding_pipeline.py | 2 ++ .../cv/general_recognition_pipeline.py | 1 + 
.../pipelines/cv/live_category_pipeline.py | 2 +- .../pipelines/cv/shop_segmentation_pipleline.py | 1 + .../cv/text_driven_segmentation_pipleline.py | 1 + .../pipelines/cv/video_category_pipeline.py | 2 +- ...generative_multi_modal_embedding_pipeline.py | 2 +- .../team_multi_modal_similarity_pipeline.py | 3 +-- tests/pipelines/test_cmdssl_video_embedding.py | 2 +- .../test_generative_multi_modal_embedding.py | 2 +- tests/pipelines/test_multi_modal_similarity.py | 2 +- 64 files changed, 188 insertions(+), 106 deletions(-) diff --git a/modelscope/models/cv/action_recognition/models.py b/modelscope/models/cv/action_recognition/models.py index a5964e21..f16805fb 100644 --- a/modelscope/models/cv/action_recognition/models.py +++ b/modelscope/models/cv/action_recognition/models.py @@ -1,3 +1,6 @@ +# The implementation is also open-sourced by the authors, +# and available at https://github.com/alibaba-mmai-research/TAdaConv +# Copyright 2021-2022 The Alibaba FVI Team Authors. All rights reserved. import torch.nn as nn from .s3dg import Inception3D diff --git a/modelscope/models/cv/action_recognition/s3dg.py b/modelscope/models/cv/action_recognition/s3dg.py index f258df16..46e76892 100644 --- a/modelscope/models/cv/action_recognition/s3dg.py +++ b/modelscope/models/cv/action_recognition/s3dg.py @@ -1,3 +1,6 @@ +# The implementation is adopted from https://github.com/TengdaHan/CoCLR, +# made pubicly available under the Apache License, Version 2.0 at https://github.com/TengdaHan/CoCLR +# Copyright 2021-2022 The Alibaba FVI Team Authors. All rights reserved. import torch import torch.nn as nn diff --git a/modelscope/models/cv/action_recognition/tada_convnext.py b/modelscope/models/cv/action_recognition/tada_convnext.py index 379b5271..b1de7af8 100644 --- a/modelscope/models/cv/action_recognition/tada_convnext.py +++ b/modelscope/models/cv/action_recognition/tada_convnext.py @@ -1,3 +1,7 @@ +# The implementation is adopted from https://github.com/facebookresearch/ConvNeXt, +# made pubicly available under the MIT License at https://github.com/facebookresearch/ConvNeXt +# Copyright 2021-2022 The Alibaba FVI Team Authors. All rights reserved. 
+ import math import torch diff --git a/modelscope/models/cv/animal_recognition/resnet.py b/modelscope/models/cv/animal_recognition/resnet.py index 73953de4..d7c03c29 100644 --- a/modelscope/models/cv/animal_recognition/resnet.py +++ b/modelscope/models/cv/animal_recognition/resnet.py @@ -1,3 +1,6 @@ +# The implementation is adopted from Split-Attention Network, A New ResNet Variant, +# made pubicly available under the Apache License 2.0 License +# at https://github.com/zhanghang1989/ResNeSt/blob/master/resnest/torch/models/resnet.py import math import torch diff --git a/modelscope/models/cv/animal_recognition/splat.py b/modelscope/models/cv/animal_recognition/splat.py index 0aab555e..a10d0abe 100644 --- a/modelscope/models/cv/animal_recognition/splat.py +++ b/modelscope/models/cv/animal_recognition/splat.py @@ -1,3 +1,6 @@ +# The implementation is adopted from Split-Attention Network, A New ResNet Variant, +# made pubicly available under the Apache License 2.0 License +# at https://github.com/zhanghang1989/ResNeSt/blob/master/resnest/torch/models/splat.py """Split-Attention""" import torch diff --git a/modelscope/models/cv/cmdssl_video_embedding/__init__.py b/modelscope/models/cv/cmdssl_video_embedding/__init__.py index e7e156a5..5bc67b63 100644 --- a/modelscope/models/cv/cmdssl_video_embedding/__init__.py +++ b/modelscope/models/cv/cmdssl_video_embedding/__init__.py @@ -1,4 +1,5 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. + from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule diff --git a/modelscope/models/cv/cmdssl_video_embedding/c3d.py b/modelscope/models/cv/cmdssl_video_embedding/c3d.py index 62f0e0b9..53dd05a1 100644 --- a/modelscope/models/cv/cmdssl_video_embedding/c3d.py +++ b/modelscope/models/cv/cmdssl_video_embedding/c3d.py @@ -1,3 +1,11 @@ +# Copyright 2022 Davide Abati. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. + +# The implementation here is modified based on c3d-pytorch, +# originally MIT License, Copyright (c) 2022 Davide Abati, +# and publicly available at https://github.com/DavideA/c3d-pytorch +""" C3D Model Architecture.""" + import torch import torch.nn as nn diff --git a/modelscope/models/cv/cmdssl_video_embedding/resnet2p1d.py b/modelscope/models/cv/cmdssl_video_embedding/resnet2p1d.py index 3b03cc74..b49069d1 100644 --- a/modelscope/models/cv/cmdssl_video_embedding/resnet2p1d.py +++ b/modelscope/models/cv/cmdssl_video_embedding/resnet2p1d.py @@ -1,3 +1,11 @@ +# Copyright (c) 2022 Kensho Hara. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. + +# The implementation here is modified based on 3D-ResNets-PyTorch, +# originally MIT License, Copyright (c) 2022 Kensho Hara, +# and publicly available at https://github.com/kenshohara/3D-ResNets-PyTorch/blob/master/models/resnet2p1d.py +""" ResNet2plus1d Model Architecture.""" + import torch import torch.nn as nn diff --git a/modelscope/models/cv/cmdssl_video_embedding/resnet3d.py b/modelscope/models/cv/cmdssl_video_embedding/resnet3d.py index 24d50a8e..dddba06f 100644 --- a/modelscope/models/cv/cmdssl_video_embedding/resnet3d.py +++ b/modelscope/models/cv/cmdssl_video_embedding/resnet3d.py @@ -1,3 +1,11 @@ +# Copyright (c) 2022 Kensho Hara. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
+ +# The implementation here is modified based on 3D-ResNets-PyTorch, +# originally MIT License, Copyright (c) 2022 Kensho Hara, +# and publicly available at https://github.com/kenshohara/3D-ResNets-PyTorch/blob/master/models/resnet.py +""" ResNet3D Model Architecture.""" + import torch import torch.nn as nn diff --git a/modelscope/models/cv/shop_segmentation/common.py b/modelscope/models/cv/shop_segmentation/common.py index 00ba9996..8cb940a5 100644 --- a/modelscope/models/cv/shop_segmentation/common.py +++ b/modelscope/models/cv/shop_segmentation/common.py @@ -1,11 +1,9 @@ -""" -Base modules are adapted from https://github.com/open-mmlab/mmcv/, -originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, -https://github.com/open-mmlab/mmsegmentation/, -originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, -and adapted from https://github.com/raoyongming/DenseCLIP/, -originally MIT License, Copyright (c) 2022 Rao, Yongming. -""" +# Base modules are adapted from https://github.com/open-mmlab/mmcv/, +# originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, +# https://github.com/open-mmlab/mmsegmentation/, +# originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, +# and adapted from https://github.com/raoyongming/DenseCLIP/, +# originally MIT License, Copyright (c) 2022 Rao, Yongming. import warnings diff --git a/modelscope/models/cv/shop_segmentation/head_fpn.py b/modelscope/models/cv/shop_segmentation/head_fpn.py index b3faa9b8..cad389c7 100644 --- a/modelscope/models/cv/shop_segmentation/head_fpn.py +++ b/modelscope/models/cv/shop_segmentation/head_fpn.py @@ -1,11 +1,9 @@ -""" FPNHead -Base modules are adapted from https://github.com/open-mmlab/mmcv/, -originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, -https://github.com/open-mmlab/mmsegmentation/, -originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, -and adapted from https://github.com/raoyongming/DenseCLIP/, -originally MIT License, Copyright (c) 2022 Rao, Yongming. -""" +# Base modules are adapted from https://github.com/open-mmlab/mmcv/, +# originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, +# https://github.com/open-mmlab/mmsegmentation/, +# originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, +# and adapted from https://github.com/raoyongming/DenseCLIP/, +# originally MIT License, Copyright (c) 2022 Rao, Yongming. import numpy as np import torch diff --git a/modelscope/models/cv/shop_segmentation/models.py b/modelscope/models/cv/shop_segmentation/models.py index 171aafbd..3880d074 100644 --- a/modelscope/models/cv/shop_segmentation/models.py +++ b/modelscope/models/cv/shop_segmentation/models.py @@ -1,11 +1,9 @@ -""" -Base modules are adapted from https://github.com/open-mmlab/mmcv/, -originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, -https://github.com/open-mmlab/mmsegmentation/, -originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, -and adapted from https://github.com/raoyongming/DenseCLIP/, -originally MIT License, Copyright (c) 2022 Rao, Yongming. -""" +# Base modules are adapted from https://github.com/open-mmlab/mmcv/, +# originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, +# https://github.com/open-mmlab/mmsegmentation/, +# originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, +# and adapted from https://github.com/raoyongming/DenseCLIP/, +# originally MIT License, Copyright (c) 2022 Rao, Yongming. 
import math from collections import OrderedDict diff --git a/modelscope/models/cv/shop_segmentation/neck_fpn.py b/modelscope/models/cv/shop_segmentation/neck_fpn.py index 108cb043..aa4d7159 100644 --- a/modelscope/models/cv/shop_segmentation/neck_fpn.py +++ b/modelscope/models/cv/shop_segmentation/neck_fpn.py @@ -1,11 +1,9 @@ -""" FPNneck -Base modules are adapted from https://github.com/open-mmlab/mmcv/, -originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, -https://github.com/open-mmlab/mmsegmentation/, -originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, -and adapted from https://github.com/raoyongming/DenseCLIP/, -originally MIT License, Copyright (c) 2022 Rao, Yongming. -""" +# Base modules are adapted from https://github.com/open-mmlab/mmcv/, +# originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, +# https://github.com/open-mmlab/mmsegmentation/, +# originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, +# and adapted from https://github.com/raoyongming/DenseCLIP/, +# originally MIT License, Copyright (c) 2022 Rao, Yongming. import torch.nn as nn import torch.nn.functional as F diff --git a/modelscope/models/cv/shop_segmentation/shop_seg_base.py b/modelscope/models/cv/shop_segmentation/shop_seg_base.py index e3ae0d54..34686370 100644 --- a/modelscope/models/cv/shop_segmentation/shop_seg_base.py +++ b/modelscope/models/cv/shop_segmentation/shop_seg_base.py @@ -1,11 +1,9 @@ -""" -Base modules are adapted from https://github.com/open-mmlab/mmcv/, -originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, -https://github.com/open-mmlab/mmsegmentation/, -originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, -and adapted from https://github.com/raoyongming/DenseCLIP/, -originally MIT License, Copyright (c) 2022 Rao, Yongming. -""" +# Base modules are adapted from https://github.com/open-mmlab/mmcv/, +# originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, +# https://github.com/open-mmlab/mmsegmentation/, +# originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, +# and adapted from https://github.com/raoyongming/DenseCLIP/, +# originally MIT License, Copyright (c) 2022 Rao, Yongming. import torch import torch.nn as nn diff --git a/modelscope/models/cv/shop_segmentation/shop_seg_model.py b/modelscope/models/cv/shop_segmentation/shop_seg_model.py index 0aeeb1de..ac0d67fa 100644 --- a/modelscope/models/cv/shop_segmentation/shop_seg_model.py +++ b/modelscope/models/cv/shop_segmentation/shop_seg_model.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os.path as osp from typing import Any, Dict diff --git a/modelscope/models/cv/shop_segmentation/utils.py b/modelscope/models/cv/shop_segmentation/utils.py index c41f8a65..4035b0ef 100644 --- a/modelscope/models/cv/shop_segmentation/utils.py +++ b/modelscope/models/cv/shop_segmentation/utils.py @@ -1,7 +1,6 @@ -""" CLIP Tokenizer -Adapted from https://github.com/openai/CLIP. -Originally MIT License, Copyright (c) 2021 OpenAI. -""" +# CLIP Tokenizer +# Adapted from https://github.com/openai/CLIP. +# Originally MIT License, Copyright (c) 2021 OpenAI. import gzip import html diff --git a/modelscope/models/cv/text_driven_segmentation/__init__.py b/modelscope/models/cv/text_driven_segmentation/__init__.py index 46daad78..aefaa698 100644 --- a/modelscope/models/cv/text_driven_segmentation/__init__.py +++ b/modelscope/models/cv/text_driven_segmentation/__init__.py @@ -1 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
from .lseg_base import TextDrivenSegmentation diff --git a/modelscope/models/cv/text_driven_segmentation/clip.py b/modelscope/models/cv/text_driven_segmentation/clip.py index 440cccea..1cec5f39 100644 --- a/modelscope/models/cv/text_driven_segmentation/clip.py +++ b/modelscope/models/cv/text_driven_segmentation/clip.py @@ -1,7 +1,6 @@ -""" CLIP -Adapted from https://github.com/openai/CLIP. -Originally MIT License, Copyright (c) 2021 OpenAI. -""" +# CLIP +# Adapted from https://github.com/openai/CLIP. +# Originally MIT License, Copyright (c) 2021 OpenAI. import hashlib import os diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_base.py b/modelscope/models/cv/text_driven_segmentation/lseg_base.py index 20915396..c79861a7 100644 --- a/modelscope/models/cv/text_driven_segmentation/lseg_base.py +++ b/modelscope/models/cv/text_driven_segmentation/lseg_base.py @@ -1,7 +1,5 @@ -""" -Adapted from https://github.com/isl-org/lang-seg. -Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. -""" +# Adapted from https://github.com/isl-org/lang-seg. +# Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. import torch import torch.nn as nn diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_blocks.py b/modelscope/models/cv/text_driven_segmentation/lseg_blocks.py index cb550ab7..56d4a65d 100644 --- a/modelscope/models/cv/text_driven_segmentation/lseg_blocks.py +++ b/modelscope/models/cv/text_driven_segmentation/lseg_blocks.py @@ -1,7 +1,5 @@ -""" -Adapted from https://github.com/isl-org/lang-seg. -Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. -""" +# Adapted from https://github.com/isl-org/lang-seg. +# Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. import torch import torch.nn as nn diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_model.py b/modelscope/models/cv/text_driven_segmentation/lseg_model.py index 1d7ebdd1..9a5754c6 100644 --- a/modelscope/models/cv/text_driven_segmentation/lseg_model.py +++ b/modelscope/models/cv/text_driven_segmentation/lseg_model.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os.path as osp from typing import Any, Dict diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_net.py b/modelscope/models/cv/text_driven_segmentation/lseg_net.py index 1a558c5c..541a4a38 100644 --- a/modelscope/models/cv/text_driven_segmentation/lseg_net.py +++ b/modelscope/models/cv/text_driven_segmentation/lseg_net.py @@ -1,7 +1,5 @@ -""" -Adapted from https://github.com/isl-org/lang-seg. -Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. -""" +# Adapted from https://github.com/isl-org/lang-seg. +# Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. import numpy as np import torch diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_vit.py b/modelscope/models/cv/text_driven_segmentation/lseg_vit.py index be2813c2..5298832f 100644 --- a/modelscope/models/cv/text_driven_segmentation/lseg_vit.py +++ b/modelscope/models/cv/text_driven_segmentation/lseg_vit.py @@ -1,7 +1,5 @@ -""" -Adapted from https://github.com/isl-org/lang-seg. -Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. -""" +# Adapted from https://github.com/isl-org/lang-seg. +# Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. 
import math import types diff --git a/modelscope/models/cv/text_driven_segmentation/model.py b/modelscope/models/cv/text_driven_segmentation/model.py index ece10bab..f98d480d 100644 --- a/modelscope/models/cv/text_driven_segmentation/model.py +++ b/modelscope/models/cv/text_driven_segmentation/model.py @@ -1,7 +1,5 @@ -""" -Adapted from https://github.com/isl-org/lang-seg. -Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. -""" +# Adapted from https://github.com/isl-org/lang-seg. +# Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. from collections import OrderedDict from typing import Tuple, Union diff --git a/modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py b/modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py index 250d680f..361d67c6 100644 --- a/modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py +++ b/modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py @@ -1,7 +1,6 @@ -""" CLIP -Adapted from https://github.com/openai/CLIP. -Originally MIT License, Copyright (c) 2021 OpenAI. -""" +# CLIP +# Adapted from https://github.com/openai/CLIP. +# Originally MIT License, Copyright (c) 2021 OpenAI. import gzip import html diff --git a/modelscope/models/multi_modal/diffusion/diffusion.py b/modelscope/models/multi_modal/diffusion/diffusion.py index d71fe0ae..bfe7baf7 100644 --- a/modelscope/models/multi_modal/diffusion/diffusion.py +++ b/modelscope/models/multi_modal/diffusion/diffusion.py @@ -1,3 +1,6 @@ +# Part of the implementation is borrowed and modified from latent-diffusion, +# publicly available at https://github.com/CompVis/latent-diffusion. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import math import torch diff --git a/modelscope/models/multi_modal/diffusion/model.py b/modelscope/models/multi_modal/diffusion/model.py index 8617b8dd..4229391f 100644 --- a/modelscope/models/multi_modal/diffusion/model.py +++ b/modelscope/models/multi_modal/diffusion/model.py @@ -1,3 +1,4 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import os.path as osp from typing import Any, Dict diff --git a/modelscope/models/multi_modal/diffusion/unet_generator.py b/modelscope/models/multi_modal/diffusion/unet_generator.py index 9b507223..539d3996 100644 --- a/modelscope/models/multi_modal/diffusion/unet_generator.py +++ b/modelscope/models/multi_modal/diffusion/unet_generator.py @@ -1,3 +1,6 @@ +# Part of the implementation is borrowed and modified from latent-diffusion, +# publicly available at https://github.com/CompVis/latent-diffusion. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import math import torch diff --git a/modelscope/models/multi_modal/diffusion/unet_upsampler_1024.py b/modelscope/models/multi_modal/diffusion/unet_upsampler_1024.py index 1c66b2fe..38cff6a2 100644 --- a/modelscope/models/multi_modal/diffusion/unet_upsampler_1024.py +++ b/modelscope/models/multi_modal/diffusion/unet_upsampler_1024.py @@ -1,3 +1,6 @@ +# Part of the implementation is borrowed and modified from latent-diffusion, +# publicly available at https://github.com/CompVis/latent-diffusion. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math import torch diff --git a/modelscope/models/multi_modal/diffusion/unet_upsampler_256.py b/modelscope/models/multi_modal/diffusion/unet_upsampler_256.py index 0da8b805..ca5cd7d6 100644 --- a/modelscope/models/multi_modal/diffusion/unet_upsampler_256.py +++ b/modelscope/models/multi_modal/diffusion/unet_upsampler_256.py @@ -1,3 +1,6 @@ +# Part of the implementation is borrowed and modified from latent-diffusion, +# publicly available at https://github.com/CompVis/latent-diffusion. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import math from functools import partial diff --git a/modelscope/models/multi_modal/gemm/gemm_base.py b/modelscope/models/multi_modal/gemm/gemm_base.py index db928212..09ef2480 100644 --- a/modelscope/models/multi_modal/gemm/gemm_base.py +++ b/modelscope/models/multi_modal/gemm/gemm_base.py @@ -1,9 +1,14 @@ -""" Generative Multimodal Model -Base modules are adapted from https://github.com/openai/CLIP/, -originally MIT License, Copyright (c) 2021 OpenAI, -and adapted from https://github.com/lucidrains/CoCa-pytorch/, -originally MIT License, Copyright (c) 2022 Phil Wang. -""" +# Copyright 2021 The OpenAI Team Authors. +# Copyright 2022 Phil Wang. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. +# +# The implementation here is modified based on OpenAI CLIP, +# originally MIT License, Copyright (c) 2021 OpenAI, +# and publicly available at https://github.com/openai/CLIP/. +# The implementation here is modified based on CoCa-pytorch, +# originally MIT License, Copyright (c) 2022 Phil Wang, +# and publicly available at https://github.com/lucidrains/CoCa-pytorch/. +""" Generative Multimodal Model Architecture.""" import os from collections import OrderedDict diff --git a/modelscope/models/multi_modal/gemm/gemm_model.py b/modelscope/models/multi_modal/gemm/gemm_model.py index 356dc8d3..55b211c0 100644 --- a/modelscope/models/multi_modal/gemm/gemm_model.py +++ b/modelscope/models/multi_modal/gemm/gemm_model.py @@ -1,3 +1,5 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. +""" Generative Multimodal Model Wrapper.""" import os.path as osp from typing import Any, Dict diff --git a/modelscope/models/multi_modal/gemm/tokenizer.py b/modelscope/models/multi_modal/gemm/tokenizer.py index af962ceb..8b7cc094 100644 --- a/modelscope/models/multi_modal/gemm/tokenizer.py +++ b/modelscope/models/multi_modal/gemm/tokenizer.py @@ -1,7 +1,11 @@ -""" CLIP Tokenizer -Adapted from https://github.com/openai/CLIP. -Originally MIT License, Copyright (c) 2021 OpenAI. -""" +# Copyright 2021 The OpenAI Team Authors. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. +# +# The implementation here is modified based on OpenAI CLIP, +# originally MIT License, Copyright (c) 2021 OpenAI, +# and publicly available at https://github.com/openai/CLIP/. +""" CLIP Tokenizer.""" + import gzip import html import os diff --git a/modelscope/models/multi_modal/mmr/__init__.py b/modelscope/models/multi_modal/mmr/__init__.py index c5fb7419..9dac8409 100644 --- a/modelscope/models/multi_modal/mmr/__init__.py +++ b/modelscope/models/multi_modal/mmr/__init__.py @@ -1 +1,3 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+ from .models import VideoCLIPForMultiModalEmbedding diff --git a/modelscope/models/multi_modal/mmr/dataloaders/rawvideo_util.py b/modelscope/models/multi_modal/mmr/dataloaders/rawvideo_util.py index eab1189f..c7ac3f94 100644 --- a/modelscope/models/multi_modal/mmr/dataloaders/rawvideo_util.py +++ b/modelscope/models/multi_modal/mmr/dataloaders/rawvideo_util.py @@ -1,3 +1,6 @@ +# The implementation is adopted from CLIP4Clip by Huaishao Luo, +# made publicly available under the MIT License at https://github.com/ArrowLuo/CLIP4Clip + import cv2 import numpy as np import torch as th diff --git a/modelscope/models/multi_modal/mmr/models/__init__.py b/modelscope/models/multi_modal/mmr/models/__init__.py index 6cd06bcd..da832719 100644 --- a/modelscope/models/multi_modal/mmr/models/__init__.py +++ b/modelscope/models/multi_modal/mmr/models/__init__.py @@ -1 +1,3 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. + from .clip_for_mm_video_embedding import VideoCLIPForMultiModalEmbedding diff --git a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py index 8d13e745..5e8e2e7a 100644 --- a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py +++ b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py @@ -1,3 +1,6 @@ +# The implementation is adopted from the CLIP4Clip implementation, +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip + import random from os.path import exists from typing import Any, Dict diff --git a/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py b/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py index 572f44bc..253a847c 100644 --- a/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py +++ b/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py @@ -1,3 +1,6 @@ +# The implementation is adopted from the CLIP4Clip implementation, +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip + import numpy as np diff --git a/modelscope/models/multi_modal/mmr/models/modeling.py b/modelscope/models/multi_modal/mmr/models/modeling.py index 21cc4c80..dc6510bf 100644 --- a/modelscope/models/multi_modal/mmr/models/modeling.py +++ b/modelscope/models/multi_modal/mmr/models/modeling.py @@ -1,3 +1,5 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. + import os import platform from collections import OrderedDict diff --git a/modelscope/models/multi_modal/mmr/models/module_clip.py b/modelscope/models/multi_modal/mmr/models/module_clip.py index 36e56196..53501720 100644 --- a/modelscope/models/multi_modal/mmr/models/module_clip.py +++ b/modelscope/models/multi_modal/mmr/models/module_clip.py @@ -1,4 +1,5 @@ -# Part of the implementation is borrowed and modified from The OpenAI CLIP project.
+# The implementation is adopted from the CLIP4Clip implementation, +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip import hashlib import os diff --git a/modelscope/models/multi_modal/mmr/models/module_cross.py b/modelscope/models/multi_modal/mmr/models/module_cross.py index 05edb853..b958d5bc 100644 --- a/modelscope/models/multi_modal/mmr/models/module_cross.py +++ b/modelscope/models/multi_modal/mmr/models/module_cross.py @@ -1,3 +1,6 @@ +# The implementation is adopted from the CLIP4Clip implementation, +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip + from __future__ import absolute_import, division, print_function import logging from collections import OrderedDict diff --git a/modelscope/models/multi_modal/mmr/models/tokenization_clip.py b/modelscope/models/multi_modal/mmr/models/tokenization_clip.py index ee60f857..4e2c9b15 100644 --- a/modelscope/models/multi_modal/mmr/models/tokenization_clip.py +++ b/modelscope/models/multi_modal/mmr/models/tokenization_clip.py @@ -1,3 +1,6 @@ +# The implementation is adopted from the CLIP4Clip implementation, +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip + import gzip import html import os diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/clip.py b/modelscope/models/multi_modal/multi_stage_diffusion/clip.py index 54e971f7..98727066 100644 --- a/modelscope/models/multi_modal/multi_stage_diffusion/clip.py +++ b/modelscope/models/multi_modal/multi_stage_diffusion/clip.py @@ -1,4 +1,5 @@ -# The implementation here is modified based on OpenAI CLIP, publicly available at https://github.com/openai/CLIP. +# Part of the implementation is borrowed and modified from CLIP, publicly available at https://github.com/openai/CLIP. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import math diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/decoder.py b/modelscope/models/multi_modal/multi_stage_diffusion/decoder.py index 17daedaf..eb52a48b 100644 --- a/modelscope/models/multi_modal/multi_stage_diffusion/decoder.py +++ b/modelscope/models/multi_modal/multi_stage_diffusion/decoder.py @@ -1,4 +1,4 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import math diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/gaussian_diffusion.py b/modelscope/models/multi_modal/multi_stage_diffusion/gaussian_diffusion.py index a4fc52e0..9677d7c4 100644 --- a/modelscope/models/multi_modal/multi_stage_diffusion/gaussian_diffusion.py +++ b/modelscope/models/multi_modal/multi_stage_diffusion/gaussian_diffusion.py @@ -1,5 +1,6 @@ -# The implementation here is modified based on latent diffusion, publicly available -# at https://github.com/CompVis/latent-diffusion. +# Part of the implementation is borrowed and modified from latent-diffusion, +# publicly available at https://github.com/CompVis/latent-diffusion. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import math diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/model.py b/modelscope/models/multi_modal/multi_stage_diffusion/model.py index c2d83b34..59bd837d 100644 --- a/modelscope/models/multi_modal/multi_stage_diffusion/model.py +++ b/modelscope/models/multi_modal/multi_stage_diffusion/model.py @@ -1,4 +1,4 @@ -# Copyright (c) Alibaba, Inc.
and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import math import os.path as osp diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/prior.py b/modelscope/models/multi_modal/multi_stage_diffusion/prior.py index 380fa467..9f4ef2d5 100644 --- a/modelscope/models/multi_modal/multi_stage_diffusion/prior.py +++ b/modelscope/models/multi_modal/multi_stage_diffusion/prior.py @@ -1,4 +1,4 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import math diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/tokenizer.py b/modelscope/models/multi_modal/multi_stage_diffusion/tokenizer.py index 6fd9bebe..59d6b304 100644 --- a/modelscope/models/multi_modal/multi_stage_diffusion/tokenizer.py +++ b/modelscope/models/multi_modal/multi_stage_diffusion/tokenizer.py @@ -1,4 +1,5 @@ -# The implementation here is modified based on OpenAI CLIP, publicly available at https://github.com/openai/CLIP. +# Part of the implementation is borrowed and modified from CLIP, publicly available at https://github.com/openai/CLIP. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import gzip import html diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/upsampler.py b/modelscope/models/multi_modal/multi_stage_diffusion/upsampler.py index 4e99a514..a292edae 100644 --- a/modelscope/models/multi_modal/multi_stage_diffusion/upsampler.py +++ b/modelscope/models/multi_modal/multi_stage_diffusion/upsampler.py @@ -1,4 +1,4 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import math diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/xglm.py b/modelscope/models/multi_modal/multi_stage_diffusion/xglm.py index 8a0b3ff1..133da50b 100644 --- a/modelscope/models/multi_modal/multi_stage_diffusion/xglm.py +++ b/modelscope/models/multi_modal/multi_stage_diffusion/xglm.py @@ -1,5 +1,6 @@ -# The implementation here is modified based on HuggingFace XGLM, publicly available -# at https://github.com/huggingface/transformers. +# Part of the implementation is borrowed and modified from HuggingFace XGLM, +# publicly available at https://github.com/huggingface/transformers. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import math diff --git a/modelscope/models/multi_modal/team/team_model.py b/modelscope/models/multi_modal/team/team_model.py index 4aa77e17..8c0e288a 100644 --- a/modelscope/models/multi_modal/team/team_model.py +++ b/modelscope/models/multi_modal/team/team_model.py @@ -1,3 +1,4 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. from typing import Any, Dict import cv2 diff --git a/modelscope/models/multi_modal/team/utils.py b/modelscope/models/multi_modal/team/utils.py index 3b3e394e..73919179 100644 --- a/modelscope/models/multi_modal/team/utils.py +++ b/modelscope/models/multi_modal/team/utils.py @@ -1,7 +1,10 @@ -""" Generative Multimodal Model -Base Transformer code is adapted from https://github.com/openai/CLIP/, -originally MIT License, Copyright (c) 2021 OpenAI, -""" +# Copyright 2021 The OpenAI Team Authors. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+# +# The implementation here is modified based on OpenAI CLIP, +# originally MIT License, Copyright (c) 2021 OpenAI, +# and publicly available at https://github.com/openai/CLIP/. + from collections import OrderedDict from typing import Tuple, Union diff --git a/modelscope/pipelines/cv/animal_recognition_pipeline.py b/modelscope/pipelines/cv/animal_recognition_pipeline.py index 18cba92c..fad14680 100644 --- a/modelscope/pipelines/cv/animal_recognition_pipeline.py +++ b/modelscope/pipelines/cv/animal_recognition_pipeline.py @@ -1,3 +1,4 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/cv/cmdssl_video_embedding_pipeline.py b/modelscope/pipelines/cv/cmdssl_video_embedding_pipeline.py index 9f4e2d93..deb17561 100644 --- a/modelscope/pipelines/cv/cmdssl_video_embedding_pipeline.py +++ b/modelscope/pipelines/cv/cmdssl_video_embedding_pipeline.py @@ -1,3 +1,5 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. + import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/cv/general_recognition_pipeline.py b/modelscope/pipelines/cv/general_recognition_pipeline.py index 9ba5117b..07222086 100644 --- a/modelscope/pipelines/cv/general_recognition_pipeline.py +++ b/modelscope/pipelines/cv/general_recognition_pipeline.py @@ -1,3 +1,4 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/cv/live_category_pipeline.py b/modelscope/pipelines/cv/live_category_pipeline.py index c16ba6ba..715998cc 100644 --- a/modelscope/pipelines/cv/live_category_pipeline.py +++ b/modelscope/pipelines/cv/live_category_pipeline.py @@ -1,4 +1,4 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/cv/shop_segmentation_pipleline.py b/modelscope/pipelines/cv/shop_segmentation_pipleline.py index b7fd90b4..d08058c3 100644 --- a/modelscope/pipelines/cv/shop_segmentation_pipleline.py +++ b/modelscope/pipelines/cv/shop_segmentation_pipleline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict from modelscope.metainfo import Pipelines diff --git a/modelscope/pipelines/cv/text_driven_segmentation_pipleline.py b/modelscope/pipelines/cv/text_driven_segmentation_pipleline.py index 0985b835..c7f9d4c2 100644 --- a/modelscope/pipelines/cv/text_driven_segmentation_pipleline.py +++ b/modelscope/pipelines/cv/text_driven_segmentation_pipleline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict from modelscope.metainfo import Pipelines diff --git a/modelscope/pipelines/cv/video_category_pipeline.py b/modelscope/pipelines/cv/video_category_pipeline.py index 196d3115..e4c73649 100644 --- a/modelscope/pipelines/cv/video_category_pipeline.py +++ b/modelscope/pipelines/cv/video_category_pipeline.py @@ -1,4 +1,4 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/multi_modal/generative_multi_modal_embedding_pipeline.py b/modelscope/pipelines/multi_modal/generative_multi_modal_embedding_pipeline.py index d3b9fef3..13032314 100644 --- a/modelscope/pipelines/multi_modal/generative_multi_modal_embedding_pipeline.py +++ b/modelscope/pipelines/multi_modal/generative_multi_modal_embedding_pipeline.py @@ -1,4 +1,4 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. from typing import Any, Dict diff --git a/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py b/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py index fc123e2f..cafd6555 100644 --- a/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py +++ b/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py @@ -1,5 +1,4 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. from typing import Any, Dict from modelscope.metainfo import Pipelines diff --git a/tests/pipelines/test_cmdssl_video_embedding.py b/tests/pipelines/test_cmdssl_video_embedding.py index 68eae385..5807c075 100644 --- a/tests/pipelines/test_cmdssl_video_embedding.py +++ b/tests/pipelines/test_cmdssl_video_embedding.py @@ -1,4 +1,4 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. # !/usr/bin/env python import unittest diff --git a/tests/pipelines/test_generative_multi_modal_embedding.py b/tests/pipelines/test_generative_multi_modal_embedding.py index 9232ebd4..7061d736 100644 --- a/tests/pipelines/test_generative_multi_modal_embedding.py +++ b/tests/pipelines/test_generative_multi_modal_embedding.py @@ -1,4 +1,4 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import unittest diff --git a/tests/pipelines/test_multi_modal_similarity.py b/tests/pipelines/test_multi_modal_similarity.py index 192602b4..a54fbcf0 100644 --- a/tests/pipelines/test_multi_modal_similarity.py +++ b/tests/pipelines/test_multi_modal_similarity.py @@ -1,4 +1,4 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import unittest