Mirror of https://github.com/modelscope/modelscope.git (synced 2025-12-16 16:27:45 +01:00)
[to #42322933] tts sambert AM changes from TensorFlow to PyTorch and add licenses
* [to #41669377] docs and tools refinement and release
  1. add build_doc linter script
  2. add sphinx-docs support
  3. add development doc and api doc
  4. change version to 0.1.0 for the first internal release version
  Link: https://code.aone.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/8775307
modelscope/models/audio/tts/models/__init__.py (9 lines changed, Executable file → Normal file)
@@ -1,9 +0,0 @@
from .robutrans import RobuTrans
from .vocoder_models import Generator


def create_am_model(name, hparams):
    if name == 'robutrans':
        return RobuTrans(hparams)
    else:
        raise Exception('Unknown model: ' + name)
@@ -1,460 +0,0 @@
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
def encoder_prenet(inputs,
|
||||
n_conv_layers,
|
||||
filters,
|
||||
kernel_size,
|
||||
dense_units,
|
||||
is_training,
|
||||
mask=None,
|
||||
scope='encoder_prenet'):
|
||||
x = inputs
|
||||
with tf.variable_scope(scope):
|
||||
for i in range(n_conv_layers):
|
||||
x = conv1d(
|
||||
x,
|
||||
filters,
|
||||
kernel_size,
|
||||
is_training,
|
||||
activation=tf.nn.relu,
|
||||
dropout=True,
|
||||
mask=mask,
|
||||
scope='conv1d_{}'.format(i))
|
||||
x = tf.layers.dense(
|
||||
x, units=dense_units, activation=None, name='dense')
|
||||
return x
|
||||
|
||||
|
||||
def decoder_prenet(inputs,
|
||||
prenet_units,
|
||||
dense_units,
|
||||
is_training,
|
||||
scope='decoder_prenet'):
|
||||
x = inputs
|
||||
with tf.variable_scope(scope):
|
||||
for i, units in enumerate(prenet_units):
|
||||
x = tf.layers.dense(
|
||||
x,
|
||||
units=units,
|
||||
activation=tf.nn.relu,
|
||||
name='dense_{}'.format(i))
|
||||
x = tf.layers.dropout(
|
||||
x, rate=0.5, training=is_training, name='dropout_{}'.format(i))
|
||||
x = tf.layers.dense(
|
||||
x, units=dense_units, activation=None, name='dense')
|
||||
return x
|
||||
|
||||
|
||||
def encoder(inputs,
|
||||
input_lengths,
|
||||
n_conv_layers,
|
||||
filters,
|
||||
kernel_size,
|
||||
lstm_units,
|
||||
is_training,
|
||||
embedded_inputs_speaker,
|
||||
mask=None,
|
||||
scope='encoder'):
|
||||
with tf.variable_scope(scope):
|
||||
x = conv_and_lstm(
|
||||
inputs,
|
||||
input_lengths,
|
||||
n_conv_layers,
|
||||
filters,
|
||||
kernel_size,
|
||||
lstm_units,
|
||||
is_training,
|
||||
embedded_inputs_speaker,
|
||||
mask=mask)
|
||||
return x
|
||||
|
||||
|
||||
def prenet(inputs, prenet_units, is_training, scope='prenet'):
|
||||
x = inputs
|
||||
with tf.variable_scope(scope):
|
||||
for i, units in enumerate(prenet_units):
|
||||
x = tf.layers.dense(
|
||||
x,
|
||||
units=units,
|
||||
activation=tf.nn.relu,
|
||||
name='dense_{}'.format(i))
|
||||
x = tf.layers.dropout(
|
||||
x, rate=0.5, training=is_training, name='dropout_{}'.format(i))
|
||||
return x
|
||||
|
||||
|
||||
def postnet_residual_ulstm(inputs,
|
||||
n_conv_layers,
|
||||
filters,
|
||||
kernel_size,
|
||||
lstm_units,
|
||||
output_units,
|
||||
is_training,
|
||||
scope='postnet_residual_ulstm'):
|
||||
with tf.variable_scope(scope):
|
||||
x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size,
|
||||
lstm_units, is_training)
|
||||
x = conv1d(
|
||||
x,
|
||||
output_units,
|
||||
kernel_size,
|
||||
is_training,
|
||||
activation=None,
|
||||
dropout=False,
|
||||
scope='conv1d_{}'.format(n_conv_layers - 1))
|
||||
return x
|
||||
|
||||
|
||||
def postnet_residual_lstm(inputs,
|
||||
n_conv_layers,
|
||||
filters,
|
||||
kernel_size,
|
||||
lstm_units,
|
||||
output_units,
|
||||
is_training,
|
||||
scope='postnet_residual_lstm'):
|
||||
with tf.variable_scope(scope):
|
||||
x = conv_and_lstm(inputs, None, n_conv_layers, filters, kernel_size,
|
||||
lstm_units, is_training)
|
||||
x = conv1d(
|
||||
x,
|
||||
output_units,
|
||||
kernel_size,
|
||||
is_training,
|
||||
activation=None,
|
||||
dropout=False,
|
||||
scope='conv1d_{}'.format(n_conv_layers - 1))
|
||||
return x
|
||||
|
||||
|
||||
def postnet_linear_ulstm(inputs,
|
||||
n_conv_layers,
|
||||
filters,
|
||||
kernel_size,
|
||||
lstm_units,
|
||||
output_units,
|
||||
is_training,
|
||||
scope='postnet_linear'):
|
||||
with tf.variable_scope(scope):
|
||||
x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size,
|
||||
lstm_units, is_training)
|
||||
x = tf.layers.dense(x, units=output_units)
|
||||
return x
|
||||
|
||||
|
||||
def postnet_linear_lstm(inputs,
|
||||
n_conv_layers,
|
||||
filters,
|
||||
kernel_size,
|
||||
lstm_units,
|
||||
output_units,
|
||||
output_lengths,
|
||||
is_training,
|
||||
embedded_inputs_speaker2,
|
||||
mask=None,
|
||||
scope='postnet_linear'):
|
||||
with tf.variable_scope(scope):
|
||||
x = conv_and_lstm_dec(
|
||||
inputs,
|
||||
output_lengths,
|
||||
n_conv_layers,
|
||||
filters,
|
||||
kernel_size,
|
||||
lstm_units,
|
||||
is_training,
|
||||
embedded_inputs_speaker2,
|
||||
mask=mask)
|
||||
x = tf.layers.dense(x, units=output_units)
|
||||
return x
|
||||
|
||||
|
||||
def postnet_linear(inputs,
|
||||
n_conv_layers,
|
||||
filters,
|
||||
kernel_size,
|
||||
lstm_units,
|
||||
output_units,
|
||||
output_lengths,
|
||||
is_training,
|
||||
embedded_inputs_speaker2,
|
||||
mask=None,
|
||||
scope='postnet_linear'):
|
||||
with tf.variable_scope(scope):
|
||||
x = conv_dec(
|
||||
inputs,
|
||||
output_lengths,
|
||||
n_conv_layers,
|
||||
filters,
|
||||
kernel_size,
|
||||
lstm_units,
|
||||
is_training,
|
||||
embedded_inputs_speaker2,
|
||||
mask=mask)
|
||||
return x
|
||||
|
||||
|
||||
def conv_and_lstm(inputs,
|
||||
sequence_lengths,
|
||||
n_conv_layers,
|
||||
filters,
|
||||
kernel_size,
|
||||
lstm_units,
|
||||
is_training,
|
||||
embedded_inputs_speaker,
|
||||
mask=None,
|
||||
scope='conv_and_lstm'):
|
||||
from tensorflow.contrib.rnn import LSTMBlockCell
|
||||
x = inputs
|
||||
with tf.variable_scope(scope):
|
||||
for i in range(n_conv_layers):
|
||||
x = conv1d(
|
||||
x,
|
||||
filters,
|
||||
kernel_size,
|
||||
is_training,
|
||||
activation=tf.nn.relu,
|
||||
dropout=True,
|
||||
mask=mask,
|
||||
scope='conv1d_{}'.format(i))
|
||||
|
||||
x = tf.concat([x, embedded_inputs_speaker], axis=2)
|
||||
|
||||
outputs, states = tf.nn.bidirectional_dynamic_rnn(
|
||||
LSTMBlockCell(lstm_units),
|
||||
LSTMBlockCell(lstm_units),
|
||||
x,
|
||||
sequence_length=sequence_lengths,
|
||||
dtype=tf.float32)
|
||||
x = tf.concat(outputs, axis=-1)
|
||||
|
||||
return x
|
||||
|
||||
|
||||
def conv_and_lstm_dec(inputs,
|
||||
sequence_lengths,
|
||||
n_conv_layers,
|
||||
filters,
|
||||
kernel_size,
|
||||
lstm_units,
|
||||
is_training,
|
||||
embedded_inputs_speaker2,
|
||||
mask=None,
|
||||
scope='conv_and_lstm'):
|
||||
x = inputs
|
||||
from tensorflow.contrib.rnn import LSTMBlockCell
|
||||
with tf.variable_scope(scope):
|
||||
for i in range(n_conv_layers):
|
||||
x = conv1d(
|
||||
x,
|
||||
filters,
|
||||
kernel_size,
|
||||
is_training,
|
||||
activation=tf.nn.relu,
|
||||
dropout=True,
|
||||
mask=mask,
|
||||
scope='conv1d_{}'.format(i))
|
||||
|
||||
x = tf.concat([x, embedded_inputs_speaker2], axis=2)
|
||||
|
||||
outputs, states = tf.nn.bidirectional_dynamic_rnn(
|
||||
LSTMBlockCell(lstm_units),
|
||||
LSTMBlockCell(lstm_units),
|
||||
x,
|
||||
sequence_length=sequence_lengths,
|
||||
dtype=tf.float32)
|
||||
x = tf.concat(outputs, axis=-1)
|
||||
return x
|
||||
|
||||
|
||||
def conv_dec(inputs,
|
||||
sequence_lengths,
|
||||
n_conv_layers,
|
||||
filters,
|
||||
kernel_size,
|
||||
lstm_units,
|
||||
is_training,
|
||||
embedded_inputs_speaker2,
|
||||
mask=None,
|
||||
scope='conv_and_lstm'):
|
||||
x = inputs
|
||||
with tf.variable_scope(scope):
|
||||
for i in range(n_conv_layers):
|
||||
x = conv1d(
|
||||
x,
|
||||
filters,
|
||||
kernel_size,
|
||||
is_training,
|
||||
activation=tf.nn.relu,
|
||||
dropout=True,
|
||||
mask=mask,
|
||||
scope='conv1d_{}'.format(i))
|
||||
x = tf.concat([x, embedded_inputs_speaker2], axis=2)
|
||||
return x
|
||||
|
||||
|
||||
def conv_and_ulstm(inputs,
|
||||
sequence_lengths,
|
||||
n_conv_layers,
|
||||
filters,
|
||||
kernel_size,
|
||||
lstm_units,
|
||||
is_training,
|
||||
scope='conv_and_ulstm'):
|
||||
x = inputs
|
||||
with tf.variable_scope(scope):
|
||||
for i in range(n_conv_layers):
|
||||
x = conv1d(
|
||||
x,
|
||||
filters,
|
||||
kernel_size,
|
||||
is_training,
|
||||
activation=tf.nn.relu,
|
||||
dropout=True,
|
||||
scope='conv1d_{}'.format(i))
|
||||
|
||||
outputs, states = tf.nn.dynamic_rnn(
|
||||
LSTMBlockCell(lstm_units),
|
||||
x,
|
||||
sequence_length=sequence_lengths,
|
||||
dtype=tf.float32)
|
||||
|
||||
return outputs
|
||||
|
||||
|
||||
def conv1d(inputs,
|
||||
filters,
|
||||
kernel_size,
|
||||
is_training,
|
||||
activation=None,
|
||||
dropout=False,
|
||||
mask=None,
|
||||
scope='conv1d'):
|
||||
with tf.variable_scope(scope):
|
||||
if mask is not None:
|
||||
inputs = inputs * tf.expand_dims(mask, -1)
|
||||
x = tf.layers.conv1d(
|
||||
inputs, filters=filters, kernel_size=kernel_size, padding='same')
|
||||
if mask is not None:
|
||||
x = x * tf.expand_dims(mask, -1)
|
||||
|
||||
x = tf.layers.batch_normalization(x, training=is_training)
|
||||
if activation is not None:
|
||||
x = activation(x)
|
||||
if dropout:
|
||||
x = tf.layers.dropout(x, rate=0.5, training=is_training)
|
||||
return x
|
||||
|
||||
|
||||
def conv1d_dp(inputs,
|
||||
filters,
|
||||
kernel_size,
|
||||
is_training,
|
||||
activation=None,
|
||||
dropout=False,
|
||||
dropoutrate=0.5,
|
||||
mask=None,
|
||||
scope='conv1d'):
|
||||
with tf.variable_scope(scope):
|
||||
if mask is not None:
|
||||
inputs = inputs * tf.expand_dims(mask, -1)
|
||||
x = tf.layers.conv1d(
|
||||
inputs, filters=filters, kernel_size=kernel_size, padding='same')
|
||||
if mask is not None:
|
||||
x = x * tf.expand_dims(mask, -1)
|
||||
|
||||
x = tf.contrib.layers.layer_norm(x)
|
||||
if activation is not None:
|
||||
x = activation(x)
|
||||
if dropout:
|
||||
x = tf.layers.dropout(x, rate=dropoutrate, training=is_training)
|
||||
return x
|
||||
|
||||
|
||||
def duration_predictor(inputs,
|
||||
n_conv_layers,
|
||||
filters,
|
||||
kernel_size,
|
||||
lstm_units,
|
||||
input_lengths,
|
||||
is_training,
|
||||
embedded_inputs_speaker,
|
||||
mask=None,
|
||||
scope='duration_predictor'):
|
||||
with tf.variable_scope(scope):
|
||||
x = inputs
|
||||
for i in range(n_conv_layers):
|
||||
x = conv1d_dp(
|
||||
x,
|
||||
filters,
|
||||
kernel_size,
|
||||
is_training,
|
||||
activation=tf.nn.relu,
|
||||
dropout=True,
|
||||
dropoutrate=0.1,
|
||||
mask=mask,
|
||||
scope='conv1d_{}'.format(i))
|
||||
|
||||
x = tf.concat([x, embedded_inputs_speaker], axis=2)
|
||||
|
||||
outputs, states = tf.nn.bidirectional_dynamic_rnn(
|
||||
LSTMBlockCell(lstm_units),
|
||||
LSTMBlockCell(lstm_units),
|
||||
x,
|
||||
sequence_length=input_lengths,
|
||||
dtype=tf.float32)
|
||||
x = tf.concat(outputs, axis=-1)
|
||||
|
||||
x = tf.layers.dense(x, units=1)
|
||||
x = tf.nn.relu(x)
|
||||
return x
|
||||
|
||||
|
||||
def duration_predictor2(inputs,
|
||||
n_conv_layers,
|
||||
filters,
|
||||
kernel_size,
|
||||
input_lengths,
|
||||
is_training,
|
||||
mask=None,
|
||||
scope='duration_predictor'):
|
||||
with tf.variable_scope(scope):
|
||||
x = inputs
|
||||
for i in range(n_conv_layers):
|
||||
x = conv1d_dp(
|
||||
x,
|
||||
filters,
|
||||
kernel_size,
|
||||
is_training,
|
||||
activation=tf.nn.relu,
|
||||
dropout=True,
|
||||
dropoutrate=0.1,
|
||||
mask=mask,
|
||||
scope='conv1d_{}'.format(i))
|
||||
|
||||
x = tf.layers.dense(x, units=1)
|
||||
x = tf.nn.relu(x)
|
||||
return x
|
||||
|
||||
|
||||
def conv_prenet(inputs,
|
||||
n_conv_layers,
|
||||
filters,
|
||||
kernel_size,
|
||||
is_training,
|
||||
mask=None,
|
||||
scope='conv_prenet'):
|
||||
x = inputs
|
||||
with tf.variable_scope(scope):
|
||||
for i in range(n_conv_layers):
|
||||
x = conv1d(
|
||||
x,
|
||||
filters,
|
||||
kernel_size,
|
||||
is_training,
|
||||
activation=tf.nn.relu,
|
||||
dropout=True,
|
||||
mask=mask,
|
||||
scope='conv1d_{}'.format(i))
|
||||
|
||||
return x
|
||||
@@ -1,82 +0,0 @@
"""Functions for compatibility with different TensorFlow versions."""

import tensorflow as tf


def is_tf2():
    """Returns ``True`` if running TensorFlow 2.0."""
    return tf.__version__.startswith('2')


def tf_supports(symbol):
    """Returns ``True`` if TensorFlow defines :obj:`symbol`."""
    return _string_to_tf_symbol(symbol) is not None


def tf_any(*symbols):
    """Returns the first supported symbol."""
    for symbol in symbols:
        module = _string_to_tf_symbol(symbol)
        if module is not None:
            return module
    return None


def tf_compat(v2=None, v1=None):  # pylint: disable=invalid-name
    """Returns the compatible symbol based on the current TensorFlow version.

    Args:
      v2: The candidate v2 symbol name.
      v1: The candidate v1 symbol name.

    Returns:
      A TensorFlow symbol.

    Raises:
      ValueError: if no symbol can be found.
    """
    candidates = []
    if v2 is not None:
        candidates.append(v2)
    if v1 is not None:
        candidates.append(v1)
        candidates.append('compat.v1.%s' % v1)
    symbol = tf_any(*candidates)
    if symbol is None:
        raise ValueError('Failure to resolve the TensorFlow symbol')
    return symbol


def name_from_variable_scope(name=''):
    """Creates a name prefixed by the current variable scope."""
    var_scope = tf_compat(v1='get_variable_scope')().name
    compat_name = ''
    if name:
        compat_name = '%s/' % name
    if var_scope:
        compat_name = '%s/%s' % (var_scope, compat_name)
    return compat_name


def reuse():
    """Returns ``True`` if the current variable scope is marked for reuse."""
    return tf_compat(v1='get_variable_scope')().reuse


def _string_to_tf_symbol(symbol):
    modules = symbol.split('.')
    namespace = tf
    for module in modules:
        namespace = getattr(namespace, module, None)
        if namespace is None:
            return None
    return namespace


# pylint: disable=invalid-name
gfile_copy = tf_compat(v2='io.gfile.copy', v1='gfile.Copy')
gfile_exists = tf_compat(v2='io.gfile.exists', v1='gfile.Exists')
gfile_open = tf_compat(v2='io.gfile.GFile', v1='gfile.GFile')
is_tensor = tf_compat(v2='is_tensor', v1='contrib.framework.is_tensor')
logging = tf_compat(v1='logging')
nest = tf_compat(v2='nest', v1='contrib.framework.nest')
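For orientation, a small illustration (not part of the diff) of how this shim, which the commit removes, resolves a symbol; the candidate names are the ones already used at module level above:

# Resolves to tf.io.gfile.copy on TF2 installs; falls back to tf.gfile.Copy
# or tf.compat.v1.gfile.Copy on TF1 installs.
copy_fn = tf_compat(v2='io.gfile.copy', v1='gfile.Copy')

# tf_any walks the candidate names in order and returns the first attribute
# chain that exists on the installed tensorflow module, else None.
nest_mod = tf_any('nest', 'contrib.framework.nest')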
modelscope/models/audio/tts/text/__init__.py → modelscope/models/audio/tts/models/datasets/__init__.py (0 lines changed, Executable file → Normal file)
modelscope/models/audio/tts/models/datasets/kantts_data4fs.py (238 lines changed, Normal file)
@@ -0,0 +1,238 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os

import json
import numpy as np
import torch
from torch.utils.data import Dataset
from tqdm import tqdm

from modelscope.utils.logger import get_logger
from .units import KanTtsLinguisticUnit

logger = get_logger()


class KanTtsText2MelDataset(Dataset):

    def __init__(self, metadata_filename, config_filename, cache=False):
        super(KanTtsText2MelDataset, self).__init__()

        self.cache = cache

        with open(config_filename) as f:
            self._config = json.loads(f.read())

        # Load metadata:
        self._datadir = os.path.dirname(metadata_filename)
        with open(metadata_filename, encoding='utf-8') as f:
            self._metadata = [line.strip().split('|') for line in f]
        self._length_lst = [int(x[2]) for x in self._metadata]
        hours = sum(
            self._length_lst) * self._config['audio']['frame_shift_ms'] / (
                3600 * 1000)

        logger.info('Loaded metadata for %d examples (%.2f hours)' %
                    (len(self._metadata), hours))
        logger.info('Minimum length: %d, Maximum length: %d' %
                    (min(self._length_lst), max(self._length_lst)))

        self.ling_unit = KanTtsLinguisticUnit(config_filename)
        self.pad_executor = KanTtsText2MelPad()

        self.r = self._config['am']['outputs_per_step']
        self.num_mels = self._config['am']['num_mels']

        if 'adv' in self._config:
            self.feat_window = self._config['adv']['random_window']
        else:
            self.feat_window = None
        logger.info(self.feat_window)

        self.data_cache = [
            self.cache_load(i) for i in tqdm(range(self.__len__()))
        ] if self.cache else []

    def get_frames_lst(self):
        return self._length_lst

    def __getitem__(self, index):
        if self.cache:
            sample = self.data_cache[index]
            return sample

        return self.cache_load(index)

    def cache_load(self, index):
        sample = {}

        meta = self._metadata[index]

        sample['utt_id'] = meta[0]

        sample['mel_target'] = np.load(os.path.join(
            self._datadir, meta[1]))[:, :self.num_mels]
        sample['output_length'] = len(sample['mel_target'])

        lfeat_symbol = meta[3]
        sample['ling'] = self.ling_unit.encode_symbol_sequence(lfeat_symbol)

        sample['duration'] = np.load(os.path.join(self._datadir, meta[4]))

        sample['pitch_contour'] = np.load(os.path.join(self._datadir, meta[5]))

        sample['energy_contour'] = np.load(
            os.path.join(self._datadir, meta[6]))

        return sample

    def __len__(self):
        return len(self._metadata)

    def collate_fn(self, batch):
        data_dict = {}

        max_input_length = max((len(x['ling'][0]) for x in batch))

        # pure linguistic info: sy|tone|syllable_flag|word_segment

        # sy
        lfeat_type = self.ling_unit._lfeat_type_list[0]
        inputs_sy = self.pad_executor._prepare_scalar_inputs(
            [x['ling'][0] for x in batch], max_input_length,
            self.ling_unit._sub_unit_pad[lfeat_type]).long()
        # tone
        lfeat_type = self.ling_unit._lfeat_type_list[1]
        inputs_tone = self.pad_executor._prepare_scalar_inputs(
            [x['ling'][1] for x in batch], max_input_length,
            self.ling_unit._sub_unit_pad[lfeat_type]).long()

        # syllable_flag
        lfeat_type = self.ling_unit._lfeat_type_list[2]
        inputs_syllable_flag = self.pad_executor._prepare_scalar_inputs(
            [x['ling'][2] for x in batch], max_input_length,
            self.ling_unit._sub_unit_pad[lfeat_type]).long()

        # word_segment
        lfeat_type = self.ling_unit._lfeat_type_list[3]
        inputs_ws = self.pad_executor._prepare_scalar_inputs(
            [x['ling'][3] for x in batch], max_input_length,
            self.ling_unit._sub_unit_pad[lfeat_type]).long()

        # emotion category
        lfeat_type = self.ling_unit._lfeat_type_list[4]
        data_dict['input_emotions'] = self.pad_executor._prepare_scalar_inputs(
            [x['ling'][4] for x in batch], max_input_length,
            self.ling_unit._sub_unit_pad[lfeat_type]).long()

        # speaker category
        lfeat_type = self.ling_unit._lfeat_type_list[5]
        data_dict['input_speakers'] = self.pad_executor._prepare_scalar_inputs(
            [x['ling'][5] for x in batch], max_input_length,
            self.ling_unit._sub_unit_pad[lfeat_type]).long()

        data_dict['input_lings'] = torch.stack(
            [inputs_sy, inputs_tone, inputs_syllable_flag, inputs_ws], dim=2)

        data_dict['valid_input_lengths'] = torch.as_tensor(
            [len(x['ling'][0]) - 1 for x in batch], dtype=torch.long
        )  # There is one '~' in the last of symbol sequence. We put length-1 for calculation.

        data_dict['valid_output_lengths'] = torch.as_tensor(
            [x['output_length'] for x in batch], dtype=torch.long)
        max_output_length = torch.max(data_dict['valid_output_lengths']).item()
        max_output_round_length = self.pad_executor._round_up(
            max_output_length, self.r)

        if self.feat_window is not None:
            active_feat_len = np.minimum(max_output_round_length,
                                         self.feat_window)
            if active_feat_len < self.feat_window:
                max_output_round_length = self.pad_executor._round_up(
                    self.feat_window, self.r)
                active_feat_len = self.feat_window

            max_offsets = [x['output_length'] - active_feat_len for x in batch]
            feat_offsets = [
                np.random.randint(0, np.maximum(1, offset))
                for offset in max_offsets
            ]
            feat_offsets = torch.from_numpy(
                np.asarray(feat_offsets, dtype=np.int32)).long()
            data_dict['feat_offsets'] = feat_offsets

        data_dict['mel_targets'] = self.pad_executor._prepare_targets(
            [x['mel_target'] for x in batch], max_output_round_length, 0.0)
        data_dict['durations'] = self.pad_executor._prepare_durations(
            [x['duration'] for x in batch], max_input_length,
            max_output_round_length)

        data_dict['pitch_contours'] = self.pad_executor._prepare_scalar_inputs(
            [x['pitch_contour'] for x in batch], max_input_length,
            0.0).float()
        data_dict[
            'energy_contours'] = self.pad_executor._prepare_scalar_inputs(
                [x['energy_contour'] for x in batch], max_input_length,
                0.0).float()

        data_dict['utt_ids'] = [x['utt_id'] for x in batch]

        return data_dict


class KanTtsText2MelPad(object):

    def __init__(self):
        super(KanTtsText2MelPad, self).__init__()
        pass

    def _pad1D(self, x, length, pad):
        return np.pad(
            x, (0, length - x.shape[0]), mode='constant', constant_values=pad)

    def _pad2D(self, x, length, pad):
        return np.pad(
            x, [(0, length - x.shape[0]), (0, 0)],
            mode='constant',
            constant_values=pad)

    def _pad_durations(self, duration, max_in_len, max_out_len):
        framenum = np.sum(duration)
        symbolnum = duration.shape[0]
        if framenum < max_out_len:
            padframenum = max_out_len - framenum
            duration = np.insert(
                duration, symbolnum, values=padframenum, axis=0)
            duration = np.insert(
                duration,
                symbolnum + 1,
                values=[0] * (max_in_len - symbolnum - 1),
                axis=0)
        else:
            if symbolnum < max_in_len:
                duration = np.insert(
                    duration,
                    symbolnum,
                    values=[0] * (max_in_len - symbolnum),
                    axis=0)
        return duration

    def _round_up(self, x, multiple):
        remainder = x % multiple
        return x if remainder == 0 else x + multiple - remainder

    def _prepare_scalar_inputs(self, inputs, max_len, pad):
        return torch.from_numpy(
            np.stack([self._pad1D(x, max_len, pad) for x in inputs]))

    def _prepare_targets(self, targets, max_len, pad):
        return torch.from_numpy(
            np.stack([self._pad2D(t, max_len, pad) for t in targets])).float()

    def _prepare_durations(self, durations, max_in_len, max_out_len):
        return torch.from_numpy(
            np.stack([
                self._pad_durations(t, max_in_len, max_out_len)
                for t in durations
            ])).long()
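A hedged usage sketch (not part of the diff) of how this dataset and its collate_fn would typically be wired into a PyTorch DataLoader; the metadata and config paths are placeholders:

from torch.utils.data import DataLoader

# Hypothetical paths; the metadata file is the '|'-separated list read above.
dataset = KanTtsText2MelDataset('train/metadata.txt', 'train/config.json')

loader = DataLoader(
    dataset,
    batch_size=16,
    shuffle=True,
    num_workers=4,
    collate_fn=dataset.collate_fn)  # pads every item to the batch maximum

for batch in loader:
    mel = batch['mel_targets']    # [B, T_out rounded up to outputs_per_step, num_mels]
    lings = batch['input_lings']  # [B, T_in, 4] stacked sy/tone/flag/ws ids
    break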
modelscope/models/audio/tts/models/datasets/samplers.py (131 lines changed, Normal file)
@@ -0,0 +1,131 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import math
import random

import torch
from torch import distributed as dist
from torch.utils.data import Sampler


class LenSortGroupPoolSampler(Sampler):

    def __init__(self, data_source, length_lst, group_size):
        super(LenSortGroupPoolSampler, self).__init__(data_source)

        self.data_source = data_source
        self.length_lst = length_lst
        self.group_size = group_size

        self.num = len(self.length_lst)
        self.buckets = self.num // group_size

    def __iter__(self):

        def getkey(item):
            return item[1]

        random_lst = torch.randperm(self.num).tolist()
        random_len_lst = [(i, self.length_lst[i]) for i in random_lst]

        # Bucket examples based on similar output sequence length for efficiency:
        groups = [
            random_len_lst[i:i + self.group_size]
            for i in range(0, self.num, self.group_size)
        ]
        if (self.num % self.group_size):
            groups.append(random_len_lst[self.buckets * self.group_size:-1])

        indices = []

        for group in groups:
            group.sort(key=getkey, reverse=True)
            for item in group:
                indices.append(item[0])

        return iter(indices)

    def __len__(self):
        return len(self.data_source)


class DistributedLenSortGroupPoolSampler(Sampler):

    def __init__(self,
                 dataset,
                 length_lst,
                 group_size,
                 num_replicas=None,
                 rank=None,
                 shuffle=True):
        super(DistributedLenSortGroupPoolSampler, self).__init__(dataset)

        if num_replicas is None:
            if not dist.is_available():
                raise RuntimeError(
                    'modelscope error: Requires distributed package to be available'
                )
            num_replicas = dist.get_world_size()
        if rank is None:
            if not dist.is_available():
                raise RuntimeError(
                    'modelscope error: Requires distributed package to be available'
                )
            rank = dist.get_rank()
        self.dataset = dataset
        self.length_lst = length_lst
        self.group_size = group_size
        self.num_replicas = num_replicas
        self.rank = rank
        self.epoch = 0
        self.num_samples = int(
            math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
        self.total_size = self.num_samples * self.num_replicas
        self.buckets = self.num_samples // group_size
        self.shuffle = shuffle

    def __iter__(self):

        def getkey(item):
            return item[1]

        # deterministically shuffle based on epoch
        g = torch.Generator()
        g.manual_seed(self.epoch)
        if self.shuffle:
            indices = torch.randperm(len(self.dataset), generator=g).tolist()
        else:
            indices = list(range(len(self.dataset)))

        # add extra samples to make it evenly divisible
        indices += indices[:(self.total_size - len(indices))]
        assert len(indices) == self.total_size

        # subsample
        indices = indices[self.rank:self.total_size:self.num_replicas]
        assert len(indices) == self.num_samples

        random_len_lst = [(i, self.length_lst[i]) for i in indices]

        # Bucket examples based on similar output sequence length for efficiency:
        groups = [
            random_len_lst[i:i + self.group_size]
            for i in range(0, self.num_samples, self.group_size)
        ]
        if (self.num_samples % self.group_size):
            groups.append(random_len_lst[self.buckets * self.group_size:-1])

        new_indices = []

        for group in groups:
            group.sort(key=getkey, reverse=True)
            for item in group:
                new_indices.append(item[0])

        return iter(new_indices)

    def __len__(self):
        return self.num_samples

    def set_epoch(self, epoch):
        self.epoch = epoch
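A short sketch (not from the diff) of how the group-sort sampler is meant to be used: shuffle globally, then sort within fixed-size groups so each batch sees utterances of similar length. The dataset object and group size below are placeholders:

from torch.utils.data import DataLoader

lengths = dataset.get_frames_lst()            # per-utterance frame counts
sampler = LenSortGroupPoolSampler(dataset, lengths, group_size=512)

loader = DataLoader(
    dataset,
    batch_size=16,
    sampler=sampler,                          # a sampler replaces shuffle=True
    collate_fn=dataset.collate_fn)

# In a DDP run one would use DistributedLenSortGroupPoolSampler instead and
# call sampler.set_epoch(epoch) at the start of every epoch for re-shuffling.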
@@ -0,0 +1,3 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from .ling_unit import *  # noqa F403
@@ -0,0 +1,88 @@
# from https://github.com/keithito/tacotron
# Cleaners are transformations that run over the input text at both training and eval time.
#
# Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
# hyperparameter. Some cleaners are English-specific. You'll typically want to use:
#   1. "english_cleaners" for English text
#   2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
#      the Unidecode library (https://pypi.python.org/pypi/Unidecode)
#   3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
#      the symbols in symbols.py to match your data).

import re

from unidecode import unidecode

from .numbers import normalize_numbers

# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')

# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [
    (re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
    for x in [('mrs', 'misess'),
              ('mr', 'mister'),
              ('dr', 'doctor'),
              ('st', 'saint'),
              ('co', 'company'),
              ('jr', 'junior'),
              ('maj', 'major'),
              ('gen', 'general'),
              ('drs', 'doctors'),
              ('rev', 'reverend'),
              ('lt', 'lieutenant'),
              ('hon', 'honorable'),
              ('sgt', 'sergeant'),
              ('capt', 'captain'),
              ('esq', 'esquire'),
              ('ltd', 'limited'),
              ('col', 'colonel'),
              ('ft', 'fort'), ]]  # yapf:disable


def expand_abbreviations(text):
    for regex, replacement in _abbreviations:
        text = re.sub(regex, replacement, text)
    return text


def expand_numbers(text):
    return normalize_numbers(text)


def lowercase(text):
    return text.lower()


def collapse_whitespace(text):
    return re.sub(_whitespace_re, ' ', text)


def convert_to_ascii(text):
    return unidecode(text)


def basic_cleaners(text):
    '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def transliteration_cleaners(text):
    '''Pipeline for non-English text that transliterates to ASCII.'''
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def english_cleaners(text):
    '''Pipeline for English text, including number and abbreviation expansion.'''
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = expand_numbers(text)
    text = expand_abbreviations(text)
    text = collapse_whitespace(text)
    return text
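A small illustration (not from the diff) of what the pipelines above do to raw text; the exact output depends on the abbreviation table and number normalizer, so treat it as approximate:

# english_cleaners: ASCII-fold, lowercase, expand numbers and the
# abbreviations listed above, then collapse runs of whitespace.
print(english_cleaners('Mr.  Müller met   Dr. Jones.'))
# expected (approximately): 'mister muller met doctor jones.'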
modelscope/models/audio/tts/models/datasets/units/ling_unit.py (395 lines changed, Normal file)
@@ -0,0 +1,395 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
import abc
|
||||
import codecs
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
|
||||
import json
|
||||
import numpy as np
|
||||
|
||||
from . import cleaners as cleaners
|
||||
|
||||
# Regular expression matching text enclosed in curly braces:
|
||||
_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
|
||||
|
||||
|
||||
def _clean_text(text, cleaner_names):
|
||||
for name in cleaner_names:
|
||||
cleaner = getattr(cleaners, name)
|
||||
if not cleaner:
|
||||
raise Exception(
|
||||
'modelscope error: configuration cleaner unknown: %s' % name)
|
||||
text = cleaner(text)
|
||||
return text
|
||||
|
||||
|
||||
class LinguisticBaseUnit(abc.ABC):
|
||||
|
||||
def set_config_params(self, config_params):
|
||||
self.config_params = config_params
|
||||
|
||||
def save(self, config, config_name, path):
|
||||
t_path = os.path.join(path, config_name)
|
||||
if config != t_path:
|
||||
os.makedirs(path, exist_ok=True)
|
||||
shutil.copyfile(config, os.path.join(path, config_name))
|
||||
|
||||
|
||||
class KanTtsLinguisticUnit(LinguisticBaseUnit):
|
||||
|
||||
def __init__(self, config, path, has_mask=True):
|
||||
super(KanTtsLinguisticUnit, self).__init__()
|
||||
|
||||
# special symbol
|
||||
self._pad = '_'
|
||||
self._eos = '~'
|
||||
self._mask = '@[MASK]'
|
||||
self._has_mask = has_mask
|
||||
self._unit_config = config
|
||||
self._path = path
|
||||
|
||||
self._cleaner_names = [
|
||||
x.strip() for x in self._unit_config['cleaners'].split(',')
|
||||
]
|
||||
self._lfeat_type_list = self._unit_config['lfeat_type_list'].strip(
|
||||
).split(',')
|
||||
|
||||
self.build()
|
||||
|
||||
def get_unit_size(self):
|
||||
ling_unit_size = {}
|
||||
ling_unit_size['sy'] = len(self.sy)
|
||||
ling_unit_size['tone'] = len(self.tone)
|
||||
ling_unit_size['syllable_flag'] = len(self.syllable_flag)
|
||||
ling_unit_size['word_segment'] = len(self.word_segment)
|
||||
|
||||
if 'emo_category' in self._lfeat_type_list:
|
||||
ling_unit_size['emotion'] = len(self.emo_category)
|
||||
if 'speaker_category' in self._lfeat_type_list:
|
||||
ling_unit_size['speaker'] = len(self.speaker)
|
||||
|
||||
return ling_unit_size
|
||||
|
||||
def build(self):
|
||||
|
||||
self._sub_unit_dim = {}
|
||||
self._sub_unit_pad = {}
|
||||
# sy sub-unit
|
||||
_characters = ''
|
||||
|
||||
_ch_symbols = []
|
||||
|
||||
sy_path = os.path.join(self._path, self._unit_config['sy'])
|
||||
f = codecs.open(sy_path, 'r')
|
||||
for line in f:
|
||||
line = line.strip('\r\n')
|
||||
_ch_symbols.append(line)
|
||||
|
||||
_arpabet = ['@' + s for s in _ch_symbols]
|
||||
|
||||
# Export all symbols:
|
||||
self.sy = list(_characters) + _arpabet + [self._pad, self._eos]
|
||||
if self._has_mask:
|
||||
self.sy.append(self._mask)
|
||||
self._sy_to_id = {s: i for i, s in enumerate(self.sy)}
|
||||
self._id_to_sy = {i: s for i, s in enumerate(self.sy)}
|
||||
self._sub_unit_dim['sy'] = len(self.sy)
|
||||
self._sub_unit_pad['sy'] = self._sy_to_id['_']
|
||||
|
||||
# tone sub-unit
|
||||
_characters = ''
|
||||
|
||||
_ch_tones = []
|
||||
|
||||
tone_path = os.path.join(self._path, self._unit_config['tone'])
|
||||
f = codecs.open(tone_path, 'r')
|
||||
for line in f:
|
||||
line = line.strip('\r\n')
|
||||
_ch_tones.append(line)
|
||||
|
||||
# Export all tones:
|
||||
self.tone = list(_characters) + _ch_tones + [self._pad, self._eos]
|
||||
if self._has_mask:
|
||||
self.tone.append(self._mask)
|
||||
self._tone_to_id = {s: i for i, s in enumerate(self.tone)}
|
||||
self._id_to_tone = {i: s for i, s in enumerate(self.tone)}
|
||||
self._sub_unit_dim['tone'] = len(self.tone)
|
||||
self._sub_unit_pad['tone'] = self._tone_to_id['_']
|
||||
|
||||
# syllable flag sub-unit
|
||||
_characters = ''
|
||||
|
||||
_ch_syllable_flags = []
|
||||
|
||||
sy_flag_path = os.path.join(self._path,
|
||||
self._unit_config['syllable_flag'])
|
||||
f = codecs.open(sy_flag_path, 'r')
|
||||
for line in f:
|
||||
line = line.strip('\r\n')
|
||||
_ch_syllable_flags.append(line)
|
||||
|
||||
# Export all syllable_flags:
|
||||
self.syllable_flag = list(_characters) + _ch_syllable_flags + [
|
||||
self._pad, self._eos
|
||||
]
|
||||
if self._has_mask:
|
||||
self.syllable_flag.append(self._mask)
|
||||
self._syllable_flag_to_id = {
|
||||
s: i
|
||||
for i, s in enumerate(self.syllable_flag)
|
||||
}
|
||||
self._id_to_syllable_flag = {
|
||||
i: s
|
||||
for i, s in enumerate(self.syllable_flag)
|
||||
}
|
||||
self._sub_unit_dim['syllable_flag'] = len(self.syllable_flag)
|
||||
self._sub_unit_pad['syllable_flag'] = self._syllable_flag_to_id['_']
|
||||
|
||||
# word segment sub-unit
|
||||
_characters = ''
|
||||
|
||||
_ch_word_segments = []
|
||||
|
||||
ws_path = os.path.join(self._path, self._unit_config['word_segment'])
|
||||
f = codecs.open(ws_path, 'r')
|
||||
for line in f:
|
||||
line = line.strip('\r\n')
|
||||
_ch_word_segments.append(line)
|
||||
|
||||
# Export all syllable_flags:
|
||||
self.word_segment = list(_characters) + _ch_word_segments + [
|
||||
self._pad, self._eos
|
||||
]
|
||||
if self._has_mask:
|
||||
self.word_segment.append(self._mask)
|
||||
self._word_segment_to_id = {
|
||||
s: i
|
||||
for i, s in enumerate(self.word_segment)
|
||||
}
|
||||
self._id_to_word_segment = {
|
||||
i: s
|
||||
for i, s in enumerate(self.word_segment)
|
||||
}
|
||||
self._sub_unit_dim['word_segment'] = len(self.word_segment)
|
||||
self._sub_unit_pad['word_segment'] = self._word_segment_to_id['_']
|
||||
|
||||
if 'emo_category' in self._lfeat_type_list:
|
||||
# emotion category sub-unit
|
||||
_characters = ''
|
||||
|
||||
_ch_emo_types = []
|
||||
|
||||
emo_path = os.path.join(self._path,
|
||||
self._unit_config['emo_category'])
|
||||
f = codecs.open(emo_path, 'r')
|
||||
for line in f:
|
||||
line = line.strip('\r\n')
|
||||
_ch_emo_types.append(line)
|
||||
|
||||
self.emo_category = list(_characters) + _ch_emo_types + [
|
||||
self._pad, self._eos
|
||||
]
|
||||
if self._has_mask:
|
||||
self.emo_category.append(self._mask)
|
||||
self._emo_category_to_id = {
|
||||
s: i
|
||||
for i, s in enumerate(self.emo_category)
|
||||
}
|
||||
self._id_to_emo_category = {
|
||||
i: s
|
||||
for i, s in enumerate(self.emo_category)
|
||||
}
|
||||
self._sub_unit_dim['emo_category'] = len(self.emo_category)
|
||||
self._sub_unit_pad['emo_category'] = self._emo_category_to_id['_']
|
||||
|
||||
if 'speaker_category' in self._lfeat_type_list:
|
||||
# speaker category sub-unit
|
||||
_characters = ''
|
||||
|
||||
_ch_speakers = []
|
||||
|
||||
speaker_path = os.path.join(self._path,
|
||||
self._unit_config['speaker_category'])
|
||||
f = codecs.open(speaker_path, 'r')
|
||||
for line in f:
|
||||
line = line.strip('\r\n')
|
||||
_ch_speakers.append(line)
|
||||
|
||||
# Export all syllable_flags:
|
||||
self.speaker = list(_characters) + _ch_speakers + [
|
||||
self._pad, self._eos
|
||||
]
|
||||
if self._has_mask:
|
||||
self.speaker.append(self._mask)
|
||||
self._speaker_to_id = {s: i for i, s in enumerate(self.speaker)}
|
||||
self._id_to_speaker = {i: s for i, s in enumerate(self.speaker)}
|
||||
self._sub_unit_dim['speaker_category'] = len(self._speaker_to_id)
|
||||
self._sub_unit_pad['speaker_category'] = self._speaker_to_id['_']
|
||||
|
||||
def encode_symbol_sequence(self, lfeat_symbol):
|
||||
lfeat_symbol = lfeat_symbol.strip().split(' ')
|
||||
|
||||
lfeat_symbol_separate = [''] * int(len(self._lfeat_type_list))
|
||||
for this_lfeat_symbol in lfeat_symbol:
|
||||
this_lfeat_symbol = this_lfeat_symbol.strip('{').strip('}').split(
|
||||
'$')
|
||||
index = 0
|
||||
while index < len(lfeat_symbol_separate):
|
||||
lfeat_symbol_separate[index] = lfeat_symbol_separate[
|
||||
index] + this_lfeat_symbol[index] + ' '
|
||||
index = index + 1
|
||||
|
||||
input_and_label_data = []
|
||||
index = 0
|
||||
while index < len(self._lfeat_type_list):
|
||||
sequence = self.encode_sub_unit(
|
||||
lfeat_symbol_separate[index].strip(),
|
||||
self._lfeat_type_list[index])
|
||||
sequence_array = np.asarray(sequence, dtype=np.int32)
|
||||
input_and_label_data.append(sequence_array)
|
||||
index = index + 1
|
||||
|
||||
return input_and_label_data
|
||||
|
||||
def decode_symbol_sequence(self, sequence):
|
||||
result = []
|
||||
for i, lfeat_type in enumerate(self._lfeat_type_list):
|
||||
s = ''
|
||||
sequence_item = sequence[i].tolist()
|
||||
if lfeat_type == 'sy':
|
||||
s = self.decode_sy(sequence_item)
|
||||
elif lfeat_type == 'tone':
|
||||
s = self.decode_tone(sequence_item)
|
||||
elif lfeat_type == 'syllable_flag':
|
||||
s = self.decode_syllable_flag(sequence_item)
|
||||
elif lfeat_type == 'word_segment':
|
||||
s = self.decode_word_segment(sequence_item)
|
||||
elif lfeat_type == 'emo_category':
|
||||
s = self.decode_emo_category(sequence_item)
|
||||
elif lfeat_type == 'speaker_category':
|
||||
s = self.decode_speaker_category(sequence_item)
|
||||
else:
|
||||
raise Exception(
|
||||
'modelscope error: configuration lfeat type(%s) unknown.'
|
||||
% lfeat_type)
|
||||
result.append('%s:%s' % (lfeat_type, s))
|
||||
|
||||
return result
|
||||
|
||||
def encode_sub_unit(self, this_lfeat_symbol, lfeat_type):
|
||||
sequence = []
|
||||
if lfeat_type == 'sy':
|
||||
this_lfeat_symbol = this_lfeat_symbol.strip().split(' ')
|
||||
this_lfeat_symbol_format = ''
|
||||
index = 0
|
||||
while index < len(this_lfeat_symbol):
|
||||
this_lfeat_symbol_format = this_lfeat_symbol_format + '{' + this_lfeat_symbol[
|
||||
index] + '}' + ' '
|
||||
index = index + 1
|
||||
sequence = self.encode_text(this_lfeat_symbol_format,
|
||||
self._cleaner_names)
|
||||
elif lfeat_type == 'tone':
|
||||
sequence = self.encode_tone(this_lfeat_symbol)
|
||||
elif lfeat_type == 'syllable_flag':
|
||||
sequence = self.encode_syllable_flag(this_lfeat_symbol)
|
||||
elif lfeat_type == 'word_segment':
|
||||
sequence = self.encode_word_segment(this_lfeat_symbol)
|
||||
elif lfeat_type == 'emo_category':
|
||||
sequence = self.encode_emo_category(this_lfeat_symbol)
|
||||
elif lfeat_type == 'speaker_category':
|
||||
sequence = self.encode_speaker_category(this_lfeat_symbol)
|
||||
else:
|
||||
raise Exception(
|
||||
'modelscope error: configuration lfeat type(%s) unknown.'
|
||||
% lfeat_type)
|
||||
|
||||
return sequence
|
||||
|
||||
def encode_text(self, text, cleaner_names):
|
||||
sequence = []
|
||||
|
||||
# Check for curly braces and treat their contents as ARPAbet:
|
||||
while len(text):
|
||||
m = _curly_re.match(text)
|
||||
if not m:
|
||||
sequence += self.encode_sy(_clean_text(text, cleaner_names))
|
||||
break
|
||||
sequence += self.encode_sy(_clean_text(m.group(1), cleaner_names))
|
||||
sequence += self.encode_arpanet(m.group(2))
|
||||
text = m.group(3)
|
||||
|
||||
# Append EOS token
|
||||
sequence.append(self._sy_to_id['~'])
|
||||
return sequence
|
||||
|
||||
def encode_sy(self, sy):
|
||||
return [self._sy_to_id[s] for s in sy if self.should_keep_sy(s)]
|
||||
|
||||
def decode_sy(self, id):
|
||||
s = self._id_to_sy[id]
|
||||
if len(s) > 1 and s[0] == '@':
|
||||
s = s[1:]
|
||||
return s
|
||||
|
||||
def should_keep_sy(self, s):
|
||||
return s in self._sy_to_id and s != '_' and s != '~'
|
||||
|
||||
def encode_arpanet(self, text):
|
||||
return self.encode_sy(['@' + s for s in text.split()])
|
||||
|
||||
def encode_tone(self, tone):
|
||||
tones = tone.strip().split(' ')
|
||||
sequence = []
|
||||
for this_tone in tones:
|
||||
sequence.append(self._tone_to_id[this_tone])
|
||||
sequence.append(self._tone_to_id['~'])
|
||||
return sequence
|
||||
|
||||
def decode_tone(self, id):
|
||||
return self._id_to_tone[id]
|
||||
|
||||
def encode_syllable_flag(self, syllable_flag):
|
||||
syllable_flags = syllable_flag.strip().split(' ')
|
||||
sequence = []
|
||||
for this_syllable_flag in syllable_flags:
|
||||
sequence.append(self._syllable_flag_to_id[this_syllable_flag])
|
||||
sequence.append(self._syllable_flag_to_id['~'])
|
||||
return sequence
|
||||
|
||||
def decode_syllable_flag(self, id):
|
||||
return self._id_to_syllable_flag[id]
|
||||
|
||||
def encode_word_segment(self, word_segment):
|
||||
word_segments = word_segment.strip().split(' ')
|
||||
sequence = []
|
||||
for this_word_segment in word_segments:
|
||||
sequence.append(self._word_segment_to_id[this_word_segment])
|
||||
sequence.append(self._word_segment_to_id['~'])
|
||||
return sequence
|
||||
|
||||
def decode_word_segment(self, id):
|
||||
return self._id_to_word_segment[id]
|
||||
|
||||
def encode_emo_category(self, emo_type):
|
||||
emo_categories = emo_type.strip().split(' ')
|
||||
sequence = []
|
||||
for this_category in emo_categories:
|
||||
sequence.append(self._emo_category_to_id[this_category])
|
||||
sequence.append(self._emo_category_to_id['~'])
|
||||
return sequence
|
||||
|
||||
def decode_emo_category(self, id):
|
||||
return self._id_to_emo_category[id]
|
||||
|
||||
def encode_speaker_category(self, speaker):
|
||||
speakers = speaker.strip().split(' ')
|
||||
sequence = []
|
||||
for this_speaker in speakers:
|
||||
sequence.append(self._speaker_to_id[this_speaker])
|
||||
sequence.append(self._speaker_to_id['~'])
|
||||
return sequence
|
||||
|
||||
def decode_speaker_category(self, id):
|
||||
return self._id_to_speaker[id]
|
||||
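For orientation (not part of the diff), a rough sketch of the input format that encode_symbol_sequence above consumes; the concrete symbol names and the lfeat_type_list value are hypothetical and must exist in the unit vocabularies loaded by build():

# Hypothetical lfeat_type_list = 'sy,tone,syllable_flag,word_segment':
# every token is a curly-braced group of '$'-separated sub-unit symbols,
# one field per lfeat type, in the order of lfeat_type_list.
lfeat_symbol = '{ni$2$s_begin$word_begin} {hao$3$s_end$word_end}'

# The call regroups the fields into one stream per lfeat type, maps each
# stream to integer ids, and appends the '~' end-of-sequence id to each.
streams = ling_unit.encode_symbol_sequence(lfeat_symbol)
# streams -> list of np.int32 arrays, one per lfeat type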
modelscope/models/audio/tts/text/numbers.py → modelscope/models/audio/tts/models/datasets/units/numbers.py (3 lines changed, Executable file → Normal file)
@@ -1,3 +1,6 @@
# The implementation is adopted from tacotron,
# made publicly available under the MIT License at https://github.com/keithito/tacotron

import re

import inflect
@@ -1,273 +0,0 @@
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
def build_sequence_mask(sequence_length,
|
||||
maximum_length=None,
|
||||
dtype=tf.float32):
|
||||
"""Builds the dot product mask.
|
||||
|
||||
Args:
|
||||
sequence_length: The sequence length.
|
||||
maximum_length: Optional size of the returned time dimension. Otherwise
|
||||
it is the maximum of :obj:`sequence_length`.
|
||||
dtype: The type of the mask tensor.
|
||||
|
||||
Returns:
|
||||
A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape
|
||||
``[batch_size, max_length]``.
|
||||
"""
|
||||
mask = tf.sequence_mask(
|
||||
sequence_length, maxlen=maximum_length, dtype=dtype)
|
||||
|
||||
return mask
|
||||
|
||||
|
||||
def norm(inputs):
|
||||
"""Layer normalizes :obj:`inputs`."""
|
||||
return tf.contrib.layers.layer_norm(inputs, begin_norm_axis=-1)
|
||||
|
||||
|
||||
def pad_in_time(x, padding_shape):
|
||||
"""Helper function to pad a tensor in the time dimension and retain the static depth dimension.
|
||||
|
||||
Agrs:
|
||||
x: [Batch, Time, Frequency]
|
||||
padding_length: padding size of constant value (0) before the time dimension
|
||||
|
||||
return:
|
||||
padded x
|
||||
"""
|
||||
|
||||
depth = x.get_shape().as_list()[-1]
|
||||
x = tf.pad(x, [[0, 0], padding_shape, [0, 0]])
|
||||
x.set_shape((None, None, depth))
|
||||
|
||||
return x
|
||||
|
||||
|
||||
def pad_in_time_right(x, padding_length):
|
||||
"""Helper function to pad a tensor in the time dimension and retain the static depth dimension.
|
||||
|
||||
Agrs:
|
||||
x: [Batch, Time, Frequency]
|
||||
padding_length: padding size of constant value (0) before the time dimension
|
||||
|
||||
return:
|
||||
padded x
|
||||
"""
|
||||
depth = x.get_shape().as_list()[-1]
|
||||
x = tf.pad(x, [[0, 0], [0, padding_length], [0, 0]])
|
||||
x.set_shape((None, None, depth))
|
||||
|
||||
return x
|
||||
|
||||
|
||||
def feed_forward(x, ffn_dim, memory_units, mode, dropout=0.0):
|
||||
"""Implements the Transformer's "Feed Forward" layer.
|
||||
|
||||
.. math::
|
||||
|
||||
ffn(x) = max(0, x*W_1 + b_1)*W_2
|
||||
|
||||
Args:
|
||||
x: The input.
|
||||
ffn_dim: The number of units of the nonlinear transformation.
|
||||
memory_units: the number of units of linear transformation
|
||||
mode: A ``tf.estimator.ModeKeys`` mode.
|
||||
dropout: The probability to drop units from the inner transformation.
|
||||
|
||||
Returns:
|
||||
The transformed input.
|
||||
"""
|
||||
inner = tf.layers.conv1d(x, ffn_dim, 1, activation=tf.nn.relu)
|
||||
inner = tf.layers.dropout(
|
||||
inner, rate=dropout, training=mode == tf.estimator.ModeKeys.TRAIN)
|
||||
outer = tf.layers.conv1d(inner, memory_units, 1, use_bias=False)
|
||||
|
||||
return outer
|
||||
|
||||
|
||||
def drop_and_add(inputs, outputs, mode, dropout=0.0):
|
||||
"""Drops units in the outputs and adds the previous values.
|
||||
|
||||
Args:
|
||||
inputs: The input of the previous layer.
|
||||
outputs: The output of the previous layer.
|
||||
mode: A ``tf.estimator.ModeKeys`` mode.
|
||||
dropout: The probability to drop units in :obj:`outputs`.
|
||||
|
||||
Returns:
|
||||
The residual and normalized output.
|
||||
"""
|
||||
outputs = tf.layers.dropout(outputs, rate=dropout, training=mode)
|
||||
|
||||
input_dim = inputs.get_shape().as_list()[-1]
|
||||
output_dim = outputs.get_shape().as_list()[-1]
|
||||
|
||||
if input_dim == output_dim:
|
||||
outputs += inputs
|
||||
|
||||
return outputs
|
||||
|
||||
|
||||
def MemoryBlock(
|
||||
inputs,
|
||||
filter_size,
|
||||
mode,
|
||||
mask=None,
|
||||
dropout=0.0,
|
||||
):
|
||||
"""
|
||||
Define the bidirectional memory block in FSMN
|
||||
|
||||
Agrs:
|
||||
inputs: The output of the previous layer. [Batch, Time, Frequency]
|
||||
filter_size: memory block filter size
|
||||
mode: Training or Evaluation
|
||||
mask: A ``tf.Tensor`` applied to the memory block output
|
||||
|
||||
return:
|
||||
output: 3-D tensor ([Batch, Time, Frequency])
|
||||
"""
|
||||
static_shape = inputs.get_shape().as_list()
|
||||
depth = static_shape[-1]
|
||||
inputs = tf.expand_dims(inputs, axis=1) # [Batch, 1, Time, Frequency]
|
||||
depthwise_filter = tf.get_variable(
|
||||
'depth_conv_w',
|
||||
shape=[1, filter_size, depth, 1],
|
||||
initializer=tf.glorot_uniform_initializer(),
|
||||
dtype=tf.float32)
|
||||
memory = tf.nn.depthwise_conv2d(
|
||||
input=inputs,
|
||||
filter=depthwise_filter,
|
||||
strides=[1, 1, 1, 1],
|
||||
padding='SAME',
|
||||
rate=[1, 1],
|
||||
data_format='NHWC')
|
||||
memory = memory + inputs
|
||||
output = tf.layers.dropout(memory, rate=dropout, training=mode)
|
||||
output = tf.reshape(
|
||||
output,
|
||||
[tf.shape(output)[0], tf.shape(output)[2], depth])
|
||||
if mask is not None:
|
||||
output = output * tf.expand_dims(mask, -1)
|
||||
|
||||
return output
|
||||
|
||||
|
||||
def MemoryBlockV2(
|
||||
inputs,
|
||||
filter_size,
|
||||
mode,
|
||||
shift=0,
|
||||
mask=None,
|
||||
dropout=0.0,
|
||||
):
|
||||
"""
|
||||
Define the bidirectional memory block in FSMN
|
||||
|
||||
Agrs:
|
||||
inputs: The output of the previous layer. [Batch, Time, Frequency]
|
||||
filter_size: memory block filter size
|
||||
mode: Training or Evaluation
|
||||
shift: left padding, to control delay
|
||||
mask: A ``tf.Tensor`` applied to the memory block output
|
||||
|
||||
return:
|
||||
output: 3-D tensor ([Batch, Time, Frequency])
|
||||
"""
|
||||
if mask is not None:
|
||||
inputs = inputs * tf.expand_dims(mask, -1)
|
||||
|
||||
static_shape = inputs.get_shape().as_list()
|
||||
depth = static_shape[-1]
|
||||
# padding
|
||||
left_padding = int(round((filter_size - 1) / 2))
|
||||
right_padding = int((filter_size - 1) / 2)
|
||||
if shift > 0:
|
||||
left_padding = left_padding + shift
|
||||
right_padding = right_padding - shift
|
||||
pad_inputs = pad_in_time(inputs, [left_padding, right_padding])
|
||||
pad_inputs = tf.expand_dims(
|
||||
pad_inputs, axis=1) # [Batch, 1, Time, Frequency]
|
||||
depthwise_filter = tf.get_variable(
|
||||
'depth_conv_w',
|
||||
shape=[1, filter_size, depth, 1],
|
||||
initializer=tf.glorot_uniform_initializer(),
|
||||
dtype=tf.float32)
|
||||
memory = tf.nn.depthwise_conv2d(
|
||||
input=pad_inputs,
|
||||
filter=depthwise_filter,
|
||||
strides=[1, 1, 1, 1],
|
||||
padding='VALID',
|
||||
rate=[1, 1],
|
||||
data_format='NHWC')
|
||||
memory = tf.reshape(
|
||||
memory,
|
||||
[tf.shape(memory)[0], tf.shape(memory)[2], depth])
|
||||
memory = memory + inputs
|
||||
output = tf.layers.dropout(memory, rate=dropout, training=mode)
|
||||
if mask is not None:
|
||||
output = output * tf.expand_dims(mask, -1)
|
||||
|
||||
return output
|
||||
|
||||
|
||||
def UniMemoryBlock(
|
||||
inputs,
|
||||
filter_size,
|
||||
mode,
|
||||
cache=None,
|
||||
mask=None,
|
||||
dropout=0.0,
|
||||
):
|
||||
"""
|
||||
Define the unidirectional memory block in FSMN
|
||||
|
||||
Agrs:
|
||||
inputs: The output of the previous layer. [Batch, Time, Frequency]
|
||||
filter_size: memory block filter size
|
||||
cache: for streaming inference
|
||||
mode: Training or Evaluation
|
||||
mask: A ``tf.Tensor`` applied to the memory block output
|
||||
dropout: dorpout factor
|
||||
return:
|
||||
output: 3-D tensor ([Batch, Time, Frequency])
|
||||
"""
|
||||
if cache is not None:
|
||||
static_shape = cache['queries'].get_shape().as_list()
|
||||
depth = static_shape[-1]
|
||||
queries = tf.slice(cache['queries'], [0, 1, 0], [
|
||||
tf.shape(cache['queries'])[0],
|
||||
tf.shape(cache['queries'])[1] - 1, depth
|
||||
])
|
||||
queries = tf.concat([queries, inputs], axis=1)
|
||||
cache['queries'] = queries
|
||||
else:
|
||||
padding_length = filter_size - 1
|
||||
queries = pad_in_time(inputs, [padding_length, 0])
|
||||
|
||||
queries = tf.expand_dims(queries, axis=1) # [Batch, 1, Time, Frequency]
|
||||
static_shape = queries.get_shape().as_list()
|
||||
depth = static_shape[-1]
|
||||
depthwise_filter = tf.get_variable(
|
||||
'depth_conv_w',
|
||||
shape=[1, filter_size, depth, 1],
|
||||
initializer=tf.glorot_uniform_initializer(),
|
||||
dtype=tf.float32)
|
||||
memory = tf.nn.depthwise_conv2d(
|
||||
input=queries,
|
||||
filter=depthwise_filter,
|
||||
strides=[1, 1, 1, 1],
|
||||
padding='VALID',
|
||||
rate=[1, 1],
|
||||
data_format='NHWC')
|
||||
memory = tf.reshape(
|
||||
memory,
|
||||
[tf.shape(memory)[0], tf.shape(memory)[2], depth])
|
||||
memory = memory + inputs
|
||||
output = tf.layers.dropout(memory, rate=dropout, training=mode)
|
||||
if mask is not None:
|
||||
output = output * tf.expand_dims(mask, -1)
|
||||
|
||||
return output
|
||||
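Since this commit moves the AM to PyTorch, a rough PyTorch equivalent of the FSMN memory blocks above (a depthwise convolution over time plus a residual connection) may help; this is an illustrative sketch assuming an odd filter_size, not code from the diff:

import torch
import torch.nn.functional as F

def memory_block(x, filter_size, weight, mask=None, dropout=0.0, training=True):
    """x: [B, T, D]; weight: depthwise kernel of shape [D, 1, filter_size]."""
    if mask is not None:
        x = x * mask.unsqueeze(-1)
    h = x.transpose(1, 2)                       # [B, D, T] for conv1d
    pad = (filter_size - 1) // 2                # SAME padding for odd kernels
    mem = F.conv1d(h, weight, padding=pad, groups=h.size(1))  # depthwise over time
    mem = mem.transpose(1, 2) + x               # residual connection, back to [B, T, D]
    out = F.dropout(mem, p=dropout, training=training)
    if mask is not None:
        out = out * mask.unsqueeze(-1)
    return out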
@@ -1,178 +0,0 @@
|
||||
import tensorflow as tf
|
||||
|
||||
from . import fsmn
|
||||
|
||||
|
||||
class FsmnEncoder():
|
||||
"""Encoder using Fsmn
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
filter_size,
|
||||
fsmn_num_layers,
|
||||
dnn_num_layers,
|
||||
num_memory_units=512,
|
||||
ffn_inner_dim=2048,
|
||||
dropout=0.0,
|
||||
position_encoder=None):
|
||||
"""Initializes the parameters of the encoder.
|
||||
|
||||
Args:
|
||||
filter_size: the total order of memory block
|
||||
fsmn_num_layers: The number of fsmn layers.
|
||||
dnn_num_layers: The number of dnn layers
|
||||
num_units: The number of memory units.
|
||||
ffn_inner_dim: The number of units of the inner linear transformation
|
||||
in the feed forward layer.
|
||||
dropout: The probability to drop units from the outputs.
|
||||
position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
|
||||
apply on inputs or ``None``.
|
||||
"""
|
||||
super(FsmnEncoder, self).__init__()
|
||||
self.filter_size = filter_size
|
||||
self.fsmn_num_layers = fsmn_num_layers
|
||||
self.dnn_num_layers = dnn_num_layers
|
||||
self.num_memory_units = num_memory_units
|
||||
self.ffn_inner_dim = ffn_inner_dim
|
||||
self.dropout = dropout
|
||||
self.position_encoder = position_encoder
|
||||
|
||||
def encode(self, inputs, sequence_length=None, mode=True):
|
||||
if self.position_encoder is not None:
|
||||
inputs = self.position_encoder(inputs)
|
||||
|
||||
inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
|
||||
|
||||
mask = fsmn.build_sequence_mask(
|
||||
sequence_length, maximum_length=tf.shape(inputs)[1])
|
||||
|
||||
state = ()
|
||||
|
||||
for layer in range(self.fsmn_num_layers):
|
||||
with tf.variable_scope('fsmn_layer_{}'.format(layer)):
|
||||
with tf.variable_scope('ffn'):
|
||||
context = fsmn.feed_forward(
|
||||
inputs,
|
||||
self.ffn_inner_dim,
|
||||
self.num_memory_units,
|
||||
mode,
|
||||
dropout=self.dropout)
|
||||
|
||||
with tf.variable_scope('memory'):
|
||||
memory = fsmn.MemoryBlock(
|
||||
context,
|
||||
self.filter_size,
|
||||
mode,
|
||||
mask=mask,
|
||||
dropout=self.dropout)
|
||||
|
||||
memory = fsmn.drop_and_add(
|
||||
inputs, memory, mode, dropout=self.dropout)
|
||||
|
||||
inputs = memory
|
||||
state += (tf.reduce_mean(inputs, axis=1), )
|
||||
|
||||
for layer in range(self.dnn_num_layers):
|
||||
with tf.variable_scope('dnn_layer_{}'.format(layer)):
|
||||
transformed = fsmn.feed_forward(
|
||||
inputs,
|
||||
self.ffn_inner_dim,
|
||||
self.num_memory_units,
|
||||
mode,
|
||||
dropout=self.dropout)
|
||||
|
||||
inputs = transformed
|
||||
state += (tf.reduce_mean(inputs, axis=1), )
|
||||
|
||||
outputs = inputs
|
||||
return (outputs, state, sequence_length)
|
||||
|
||||
|
||||
class FsmnEncoderV2():
|
||||
"""Encoder using Fsmn
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
filter_size,
|
||||
fsmn_num_layers,
|
||||
dnn_num_layers,
|
||||
num_memory_units=512,
|
||||
ffn_inner_dim=2048,
|
||||
dropout=0.0,
|
||||
shift=0,
|
||||
position_encoder=None):
|
||||
"""Initializes the parameters of the encoder.
|
||||
|
||||
Args:
|
||||
filter_size: The total order of the memory block.
fsmn_num_layers: The number of FSMN layers.
dnn_num_layers: The number of DNN layers.
num_memory_units: The number of memory units.
|
||||
ffn_inner_dim: The number of units of the inner linear transformation
|
||||
in the feed forward layer.
|
||||
dropout: The probability to drop units from the outputs.
|
||||
shift: Left padding of the memory block, used to control delay.
|
||||
position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
|
||||
apply on inputs or ``None``.
|
||||
"""
|
||||
super(FsmnEncoderV2, self).__init__()
|
||||
self.filter_size = filter_size
|
||||
self.fsmn_num_layers = fsmn_num_layers
|
||||
self.dnn_num_layers = dnn_num_layers
|
||||
self.num_memory_units = num_memory_units
|
||||
self.ffn_inner_dim = ffn_inner_dim
|
||||
self.dropout = dropout
|
||||
self.shift = shift
|
||||
if not isinstance(shift, list):
|
||||
self.shift = [shift for _ in range(self.fsmn_num_layers)]
|
||||
self.position_encoder = position_encoder
|
||||
|
||||
def encode(self, inputs, sequence_length=None, mode=True):
|
||||
if self.position_encoder is not None:
|
||||
inputs = self.position_encoder(inputs)
|
||||
|
||||
inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
|
||||
|
||||
mask = fsmn.build_sequence_mask(
|
||||
sequence_length, maximum_length=tf.shape(inputs)[1])
|
||||
|
||||
state = ()
|
||||
for layer in range(self.fsmn_num_layers):
|
||||
with tf.variable_scope('fsmn_layer_{}'.format(layer)):
|
||||
with tf.variable_scope('ffn'):
|
||||
context = fsmn.feed_forward(
|
||||
inputs,
|
||||
self.ffn_inner_dim,
|
||||
self.num_memory_units,
|
||||
mode,
|
||||
dropout=self.dropout)
|
||||
|
||||
with tf.variable_scope('memory'):
|
||||
memory = fsmn.MemoryBlockV2(
|
||||
context,
|
||||
self.filter_size,
|
||||
mode,
|
||||
shift=self.shift[layer],
|
||||
mask=mask,
|
||||
dropout=self.dropout)
|
||||
|
||||
memory = fsmn.drop_and_add(
|
||||
inputs, memory, mode, dropout=self.dropout)
|
||||
|
||||
inputs = memory
|
||||
state += (tf.reduce_mean(inputs, axis=1), )
|
||||
|
||||
for layer in range(self.dnn_num_layers):
|
||||
with tf.variable_scope('dnn_layer_{}'.format(layer)):
|
||||
transformed = fsmn.feed_forward(
|
||||
inputs,
|
||||
self.ffn_inner_dim,
|
||||
self.num_memory_units,
|
||||
mode,
|
||||
dropout=self.dropout)
|
||||
|
||||
inputs = transformed
|
||||
state += (tf.reduce_mean(inputs, axis=1), )
|
||||
|
||||
outputs = inputs
|
||||
return (outputs, state, sequence_length)
|
||||
@@ -1,159 +0,0 @@
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
class VarTestHelper(tf.contrib.seq2seq.Helper):
|
||||
|
||||
def __init__(self, batch_size, inputs, dim):
|
||||
with tf.name_scope('VarTestHelper'):
|
||||
self._batch_size = batch_size
|
||||
self._inputs = inputs
|
||||
self._dim = dim
|
||||
|
||||
num_steps = tf.shape(self._inputs)[1]
|
||||
self._lengths = tf.tile([num_steps], [self._batch_size])
|
||||
|
||||
self._inputs = tf.roll(inputs, shift=-1, axis=1)
|
||||
self._init_inputs = inputs[:, 0, :]
|
||||
|
||||
@property
|
||||
def batch_size(self):
|
||||
return self._batch_size
|
||||
|
||||
@property
|
||||
def sample_ids_shape(self):
|
||||
return tf.TensorShape([])
|
||||
|
||||
@property
|
||||
def sample_ids_dtype(self):
|
||||
return np.int32
|
||||
|
||||
def initialize(self, name=None):
|
||||
return (tf.tile([False], [self._batch_size]),
|
||||
_go_frames(self._batch_size, self._dim, self._init_inputs))
|
||||
|
||||
def sample(self, time, outputs, state, name=None):
|
||||
return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them
|
||||
|
||||
def next_inputs(self, time, outputs, state, sample_ids, name=None):
|
||||
with tf.name_scope('VarTestHelper'):
|
||||
finished = (time + 1 >= self._lengths)
|
||||
next_inputs = tf.concat([outputs, self._inputs[:, time, :]],
|
||||
axis=-1)
|
||||
return (finished, next_inputs, state)
|
||||
|
||||
|
||||
class VarTrainingHelper(tf.contrib.seq2seq.Helper):
|
||||
|
||||
def __init__(self, targets, inputs, dim):
|
||||
with tf.name_scope('VarTrainingHelper'):
|
||||
self._targets = targets # [N, T_in, 1]
|
||||
self._batch_size = tf.shape(inputs)[0] # N
|
||||
self._inputs = inputs
|
||||
self._dim = dim
|
||||
|
||||
num_steps = tf.shape(self._targets)[1]
|
||||
self._lengths = tf.tile([num_steps], [self._batch_size])
|
||||
|
||||
self._inputs = tf.roll(inputs, shift=-1, axis=1)
|
||||
self._init_inputs = inputs[:, 0, :]
|
||||
|
||||
@property
|
||||
def batch_size(self):
|
||||
return self._batch_size
|
||||
|
||||
@property
|
||||
def sample_ids_shape(self):
|
||||
return tf.TensorShape([])
|
||||
|
||||
@property
|
||||
def sample_ids_dtype(self):
|
||||
return np.int32
|
||||
|
||||
def initialize(self, name=None):
|
||||
return (tf.tile([False], [self._batch_size]),
|
||||
_go_frames(self._batch_size, self._dim, self._init_inputs))
|
||||
|
||||
def sample(self, time, outputs, state, name=None):
|
||||
return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them
|
||||
|
||||
def next_inputs(self, time, outputs, state, sample_ids, name=None):
|
||||
with tf.name_scope(name or 'VarTrainingHelper'):
|
||||
finished = (time + 1 >= self._lengths)
|
||||
next_inputs = tf.concat(
|
||||
[self._targets[:, time, :], self._inputs[:, time, :]], axis=-1)
|
||||
return (finished, next_inputs, state)
|
||||
|
||||
|
||||
class VarTrainingSSHelper(tf.contrib.seq2seq.Helper):
|
||||
|
||||
def __init__(self, targets, inputs, dim, global_step, schedule_begin,
|
||||
alpha, decay_steps):
|
||||
with tf.name_scope('VarTrainingSSHelper'):
|
||||
self._targets = targets # [N, T_in, 1]
|
||||
self._batch_size = tf.shape(inputs)[0] # N
|
||||
self._inputs = inputs
|
||||
self._dim = dim
|
||||
|
||||
num_steps = tf.shape(self._targets)[1]
|
||||
self._lengths = tf.tile([num_steps], [self._batch_size])
|
||||
|
||||
self._inputs = tf.roll(inputs, shift=-1, axis=1)
|
||||
self._init_inputs = inputs[:, 0, :]
|
||||
|
||||
# for schedule sampling
|
||||
self._global_step = global_step
|
||||
self._schedule_begin = schedule_begin
|
||||
self._alpha = alpha
|
||||
self._decay_steps = decay_steps
|
||||
|
||||
@property
|
||||
def batch_size(self):
|
||||
return self._batch_size
|
||||
|
||||
@property
|
||||
def sample_ids_shape(self):
|
||||
return tf.TensorShape([])
|
||||
|
||||
@property
|
||||
def sample_ids_dtype(self):
|
||||
return np.int32
|
||||
|
||||
def initialize(self, name=None):
|
||||
self._ratio = _tf_decay(self._global_step, self._schedule_begin,
|
||||
self._alpha, self._decay_steps)
|
||||
return (tf.tile([False], [self._batch_size]),
|
||||
_go_frames(self._batch_size, self._dim, self._init_inputs))
|
||||
|
||||
def sample(self, time, outputs, state, name=None):
|
||||
return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them
|
||||
|
||||
def next_inputs(self, time, outputs, state, sample_ids, name=None):
|
||||
with tf.name_scope(name or 'VarTrainingHelper'):
|
||||
finished = (time + 1 >= self._lengths)
|
||||
next_inputs_tmp = tf.cond(
|
||||
tf.less(
|
||||
tf.random_uniform([], minval=0, maxval=1,
|
||||
dtype=tf.float32), self._ratio),
|
||||
lambda: self._targets[:, time, :], lambda: outputs)
|
||||
next_inputs = tf.concat(
|
||||
[next_inputs_tmp, self._inputs[:, time, :]], axis=-1)
|
||||
return (finished, next_inputs, state)
|
||||
|
||||
|
||||
def _go_frames(batch_size, dim, init_inputs):
|
||||
'''Returns <GO> frames (zeros of the output dimension concatenated with the initial inputs) for a given batch size.'''
|
||||
return tf.concat([tf.tile([[0.0]], [batch_size, dim]), init_inputs],
|
||||
axis=-1)
|
||||
|
||||
|
||||
def _tf_decay(global_step, schedule_begin, alpha, decay_steps):
|
||||
tfr = tf.train.exponential_decay(
|
||||
1.0,
|
||||
global_step=global_step - schedule_begin,
|
||||
decay_steps=decay_steps,
|
||||
decay_rate=alpha,
|
||||
name='tfr_decay')
|
||||
final_tfr = tf.cond(
|
||||
tf.less(global_step, schedule_begin), lambda: 1.0, lambda: tfr)
|
||||
return final_tfr
|
||||
@@ -0,0 +1,3 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
from .hifigan import * # noqa F403
|
||||
238
modelscope/models/audio/tts/models/models/hifigan/hifigan.py
Executable file
@@ -0,0 +1,238 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
# Part of the implementation is borrowed from https://github.com/jik876/hifi-gan
|
||||
|
||||
from distutils.version import LooseVersion
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
|
||||
from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
|
||||
|
||||
from modelscope.models.audio.tts.models.utils import get_padding, init_weights
|
||||
from modelscope.utils.logger import get_logger
|
||||
|
||||
logger = get_logger()
|
||||
is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion('1.7')
|
||||
|
||||
|
||||
def stft(x, fft_size, hop_size, win_length, window):
|
||||
"""Perform STFT and convert to magnitude spectrogram.
|
||||
|
||||
Args:
|
||||
x (Tensor): Input signal tensor (B, T).
|
||||
fft_size (int): FFT size.
|
||||
hop_size (int): Hop size.
|
||||
win_length (int): Window length.
|
||||
window (str): Window function type.
|
||||
|
||||
Returns:
|
||||
Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
|
||||
|
||||
"""
|
||||
if is_pytorch_17plus:
|
||||
x_stft = torch.stft(
|
||||
x, fft_size, hop_size, win_length, window, return_complex=False)
|
||||
else:
|
||||
x_stft = torch.stft(x, fft_size, hop_size, win_length, window)
|
||||
real = x_stft[..., 0]
|
||||
imag = x_stft[..., 1]
|
||||
|
||||
# NOTE(kan-bayashi): clamp is needed to avoid nan or inf
|
||||
return torch.sqrt(torch.clamp(real**2 + imag**2, min=1e-7)).transpose(2, 1)
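# A minimal usage sketch for the helper above. The FFT parameters and batch
# shape are illustrative placeholders, not values from the shipped vocoder
# config; the window must be a tensor on the same device as the signal.
import torch

signal = torch.randn(2, 22050)                     # (B, T), e.g. one second at 22.05 kHz
window = torch.hann_window(1024)
mag = stft(signal, fft_size=1024, hop_size=256, win_length=1024, window=window)
print(mag.shape)                                   # (B, #frames, 1024 // 2 + 1)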
LRELU_SLOPE = 0.1
|
||||
|
||||
|
||||
def get_padding_casual(kernel_size, dilation=1):
|
||||
return int(kernel_size * dilation - dilation)
|
||||
|
||||
|
||||
class Conv1dCasual(torch.nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=1,
|
||||
padding=0,
|
||||
dilation=1,
|
||||
groups=1,
|
||||
bias=True,
|
||||
padding_mode='zeros'):
|
||||
super(Conv1dCasual, self).__init__()
|
||||
self.pad = padding
|
||||
self.conv1d = weight_norm(
|
||||
Conv1d(
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride,
|
||||
padding=0,
|
||||
dilation=dilation,
|
||||
groups=groups,
|
||||
bias=bias,
|
||||
padding_mode=padding_mode))
|
||||
self.conv1d.apply(init_weights)
|
||||
|
||||
def forward(self, x): # bdt
|
||||
# F.pad expects the padding sizes described starting from the last dimension and moving forward.
|
||||
x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), 'constant')
|
||||
x = self.conv1d(x)
|
||||
return x
|
||||
|
||||
def remove_weight_norm(self):
|
||||
remove_weight_norm(self.conv1d)
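# Because the padding is applied only on the left (get_padding_casual returns
# kernel_size * dilation - dilation), each output frame depends only on current
# and past inputs and the sequence length is preserved. A small shape check;
# the channel sizes below are arbitrary illustration values.
conv = Conv1dCasual(80, 256, kernel_size=7, padding=get_padding_casual(7))
x = torch.randn(4, 80, 100)        # (B, C_in, T)
print(conv(x).shape)               # torch.Size([4, 256, 100]) -- T unchanged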
class ConvTranspose1dCausal(torch.nn.Module):
|
||||
"""CausalConvTranspose1d module with customized initialization."""
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride,
|
||||
padding=0):
|
||||
"""Initialize CausalConvTranspose1d module."""
|
||||
super(ConvTranspose1dCausal, self).__init__()
|
||||
self.deconv = weight_norm(
|
||||
ConvTranspose1d(in_channels, out_channels, kernel_size, stride))
|
||||
self.stride = stride
|
||||
self.deconv.apply(init_weights)
|
||||
self.pad = kernel_size - stride
|
||||
|
||||
def forward(self, x):
|
||||
"""Calculate forward propagation.
|
||||
Args:
|
||||
x (Tensor): Input tensor (B, in_channels, T_in).
|
||||
Returns:
|
||||
Tensor: Output tensor (B, out_channels, T_out).
|
||||
"""
|
||||
# x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), "constant")
|
||||
return self.deconv(x)[:, :, :-self.pad]
|
||||
|
||||
def remove_weight_norm(self):
|
||||
remove_weight_norm(self.deconv)
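# The transposed convolution upsamples by `stride` and the trailing
# `kernel_size - stride` samples are trimmed, so the output length is exactly
# stride * T_in. A quick check with made-up sizes.
up = ConvTranspose1dCausal(256, 128, kernel_size=16, stride=8)
x = torch.randn(4, 256, 50)
print(up(x).shape)                 # torch.Size([4, 128, 400]) == 8 * 50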
class ResBlock1(torch.nn.Module):
|
||||
|
||||
def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
|
||||
super(ResBlock1, self).__init__()
|
||||
self.h = h
|
||||
self.convs1 = nn.ModuleList([
|
||||
Conv1dCasual(
|
||||
channels,
|
||||
channels,
|
||||
kernel_size,
|
||||
1,
|
||||
dilation=dilation[i],
|
||||
padding=get_padding_casual(kernel_size, dilation[i]))
|
||||
for i in range(len(dilation))
|
||||
])
|
||||
|
||||
self.convs2 = nn.ModuleList([
|
||||
Conv1dCasual(
|
||||
channels,
|
||||
channels,
|
||||
kernel_size,
|
||||
1,
|
||||
dilation=1,
|
||||
padding=get_padding_casual(kernel_size, 1))
|
||||
for i in range(len(dilation))
|
||||
])
|
||||
|
||||
def forward(self, x):
|
||||
for c1, c2 in zip(self.convs1, self.convs2):
|
||||
xt = F.leaky_relu(x, LRELU_SLOPE)
|
||||
xt = c1(xt)
|
||||
xt = F.leaky_relu(xt, LRELU_SLOPE)
|
||||
xt = c2(xt)
|
||||
x = xt + x
|
||||
return x
|
||||
|
||||
def remove_weight_norm(self):
|
||||
for layer in self.convs1:
|
||||
layer.remove_weight_norm()
|
||||
for layer in self.convs2:
|
||||
layer.remove_weight_norm()
|
||||
|
||||
|
||||
class Generator(torch.nn.Module):
|
||||
|
||||
def __init__(self, h):
|
||||
super(Generator, self).__init__()
|
||||
self.h = h
|
||||
self.num_kernels = len(h.resblock_kernel_sizes)
|
||||
self.num_upsamples = len(h.upsample_rates)
|
||||
logger.info('num_kernels={}, num_upsamples={}'.format(
|
||||
self.num_kernels, self.num_upsamples))
|
||||
self.conv_pre = Conv1dCasual(
|
||||
80, h.upsample_initial_channel, 7, 1, padding=7 - 1)
|
||||
resblock = ResBlock1 if h.resblock == '1' else ResBlock2
|
||||
|
||||
self.ups = nn.ModuleList()
|
||||
self.repeat_ups = nn.ModuleList()
|
||||
for i, (u, k) in enumerate(
|
||||
zip(h.upsample_rates, h.upsample_kernel_sizes)):
|
||||
upsample = nn.Sequential(
|
||||
nn.Upsample(mode='nearest', scale_factor=u),
|
||||
nn.LeakyReLU(LRELU_SLOPE),
|
||||
Conv1dCasual(
|
||||
h.upsample_initial_channel // (2**i),
|
||||
h.upsample_initial_channel // (2**(i + 1)),
|
||||
kernel_size=7,
|
||||
stride=1,
|
||||
padding=7 - 1))
|
||||
self.repeat_ups.append(upsample)
|
||||
self.ups.append(
|
||||
ConvTranspose1dCausal(
|
||||
h.upsample_initial_channel // (2**i),
|
||||
h.upsample_initial_channel // (2**(i + 1)),
|
||||
k,
|
||||
u,
|
||||
padding=(k - u) // 2))
|
||||
|
||||
self.resblocks = nn.ModuleList()
|
||||
for i in range(len(self.ups)):
|
||||
ch = h.upsample_initial_channel // (2**(i + 1))
|
||||
for j, (k, d) in enumerate(
|
||||
zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
|
||||
self.resblocks.append(resblock(h, ch, k, d))
|
||||
|
||||
self.conv_post = Conv1dCasual(ch, 1, 7, 1, padding=7 - 1)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.conv_pre(x)
|
||||
for i in range(self.num_upsamples):
|
||||
x = torch.sin(x) + x
|
||||
# transconv
|
||||
x1 = F.leaky_relu(x, LRELU_SLOPE)
|
||||
x1 = self.ups[i](x1)
|
||||
# repeat
|
||||
x2 = self.repeat_ups[i](x)
|
||||
x = x1 + x2
|
||||
xs = None
|
||||
for j in range(self.num_kernels):
|
||||
if xs is None:
|
||||
xs = self.resblocks[i * self.num_kernels + j](x)
|
||||
else:
|
||||
xs += self.resblocks[i * self.num_kernels + j](x)
|
||||
x = xs / self.num_kernels
|
||||
x = F.leaky_relu(x)
|
||||
x = self.conv_post(x)
|
||||
x = torch.tanh(x)
|
||||
return x
|
||||
|
||||
def remove_weight_norm(self):
|
||||
logger.info('Removing weight norm...')
|
||||
for layer in self.ups:
|
||||
layer.remove_weight_norm()
|
||||
for layer in self.repeat_ups:
|
||||
layer[-1].remove_weight_norm()
|
||||
for layer in self.resblocks:
|
||||
layer.remove_weight_norm()
|
||||
self.conv_pre.remove_weight_norm()
|
||||
self.conv_post.remove_weight_norm()
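# A hedged end-to-end sketch of driving the vocoder: it maps an 80-band mel
# spectrogram (B, 80, T) to a waveform of length T * prod(upsample_rates).
# The hyperparameter values below are illustrative placeholders, not the
# released configuration.
from types import SimpleNamespace
import torch

h = SimpleNamespace(
    resblock='1',
    upsample_rates=[8, 8, 2, 2],
    upsample_kernel_sizes=[16, 16, 4, 4],
    upsample_initial_channel=512,
    resblock_kernel_sizes=[3, 7, 11],
    resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]])

gen = Generator(h)
mel = torch.randn(1, 80, 120)      # (B, n_mels, frames)
wav = gen(mel)                     # (B, 1, 120 * 8 * 8 * 2 * 2)
gen.remove_weight_norm()           # typically called once before inference/export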
@@ -0,0 +1,3 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
from .kantts_sambert import * # noqa F403
|
||||
131
modelscope/models/audio/tts/models/models/sambert/adaptors.py
Normal file
@@ -0,0 +1,131 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from .base import Prenet
|
||||
from .fsmn import FsmnEncoderV2
|
||||
|
||||
|
||||
class LengthRegulator(nn.Module):
|
||||
|
||||
def __init__(self, r=1):
|
||||
super(LengthRegulator, self).__init__()
|
||||
|
||||
self.r = r
|
||||
|
||||
def forward(self, inputs, durations, masks=None):
|
||||
reps = (durations + 0.5).long()
|
||||
output_lens = reps.sum(dim=1)
|
||||
max_len = output_lens.max()
|
||||
reps_cumsum = torch.cumsum(
|
||||
F.pad(reps.float(), (1, 0, 0, 0), value=0.0), dim=1)[:, None, :]
|
||||
range_ = torch.arange(max_len).to(inputs.device)[None, :, None]
|
||||
mult = ((reps_cumsum[:, :, :-1] <= range_)
|
||||
& (reps_cumsum[:, :, 1:] > range_)) # yapf:disable
|
||||
mult = mult.float()
|
||||
out = torch.matmul(mult, inputs)
|
||||
|
||||
if masks is not None:
|
||||
out = out.masked_fill(masks.unsqueeze(-1), 0.0)
|
||||
|
||||
seq_len = out.size(1)
|
||||
padding = self.r - int(seq_len) % self.r
|
||||
if (padding < self.r):
|
||||
out = F.pad(
|
||||
out.transpose(1, 2), (0, padding, 0, 0, 0, 0), value=0.0)
|
||||
out = out.transpose(1, 2)
|
||||
|
||||
return out, output_lens
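# The regulator rounds each duration, builds a 0/1 alignment matrix with the
# cumulative-sum trick above, and expands phoneme-level features to frame level
# with a single matmul; the result is then padded to a multiple of r
# (outputs_per_step). A tiny worked example with made-up durations.
lr = LengthRegulator(r=1)
feats = torch.arange(6.0).view(1, 3, 2)       # 3 phonemes, 2-dim features
durs = torch.tensor([[2.0, 1.0, 3.0]])        # frames per phoneme
out, out_lens = lr(feats, durs)
print(out_lens)                               # tensor([6])
print(out.shape)                              # torch.Size([1, 6, 2]); rows repeat 2, 1, 3 times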
class VarRnnARPredictor(nn.Module):
|
||||
|
||||
def __init__(self, cond_units, prenet_units, rnn_units):
|
||||
super(VarRnnARPredictor, self).__init__()
|
||||
|
||||
self.prenet = Prenet(1, prenet_units)
|
||||
self.lstm = nn.LSTM(
|
||||
prenet_units[-1] + cond_units,
|
||||
rnn_units,
|
||||
num_layers=2,
|
||||
batch_first=True,
|
||||
bidirectional=False)
|
||||
self.fc = nn.Linear(rnn_units, 1)
|
||||
|
||||
def forward(self, inputs, cond, h=None, masks=None):
|
||||
x = torch.cat([self.prenet(inputs), cond], dim=-1)
|
||||
# The input can also be a packed variable length sequence,
|
||||
# here we just omit it for simplicity due to the mask and uni-directional lstm.
|
||||
x, h_new = self.lstm(x, h)
|
||||
|
||||
x = self.fc(x).squeeze(-1)
|
||||
x = F.relu(x)
|
||||
|
||||
if masks is not None:
|
||||
x = x.masked_fill(masks, 0.0)
|
||||
|
||||
return x, h_new
|
||||
|
||||
def infer(self, cond, masks=None):
|
||||
batch_size, length = cond.size(0), cond.size(1)
|
||||
|
||||
output = []
|
||||
x = torch.zeros((batch_size, 1)).to(cond.device)
|
||||
h = None
|
||||
|
||||
for i in range(length):
|
||||
x, h = self.forward(x.unsqueeze(1), cond[:, i:i + 1, :], h=h)
|
||||
output.append(x)
|
||||
|
||||
output = torch.cat(output, dim=-1)
|
||||
|
||||
if masks is not None:
|
||||
output = output.masked_fill(masks, 0.0)
|
||||
|
||||
return output
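# infer runs the predictor autoregressively, one position at a time, feeding
# back its own previous output as the next input; the variance adaptor
# interprets the result as log-durations. A minimal sketch with placeholder sizes.
pred = VarRnnARPredictor(cond_units=32, prenet_units=[16, 16], rnn_units=64)
cond = torch.randn(2, 7, 32)       # per-phoneme conditioning (B, T, cond_units)
log_durs = pred.infer(cond)        # (B, T), non-negative because of the ReLU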
class VarFsmnRnnNARPredictor(nn.Module):
|
||||
|
||||
def __init__(self, in_dim, filter_size, fsmn_num_layers, num_memory_units,
|
||||
ffn_inner_dim, dropout, shift, lstm_units):
|
||||
super(VarFsmnRnnNARPredictor, self).__init__()
|
||||
|
||||
self.fsmn = FsmnEncoderV2(filter_size, fsmn_num_layers, in_dim,
|
||||
num_memory_units, ffn_inner_dim, dropout,
|
||||
shift)
|
||||
self.blstm = nn.LSTM(
|
||||
num_memory_units,
|
||||
lstm_units,
|
||||
num_layers=1,
|
||||
batch_first=True,
|
||||
bidirectional=True)
|
||||
self.fc = nn.Linear(2 * lstm_units, 1)
|
||||
|
||||
def forward(self, inputs, masks=None):
|
||||
input_lengths = None
|
||||
if masks is not None:
|
||||
input_lengths = torch.sum((~masks).float(), dim=1).long()
|
||||
|
||||
x = self.fsmn(inputs, masks)
|
||||
|
||||
if input_lengths is not None:
|
||||
x = nn.utils.rnn.pack_padded_sequence(
|
||||
x,
|
||||
input_lengths.tolist(),
|
||||
batch_first=True,
|
||||
enforce_sorted=False)
|
||||
x, _ = self.blstm(x)
|
||||
x, _ = nn.utils.rnn.pad_packed_sequence(
|
||||
x, batch_first=True, total_length=inputs.size(1))
|
||||
else:
|
||||
x, _ = self.blstm(x)
|
||||
|
||||
x = self.fc(x).squeeze(-1)
|
||||
|
||||
if masks is not None:
|
||||
x = x.masked_fill(masks, 0.0)
|
||||
|
||||
return x
|
||||
369
modelscope/models/audio/tts/models/models/sambert/base.py
Normal file
@@ -0,0 +1,369 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
class ScaledDotProductAttention(nn.Module):
|
||||
""" Scaled Dot-Product Attention """
|
||||
|
||||
def __init__(self, temperature, dropatt=0.0):
|
||||
super().__init__()
|
||||
self.temperature = temperature
|
||||
self.softmax = nn.Softmax(dim=2)
|
||||
self.dropatt = nn.Dropout(dropatt)
|
||||
|
||||
def forward(self, q, k, v, mask=None):
|
||||
|
||||
attn = torch.bmm(q, k.transpose(1, 2))
|
||||
attn = attn / self.temperature
|
||||
|
||||
if mask is not None:
|
||||
attn = attn.masked_fill(mask, -np.inf)
|
||||
|
||||
attn = self.softmax(attn)
|
||||
attn = self.dropatt(attn)
|
||||
output = torch.bmm(attn, v)
|
||||
|
||||
return output, attn
|
||||
|
||||
|
||||
class Prenet(nn.Module):
|
||||
|
||||
def __init__(self, in_units, prenet_units, out_units=0):
|
||||
super(Prenet, self).__init__()
|
||||
|
||||
self.fcs = nn.ModuleList()
|
||||
for in_dim, out_dim in zip([in_units] + prenet_units[:-1],
|
||||
prenet_units):
|
||||
self.fcs.append(nn.Linear(in_dim, out_dim))
|
||||
self.fcs.append(nn.ReLU())
|
||||
self.fcs.append(nn.Dropout(0.5))
|
||||
|
||||
if (out_units):
|
||||
self.fcs.append(nn.Linear(prenet_units[-1], out_units))
|
||||
|
||||
def forward(self, input):
|
||||
output = input
|
||||
for layer in self.fcs:
|
||||
output = layer(output)
|
||||
return output
|
||||
|
||||
|
||||
class MultiHeadSelfAttention(nn.Module):
|
||||
""" Multi-Head SelfAttention module """
|
||||
|
||||
def __init__(self, n_head, d_in, d_model, d_head, dropout, dropatt=0.0):
|
||||
super().__init__()
|
||||
|
||||
self.n_head = n_head
|
||||
self.d_head = d_head
|
||||
self.d_in = d_in
|
||||
self.d_model = d_model
|
||||
|
||||
self.layer_norm = nn.LayerNorm(d_in, eps=1e-6)
|
||||
self.w_qkv = nn.Linear(d_in, 3 * n_head * d_head)
|
||||
|
||||
self.attention = ScaledDotProductAttention(
|
||||
temperature=np.power(d_head, 0.5), dropatt=dropatt)
|
||||
|
||||
self.fc = nn.Linear(n_head * d_head, d_model)
|
||||
|
||||
self.dropout = nn.Dropout(dropout)
|
||||
|
||||
def forward(self, input, mask=None):
|
||||
d_head, n_head = self.d_head, self.n_head
|
||||
|
||||
sz_b, len_in, _ = input.size()
|
||||
|
||||
residual = input
|
||||
|
||||
x = self.layer_norm(input)
|
||||
qkv = self.w_qkv(x)
|
||||
q, k, v = qkv.chunk(3, -1)
|
||||
|
||||
q = q.view(sz_b, len_in, n_head, d_head)
|
||||
k = k.view(sz_b, len_in, n_head, d_head)
|
||||
v = v.view(sz_b, len_in, n_head, d_head)
|
||||
|
||||
q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_in,
|
||||
d_head) # (n*b) x l x d
|
||||
k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_in,
|
||||
d_head) # (n*b) x l x d
|
||||
v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_in,
|
||||
d_head) # (n*b) x l x d
|
||||
|
||||
if mask is not None:
|
||||
mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x ..
|
||||
output, attn = self.attention(q, k, v, mask=mask)
|
||||
|
||||
output = output.view(n_head, sz_b, len_in, d_head)
|
||||
output = (output.permute(1, 2, 0,
|
||||
3).contiguous().view(sz_b, len_in,
|
||||
-1)) # b x l x (n*d)
|
||||
|
||||
output = self.dropout(self.fc(output))
|
||||
if (output.size(-1) == residual.size(-1)):
|
||||
output = output + residual
|
||||
|
||||
return output, attn
|
||||
|
||||
|
||||
class PositionwiseConvFeedForward(nn.Module):
|
||||
""" A two-feed-forward-layer module """
|
||||
|
||||
def __init__(self,
|
||||
d_in,
|
||||
d_hid,
|
||||
kernel_size=(3, 1),
|
||||
dropout_inner=0.1,
|
||||
dropout=0.1):
|
||||
super().__init__()
|
||||
# Use Conv1D
|
||||
# position-wise
|
||||
self.w_1 = nn.Conv1d(
|
||||
d_in,
|
||||
d_hid,
|
||||
kernel_size=kernel_size[0],
|
||||
padding=(kernel_size[0] - 1) // 2,
|
||||
)
|
||||
# position-wise
|
||||
self.w_2 = nn.Conv1d(
|
||||
d_hid,
|
||||
d_in,
|
||||
kernel_size=kernel_size[1],
|
||||
padding=(kernel_size[1] - 1) // 2,
|
||||
)
|
||||
|
||||
self.layer_norm = nn.LayerNorm(d_in, eps=1e-6)
|
||||
self.dropout_inner = nn.Dropout(dropout_inner)
|
||||
self.dropout = nn.Dropout(dropout)
|
||||
|
||||
def forward(self, x, mask=None):
|
||||
residual = x
|
||||
x = self.layer_norm(x)
|
||||
|
||||
output = x.transpose(1, 2)
|
||||
output = F.relu(self.w_1(output))
|
||||
if mask is not None:
|
||||
output = output.masked_fill(mask.unsqueeze(1), 0)
|
||||
output = self.dropout_inner(output)
|
||||
output = self.w_2(output)
|
||||
output = output.transpose(1, 2)
|
||||
output = self.dropout(output)
|
||||
|
||||
output = output + residual
|
||||
|
||||
return output
|
||||
|
||||
|
||||
class FFTBlock(nn.Module):
|
||||
"""FFT Block"""
|
||||
|
||||
def __init__(self,
|
||||
d_in,
|
||||
d_model,
|
||||
n_head,
|
||||
d_head,
|
||||
d_inner,
|
||||
kernel_size,
|
||||
dropout,
|
||||
dropout_attn=0.0,
|
||||
dropout_relu=0.0):
|
||||
super(FFTBlock, self).__init__()
|
||||
self.slf_attn = MultiHeadSelfAttention(
|
||||
n_head,
|
||||
d_in,
|
||||
d_model,
|
||||
d_head,
|
||||
dropout=dropout,
|
||||
dropatt=dropout_attn)
|
||||
self.pos_ffn = PositionwiseConvFeedForward(
|
||||
d_model,
|
||||
d_inner,
|
||||
kernel_size,
|
||||
dropout_inner=dropout_relu,
|
||||
dropout=dropout)
|
||||
|
||||
def forward(self, input, mask=None, slf_attn_mask=None):
|
||||
output, slf_attn = self.slf_attn(input, mask=slf_attn_mask)
|
||||
if mask is not None:
|
||||
output = output.masked_fill(mask.unsqueeze(-1), 0)
|
||||
|
||||
output = self.pos_ffn(output, mask=mask)
|
||||
if mask is not None:
|
||||
output = output.masked_fill(mask.unsqueeze(-1), 0)
|
||||
|
||||
return output, slf_attn
|
||||
|
||||
|
||||
class MultiHeadPNCAAttention(nn.Module):
|
||||
""" Multi-Head Attention PNCA module """
|
||||
|
||||
def __init__(self, n_head, d_model, d_mem, d_head, dropout, dropatt=0.0):
|
||||
super().__init__()
|
||||
|
||||
self.n_head = n_head
|
||||
self.d_head = d_head
|
||||
self.d_model = d_model
|
||||
self.d_mem = d_mem
|
||||
|
||||
self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
|
||||
|
||||
self.w_x_qkv = nn.Linear(d_model, 3 * n_head * d_head)
|
||||
self.fc_x = nn.Linear(n_head * d_head, d_model)
|
||||
|
||||
self.w_h_kv = nn.Linear(d_mem, 2 * n_head * d_head)
|
||||
self.fc_h = nn.Linear(n_head * d_head, d_model)
|
||||
|
||||
self.attention = ScaledDotProductAttention(
|
||||
temperature=np.power(d_head, 0.5), dropatt=dropatt)
|
||||
|
||||
self.dropout = nn.Dropout(dropout)
|
||||
|
||||
def update_x_state(self, x):
|
||||
d_head, n_head = self.d_head, self.n_head
|
||||
|
||||
sz_b, len_x, _ = x.size()
|
||||
|
||||
x_qkv = self.w_x_qkv(x)
|
||||
x_q, x_k, x_v = x_qkv.chunk(3, -1)
|
||||
|
||||
x_q = x_q.view(sz_b, len_x, n_head, d_head)
|
||||
x_k = x_k.view(sz_b, len_x, n_head, d_head)
|
||||
x_v = x_v.view(sz_b, len_x, n_head, d_head)
|
||||
|
||||
x_q = x_q.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_head)
|
||||
x_k = x_k.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_head)
|
||||
x_v = x_v.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_head)
|
||||
|
||||
if (self.x_state_size):
|
||||
self.x_k = torch.cat([self.x_k, x_k], dim=1)
|
||||
self.x_v = torch.cat([self.x_v, x_v], dim=1)
|
||||
else:
|
||||
self.x_k = x_k
|
||||
self.x_v = x_v
|
||||
|
||||
self.x_state_size += len_x
|
||||
|
||||
return x_q, x_k, x_v
|
||||
|
||||
def update_h_state(self, h):
|
||||
if (self.h_state_size == h.size(1)):
|
||||
return None, None
|
||||
|
||||
d_head, n_head = self.d_head, self.n_head
|
||||
|
||||
# H
|
||||
sz_b, len_h, _ = h.size()
|
||||
|
||||
h_kv = self.w_h_kv(h)
|
||||
h_k, h_v = h_kv.chunk(2, -1)
|
||||
|
||||
h_k = h_k.view(sz_b, len_h, n_head, d_head)
|
||||
h_v = h_v.view(sz_b, len_h, n_head, d_head)
|
||||
|
||||
self.h_k = h_k.permute(2, 0, 1, 3).contiguous().view(-1, len_h, d_head)
|
||||
self.h_v = h_v.permute(2, 0, 1, 3).contiguous().view(-1, len_h, d_head)
|
||||
|
||||
self.h_state_size += len_h
|
||||
|
||||
return h_k, h_v
|
||||
|
||||
def reset_state(self):
|
||||
self.h_k = None
|
||||
self.h_v = None
|
||||
self.h_state_size = 0
|
||||
self.x_k = None
|
||||
self.x_v = None
|
||||
self.x_state_size = 0
|
||||
|
||||
def forward(self, x, h, mask_x=None, mask_h=None):
|
||||
residual = x
|
||||
self.update_h_state(h)
|
||||
x_q, x_k, x_v = self.update_x_state(self.layer_norm(x))
|
||||
|
||||
d_head, n_head = self.d_head, self.n_head
|
||||
|
||||
sz_b, len_in, _ = x.size()
|
||||
|
||||
# X
|
||||
if mask_x is not None:
|
||||
mask_x = mask_x.repeat(n_head, 1, 1) # (n*b) x .. x ..
|
||||
output_x, attn_x = self.attention(x_q, self.x_k, self.x_v, mask=mask_x)
|
||||
|
||||
output_x = output_x.view(n_head, sz_b, len_in, d_head)
|
||||
output_x = (output_x.permute(1, 2, 0,
|
||||
3).contiguous().view(sz_b, len_in,
|
||||
-1)) # b x l x (n*d)
|
||||
output_x = self.fc_x(output_x)
|
||||
|
||||
# H
|
||||
if mask_h is not None:
|
||||
mask_h = mask_h.repeat(n_head, 1, 1)
|
||||
output_h, attn_h = self.attention(x_q, self.h_k, self.h_v, mask=mask_h)
|
||||
|
||||
output_h = output_h.view(n_head, sz_b, len_in, d_head)
|
||||
output_h = (output_h.permute(1, 2, 0,
|
||||
3).contiguous().view(sz_b, len_in,
|
||||
-1)) # b x l x (n*d)
|
||||
output_h = self.fc_h(output_h)
|
||||
|
||||
output = output_x + output_h
|
||||
|
||||
output = self.dropout(output)
|
||||
|
||||
output = output + residual
|
||||
|
||||
return output, attn_x, attn_h
|
||||
|
||||
|
||||
class PNCABlock(nn.Module):
|
||||
"""PNCA Block"""
|
||||
|
||||
def __init__(self,
|
||||
d_model,
|
||||
d_mem,
|
||||
n_head,
|
||||
d_head,
|
||||
d_inner,
|
||||
kernel_size,
|
||||
dropout,
|
||||
dropout_attn=0.0,
|
||||
dropout_relu=0.0):
|
||||
super(PNCABlock, self).__init__()
|
||||
self.pnca_attn = MultiHeadPNCAAttention(
|
||||
n_head,
|
||||
d_model,
|
||||
d_mem,
|
||||
d_head,
|
||||
dropout=dropout,
|
||||
dropatt=dropout_attn)
|
||||
self.pos_ffn = PositionwiseConvFeedForward(
|
||||
d_model,
|
||||
d_inner,
|
||||
kernel_size,
|
||||
dropout_inner=dropout_relu,
|
||||
dropout=dropout)
|
||||
|
||||
def forward(self,
|
||||
input,
|
||||
memory,
|
||||
mask=None,
|
||||
pnca_x_attn_mask=None,
|
||||
pnca_h_attn_mask=None):
|
||||
output, pnca_attn_x, pnca_attn_h = self.pnca_attn(
|
||||
input, memory, pnca_x_attn_mask, pnca_h_attn_mask)
|
||||
if mask is not None:
|
||||
output = output.masked_fill(mask.unsqueeze(-1), 0)
|
||||
|
||||
output = self.pos_ffn(output, mask=mask)
|
||||
if mask is not None:
|
||||
output = output.masked_fill(mask.unsqueeze(-1), 0)
|
||||
|
||||
return output, pnca_attn_x, pnca_attn_h
|
||||
|
||||
def reset_state(self):
|
||||
self.pnca_attn.reset_state()
|
||||
126
modelscope/models/audio/tts/models/models/sambert/fsmn.py
Normal file
@@ -0,0 +1,126 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
"""
|
||||
FSMN Pytorch Version
|
||||
"""
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
class FeedForwardNet(nn.Module):
|
||||
""" A two-feed-forward-layer module """
|
||||
|
||||
def __init__(self, d_in, d_hid, d_out, kernel_size=[1, 1], dropout=0.1):
|
||||
super().__init__()
|
||||
|
||||
# Use Conv1D
|
||||
# position-wise
|
||||
self.w_1 = nn.Conv1d(
|
||||
d_in,
|
||||
d_hid,
|
||||
kernel_size=kernel_size[0],
|
||||
padding=(kernel_size[0] - 1) // 2,
|
||||
)
|
||||
# position-wise
|
||||
self.w_2 = nn.Conv1d(
|
||||
d_hid,
|
||||
d_out,
|
||||
kernel_size=kernel_size[1],
|
||||
padding=(kernel_size[1] - 1) // 2,
|
||||
bias=False)
|
||||
|
||||
self.dropout = nn.Dropout(dropout)
|
||||
|
||||
def forward(self, x):
|
||||
output = x.transpose(1, 2)
|
||||
output = F.relu(self.w_1(output))
|
||||
output = self.dropout(output)
|
||||
output = self.w_2(output)
|
||||
output = output.transpose(1, 2)
|
||||
|
||||
return output
|
||||
|
||||
|
||||
class MemoryBlockV2(nn.Module):
|
||||
|
||||
def __init__(self, d, filter_size, shift, dropout=0.0):
|
||||
super(MemoryBlockV2, self).__init__()
|
||||
|
||||
left_padding = int(round((filter_size - 1) / 2))
|
||||
right_padding = int((filter_size - 1) / 2)
|
||||
if shift > 0:
|
||||
left_padding += shift
|
||||
right_padding -= shift
|
||||
|
||||
self.lp, self.rp = left_padding, right_padding
|
||||
|
||||
self.conv_dw = nn.Conv1d(d, d, filter_size, 1, 0, groups=d, bias=False)
|
||||
self.dropout = nn.Dropout(dropout)
|
||||
|
||||
def forward(self, input, mask=None):
|
||||
if mask is not None:
|
||||
input = input.masked_fill(mask.unsqueeze(-1), 0)
|
||||
|
||||
x = F.pad(
|
||||
input, (0, 0, self.lp, self.rp, 0, 0), mode='constant', value=0.0)
|
||||
output = self.conv_dw(x.contiguous().transpose(
|
||||
1, 2)).contiguous().transpose(1, 2)
|
||||
output += input
|
||||
output = self.dropout(output)
|
||||
|
||||
if mask is not None:
|
||||
output = output.masked_fill(mask.unsqueeze(-1), 0)
|
||||
|
||||
return output
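# The memory block is a depth-wise 1-D convolution over a context window of
# `filter_size` frames, with a residual connection; shift > 0 moves the window
# toward the past to reduce lookahead, and the padding keeps the sequence
# length unchanged. Shape check with arbitrary sizes.
block = MemoryBlockV2(d=8, filter_size=11, shift=0)
x = torch.randn(2, 20, 8)          # (B, T, D)
print(block(x).shape)              # torch.Size([2, 20, 8]) -- T preserved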
class FsmnEncoderV2(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
filter_size,
|
||||
fsmn_num_layers,
|
||||
input_dim,
|
||||
num_memory_units,
|
||||
ffn_inner_dim,
|
||||
dropout=0.0,
|
||||
shift=0):
|
||||
super(FsmnEncoderV2, self).__init__()
|
||||
|
||||
self.filter_size = filter_size
|
||||
self.fsmn_num_layers = fsmn_num_layers
|
||||
self.num_memory_units = num_memory_units
|
||||
self.ffn_inner_dim = ffn_inner_dim
|
||||
self.dropout = dropout
|
||||
self.shift = shift
|
||||
if not isinstance(shift, list):
|
||||
self.shift = [shift for _ in range(self.fsmn_num_layers)]
|
||||
|
||||
self.ffn_lst = nn.ModuleList()
|
||||
self.ffn_lst.append(
|
||||
FeedForwardNet(
|
||||
input_dim, ffn_inner_dim, num_memory_units, dropout=dropout))
|
||||
for i in range(1, fsmn_num_layers):
|
||||
self.ffn_lst.append(
|
||||
FeedForwardNet(
|
||||
num_memory_units,
|
||||
ffn_inner_dim,
|
||||
num_memory_units,
|
||||
dropout=dropout))
|
||||
|
||||
self.memory_block_lst = nn.ModuleList()
|
||||
for i in range(fsmn_num_layers):
|
||||
self.memory_block_lst.append(
|
||||
MemoryBlockV2(num_memory_units, filter_size, self.shift[i],
|
||||
dropout))
|
||||
|
||||
def forward(self, input, mask=None):
|
||||
x = F.dropout(input, self.dropout, self.training)
|
||||
for (ffn, memory_block) in zip(self.ffn_lst, self.memory_block_lst):
|
||||
context = ffn(x)
|
||||
memory = memory_block(context, mask)
|
||||
memory = F.dropout(memory, self.dropout, self.training)
|
||||
if (memory.size(-1) == x.size(-1)):
|
||||
memory += x
|
||||
x = memory
|
||||
|
||||
return x
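# Each layer applies an FFN followed by a memory block, with residual adds once
# the feature widths match; mask marks padded positions (they are zeroed inside
# the memory block). A usage sketch with assumed dimensions.
enc = FsmnEncoderV2(filter_size=11, fsmn_num_layers=3, input_dim=80,
                    num_memory_units=128, ffn_inner_dim=256, dropout=0.1)
x = torch.randn(4, 50, 80)                         # (B, T, input_dim)
pad_mask = torch.zeros(4, 50, dtype=torch.bool)    # no padding in this toy batch
print(enc(x, pad_mask).shape)                      # torch.Size([4, 50, 128])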
@@ -0,0 +1,718 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from modelscope.models.audio.tts.models.utils import get_mask_from_lengths
|
||||
from .adaptors import (LengthRegulator, VarFsmnRnnNARPredictor,
|
||||
VarRnnARPredictor)
|
||||
from .base import FFTBlock, PNCABlock, Prenet
|
||||
from .fsmn import FsmnEncoderV2
|
||||
from .positions import DurSinusoidalPositionEncoder, SinusoidalPositionEncoder
|
||||
|
||||
|
||||
class SelfAttentionEncoder(nn.Module):
|
||||
|
||||
def __init__(self, n_layer, d_in, d_model, n_head, d_head, d_inner,
|
||||
dropout, dropout_att, dropout_relu, position_encoder):
|
||||
super(SelfAttentionEncoder, self).__init__()
|
||||
|
||||
self.d_in = d_in
|
||||
self.d_model = d_model
|
||||
self.dropout = dropout
|
||||
d_in_lst = [d_in] + [d_model] * (n_layer - 1)
|
||||
self.fft = nn.ModuleList([
|
||||
FFTBlock(d, d_model, n_head, d_head, d_inner, (3, 1), dropout,
|
||||
dropout_att, dropout_relu) for d in d_in_lst
|
||||
])
|
||||
self.ln = nn.LayerNorm(d_model, eps=1e-6)
|
||||
self.position_enc = position_encoder
|
||||
|
||||
def forward(self, input, mask=None, return_attns=False):
|
||||
input *= self.d_model**0.5
|
||||
if (isinstance(self.position_enc, SinusoidalPositionEncoder)):
|
||||
input = self.position_enc(input)
|
||||
else:
|
||||
raise NotImplementedError('modelscope error: position_enc invalid')
|
||||
|
||||
input = F.dropout(input, p=self.dropout, training=self.training)
|
||||
|
||||
enc_slf_attn_list = []
|
||||
max_len = input.size(1)
|
||||
if mask is not None:
|
||||
slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1)
|
||||
else:
|
||||
slf_attn_mask = None
|
||||
|
||||
enc_output = input
|
||||
for id, layer in enumerate(self.fft):
|
||||
enc_output, enc_slf_attn = layer(
|
||||
enc_output, mask=mask, slf_attn_mask=slf_attn_mask)
|
||||
if return_attns:
|
||||
enc_slf_attn_list += [enc_slf_attn]
|
||||
|
||||
enc_output = self.ln(enc_output)
|
||||
|
||||
return enc_output, enc_slf_attn_list
|
||||
|
||||
|
||||
class HybridAttentionDecoder(nn.Module):
|
||||
|
||||
def __init__(self, d_in, prenet_units, n_layer, d_model, d_mem, n_head,
|
||||
d_head, d_inner, dropout, dropout_att, dropout_relu, d_out):
|
||||
super(HybridAttentionDecoder, self).__init__()
|
||||
|
||||
self.d_model = d_model
|
||||
self.dropout = dropout
|
||||
self.prenet = Prenet(d_in, prenet_units, d_model)
|
||||
self.dec_in_proj = nn.Linear(d_model + d_mem, d_model)
|
||||
self.pnca = nn.ModuleList([
|
||||
PNCABlock(d_model, d_mem, n_head, d_head, d_inner, (1, 1), dropout,
|
||||
dropout_att, dropout_relu) for _ in range(n_layer)
|
||||
])
|
||||
self.ln = nn.LayerNorm(d_model, eps=1e-6)
|
||||
self.dec_out_proj = nn.Linear(d_model, d_out)
|
||||
|
||||
def reset_state(self):
|
||||
for layer in self.pnca:
|
||||
layer.reset_state()
|
||||
|
||||
def get_pnca_attn_mask(self,
|
||||
device,
|
||||
max_len,
|
||||
x_band_width,
|
||||
h_band_width,
|
||||
mask=None):
|
||||
if mask is not None:
|
||||
pnca_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1)
|
||||
else:
|
||||
pnca_attn_mask = None
|
||||
|
||||
range_ = torch.arange(max_len).to(device)
|
||||
x_start = torch.clamp_min(range_ - x_band_width, 0)[None, None, :]
|
||||
x_end = (range_ + 1)[None, None, :]
|
||||
h_start = range_[None, None, :]
|
||||
h_end = torch.clamp_max(range_ + h_band_width + 1,
|
||||
max_len + 1)[None, None, :]
|
||||
|
||||
pnca_x_attn_mask = ~((x_start <= range_[None, :, None])
|
||||
& (x_end > range_[None, :, None])).transpose(1, 2) # yapf:disable
|
||||
pnca_h_attn_mask = ~((h_start <= range_[None, :, None])
|
||||
& (h_end > range_[None, :, None])).transpose(1, 2) # yapf:disable
|
||||
|
||||
if pnca_attn_mask is not None:
|
||||
pnca_x_attn_mask = (pnca_x_attn_mask | pnca_attn_mask)
|
||||
pnca_h_attn_mask = (pnca_h_attn_mask | pnca_attn_mask)
|
||||
pnca_x_attn_mask = pnca_x_attn_mask.masked_fill(
|
||||
pnca_attn_mask.transpose(1, 2), False)
|
||||
pnca_h_attn_mask = pnca_h_attn_mask.masked_fill(
|
||||
pnca_attn_mask.transpose(1, 2), False)
|
||||
|
||||
return pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask
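# The two returned boolean masks are True where attention is blocked:
# self-attention over already generated frames (x) is limited to a trailing
# band of width x_band_width, and attention over the memory (h) to a leading
# band of width h_band_width. A small illustration; the decoder sizes below are
# placeholders used only to build an instance.
dec = HybridAttentionDecoder(d_in=80, prenet_units=[256, 256], n_layer=2,
                             d_model=128, d_mem=64, n_head=4, d_head=32,
                             d_inner=256, dropout=0.1, dropout_att=0.0,
                             dropout_relu=0.0, d_out=80)
_, x_mask, h_mask = dec.get_pnca_attn_mask('cpu', max_len=5, x_band_width=2,
                                           h_band_width=2, mask=None)
print(x_mask[0].int())    # 0 marks allowed positions: columns max(t - 2, 0) .. t in row t
print(h_mask[0].int())    # 0 marks allowed positions: columns t .. min(t + 2, 4) in row t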
# reset_state() must be called before this method
|
||||
def forward(self,
|
||||
input,
|
||||
memory,
|
||||
x_band_width,
|
||||
h_band_width,
|
||||
mask=None,
|
||||
return_attns=False):
|
||||
input = self.prenet(input)
|
||||
input = torch.cat([memory, input], dim=-1)
|
||||
input = self.dec_in_proj(input)
|
||||
|
||||
if mask is not None:
|
||||
input = input.masked_fill(mask.unsqueeze(-1), 0)
|
||||
|
||||
input *= self.d_model**0.5
|
||||
input = F.dropout(input, p=self.dropout, training=self.training)
|
||||
|
||||
max_len = input.size(1)
|
||||
pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask = self.get_pnca_attn_mask(
|
||||
input.device, max_len, x_band_width, h_band_width, mask)
|
||||
|
||||
dec_pnca_attn_x_list = []
|
||||
dec_pnca_attn_h_list = []
|
||||
dec_output = input
|
||||
for id, layer in enumerate(self.pnca):
|
||||
dec_output, dec_pnca_attn_x, dec_pnca_attn_h = layer(
|
||||
dec_output,
|
||||
memory,
|
||||
mask=mask,
|
||||
pnca_x_attn_mask=pnca_x_attn_mask,
|
||||
pnca_h_attn_mask=pnca_h_attn_mask)
|
||||
if return_attns:
|
||||
dec_pnca_attn_x_list += [dec_pnca_attn_x]
|
||||
dec_pnca_attn_h_list += [dec_pnca_attn_h]
|
||||
|
||||
dec_output = self.ln(dec_output)
|
||||
dec_output = self.dec_out_proj(dec_output)
|
||||
|
||||
return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list
|
||||
|
||||
# reset_state() must be called beforehand when step == 0
|
||||
def infer(self,
|
||||
step,
|
||||
input,
|
||||
memory,
|
||||
x_band_width,
|
||||
h_band_width,
|
||||
mask=None,
|
||||
return_attns=False):
|
||||
max_len = memory.size(1)
|
||||
|
||||
input = self.prenet(input)
|
||||
input = torch.cat([memory[:, step:step + 1, :], input], dim=-1)
|
||||
input = self.dec_in_proj(input)
|
||||
|
||||
input *= self.d_model**0.5
|
||||
input = F.dropout(input, p=self.dropout, training=self.training)
|
||||
|
||||
pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask = self.get_pnca_attn_mask(
|
||||
input.device, max_len, x_band_width, h_band_width, mask)
|
||||
|
||||
dec_pnca_attn_x_list = []
|
||||
dec_pnca_attn_h_list = []
|
||||
dec_output = input
|
||||
for id, layer in enumerate(self.pnca):
|
||||
if mask is not None:
|
||||
mask_step = mask[:, step:step + 1]
|
||||
else:
|
||||
mask_step = None
|
||||
dec_output, dec_pnca_attn_x, dec_pnca_attn_h = layer(
|
||||
dec_output,
|
||||
memory,
|
||||
mask=mask_step,
|
||||
pnca_x_attn_mask=pnca_x_attn_mask[:,
|
||||
step:step + 1, :(step + 1)],
|
||||
pnca_h_attn_mask=pnca_h_attn_mask[:, step:step + 1, :])
|
||||
if return_attns:
|
||||
dec_pnca_attn_x_list += [dec_pnca_attn_x]
|
||||
dec_pnca_attn_h_list += [dec_pnca_attn_h]
|
||||
|
||||
dec_output = self.ln(dec_output)
|
||||
dec_output = self.dec_out_proj(dec_output)
|
||||
|
||||
return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list
|
||||
|
||||
|
||||
class TextFftEncoder(nn.Module):
|
||||
|
||||
def __init__(self, config, ling_unit_size):
|
||||
super(TextFftEncoder, self).__init__()
|
||||
|
||||
# linguistic unit lookup table
|
||||
nb_ling_sy = ling_unit_size['sy']
|
||||
nb_ling_tone = ling_unit_size['tone']
|
||||
nb_ling_syllable_flag = ling_unit_size['syllable_flag']
|
||||
nb_ling_ws = ling_unit_size['word_segment']
|
||||
|
||||
max_len = config['am']['max_len']
|
||||
|
||||
d_emb = config['am']['embedding_dim']
|
||||
nb_layers = config['am']['encoder_num_layers']
|
||||
nb_heads = config['am']['encoder_num_heads']
|
||||
d_model = config['am']['encoder_num_units']
|
||||
d_head = d_model // nb_heads
|
||||
d_inner = config['am']['encoder_ffn_inner_dim']
|
||||
dropout = config['am']['encoder_dropout']
|
||||
dropout_attn = config['am']['encoder_attention_dropout']
|
||||
dropout_relu = config['am']['encoder_relu_dropout']
|
||||
d_proj = config['am']['encoder_projection_units']
|
||||
|
||||
self.d_model = d_model
|
||||
|
||||
self.sy_emb = nn.Embedding(nb_ling_sy, d_emb)
|
||||
self.tone_emb = nn.Embedding(nb_ling_tone, d_emb)
|
||||
self.syllable_flag_emb = nn.Embedding(nb_ling_syllable_flag, d_emb)
|
||||
self.ws_emb = nn.Embedding(nb_ling_ws, d_emb)
|
||||
|
||||
position_enc = SinusoidalPositionEncoder(max_len, d_emb)
|
||||
|
||||
self.ling_enc = SelfAttentionEncoder(nb_layers, d_emb, d_model,
|
||||
nb_heads, d_head, d_inner,
|
||||
dropout, dropout_attn,
|
||||
dropout_relu, position_enc)
|
||||
|
||||
self.ling_proj = nn.Linear(d_model, d_proj, bias=False)
|
||||
|
||||
def forward(self, inputs_ling, masks=None, return_attns=False):
|
||||
# Parse inputs_ling_seq
|
||||
inputs_sy = inputs_ling[:, :, 0]
|
||||
inputs_tone = inputs_ling[:, :, 1]
|
||||
inputs_syllable_flag = inputs_ling[:, :, 2]
|
||||
inputs_ws = inputs_ling[:, :, 3]
|
||||
|
||||
# Lookup table
|
||||
sy_embedding = self.sy_emb(inputs_sy)
|
||||
tone_embedding = self.tone_emb(inputs_tone)
|
||||
syllable_flag_embedding = self.syllable_flag_emb(inputs_syllable_flag)
|
||||
ws_embedding = self.ws_emb(inputs_ws)
|
||||
|
||||
ling_embedding = sy_embedding + tone_embedding + syllable_flag_embedding + ws_embedding
|
||||
|
||||
enc_output, enc_slf_attn_list = self.ling_enc(ling_embedding, masks,
|
||||
return_attns)
|
||||
|
||||
enc_output = self.ling_proj(enc_output)
|
||||
|
||||
return enc_output, enc_slf_attn_list
|
||||
|
||||
|
||||
class VarianceAdaptor(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super(VarianceAdaptor, self).__init__()
|
||||
|
||||
input_dim = config['am']['encoder_projection_units'] + config['am'][
|
||||
'emotion_units'] + config['am']['speaker_units']
|
||||
filter_size = config['am']['predictor_filter_size']
|
||||
fsmn_num_layers = config['am']['predictor_fsmn_num_layers']
|
||||
num_memory_units = config['am']['predictor_num_memory_units']
|
||||
ffn_inner_dim = config['am']['predictor_ffn_inner_dim']
|
||||
dropout = config['am']['predictor_dropout']
|
||||
shift = config['am']['predictor_shift']
|
||||
lstm_units = config['am']['predictor_lstm_units']
|
||||
|
||||
dur_pred_prenet_units = config['am']['dur_pred_prenet_units']
|
||||
dur_pred_lstm_units = config['am']['dur_pred_lstm_units']
|
||||
|
||||
self.pitch_predictor = VarFsmnRnnNARPredictor(input_dim, filter_size,
|
||||
fsmn_num_layers,
|
||||
num_memory_units,
|
||||
ffn_inner_dim, dropout,
|
||||
shift, lstm_units)
|
||||
self.energy_predictor = VarFsmnRnnNARPredictor(input_dim, filter_size,
|
||||
fsmn_num_layers,
|
||||
num_memory_units,
|
||||
ffn_inner_dim, dropout,
|
||||
shift, lstm_units)
|
||||
self.duration_predictor = VarRnnARPredictor(input_dim,
|
||||
dur_pred_prenet_units,
|
||||
dur_pred_lstm_units)
|
||||
|
||||
self.length_regulator = LengthRegulator(
|
||||
config['am']['outputs_per_step'])
|
||||
self.dur_position_encoder = DurSinusoidalPositionEncoder(
|
||||
config['am']['encoder_projection_units'],
|
||||
config['am']['outputs_per_step'])
|
||||
|
||||
self.pitch_emb = nn.Conv1d(
|
||||
1,
|
||||
config['am']['encoder_projection_units'],
|
||||
kernel_size=9,
|
||||
padding=4)
|
||||
self.energy_emb = nn.Conv1d(
|
||||
1,
|
||||
config['am']['encoder_projection_units'],
|
||||
kernel_size=9,
|
||||
padding=4)
|
||||
|
||||
def forward(self,
|
||||
inputs_text_embedding,
|
||||
inputs_emo_embedding,
|
||||
inputs_spk_embedding,
|
||||
masks=None,
|
||||
output_masks=None,
|
||||
duration_targets=None,
|
||||
pitch_targets=None,
|
||||
energy_targets=None):
|
||||
|
||||
batch_size = inputs_text_embedding.size(0)
|
||||
|
||||
variance_predictor_inputs = torch.cat([
|
||||
inputs_text_embedding, inputs_spk_embedding, inputs_emo_embedding
|
||||
], dim=-1) # yapf:disable
|
||||
|
||||
pitch_predictions = self.pitch_predictor(variance_predictor_inputs,
|
||||
masks)
|
||||
energy_predictions = self.energy_predictor(variance_predictor_inputs,
|
||||
masks)
|
||||
|
||||
if pitch_targets is not None:
|
||||
pitch_embeddings = self.pitch_emb(
|
||||
pitch_targets.unsqueeze(1)).transpose(1, 2)
|
||||
else:
|
||||
pitch_embeddings = self.pitch_emb(
|
||||
pitch_predictions.unsqueeze(1)).transpose(1, 2)
|
||||
|
||||
if energy_targets is not None:
|
||||
energy_embeddings = self.energy_emb(
|
||||
energy_targets.unsqueeze(1)).transpose(1, 2)
|
||||
else:
|
||||
energy_embeddings = self.energy_emb(
|
||||
energy_predictions.unsqueeze(1)).transpose(1, 2)
|
||||
|
||||
inputs_text_embedding_aug = inputs_text_embedding + pitch_embeddings + energy_embeddings
|
||||
duration_predictor_cond = torch.cat([
|
||||
inputs_text_embedding_aug, inputs_spk_embedding,
|
||||
inputs_emo_embedding
|
||||
], dim=-1) # yapf:disable
|
||||
if duration_targets is not None:
|
||||
duration_predictor_go_frame = torch.zeros(batch_size, 1).to(
|
||||
inputs_text_embedding.device)
|
||||
duration_predictor_input = torch.cat([
|
||||
duration_predictor_go_frame, duration_targets[:, :-1].float()
|
||||
], dim=-1) # yapf:disable
|
||||
duration_predictor_input = torch.log(duration_predictor_input + 1)
|
||||
log_duration_predictions, _ = self.duration_predictor(
|
||||
duration_predictor_input.unsqueeze(-1),
|
||||
duration_predictor_cond,
|
||||
masks=masks)
|
||||
duration_predictions = torch.exp(log_duration_predictions) - 1
|
||||
else:
|
||||
log_duration_predictions = self.duration_predictor.infer(
|
||||
duration_predictor_cond, masks=masks)
|
||||
duration_predictions = torch.exp(log_duration_predictions) - 1
|
||||
|
||||
if duration_targets is not None:
|
||||
LR_text_outputs, LR_length_rounded = self.length_regulator(
|
||||
inputs_text_embedding_aug,
|
||||
duration_targets,
|
||||
masks=output_masks)
|
||||
LR_position_embeddings = self.dur_position_encoder(
|
||||
duration_targets, masks=output_masks)
|
||||
LR_emo_outputs, _ = self.length_regulator(
|
||||
inputs_emo_embedding, duration_targets, masks=output_masks)
|
||||
LR_spk_outputs, _ = self.length_regulator(
|
||||
inputs_spk_embedding, duration_targets, masks=output_masks)
|
||||
|
||||
else:
|
||||
LR_text_outputs, LR_length_rounded = self.length_regulator(
|
||||
inputs_text_embedding_aug,
|
||||
duration_predictions,
|
||||
masks=output_masks)
|
||||
LR_position_embeddings = self.dur_position_encoder(
|
||||
duration_predictions, masks=output_masks)
|
||||
LR_emo_outputs, _ = self.length_regulator(
|
||||
inputs_emo_embedding, duration_predictions, masks=output_masks)
|
||||
LR_spk_outputs, _ = self.length_regulator(
|
||||
inputs_spk_embedding, duration_predictions, masks=output_masks)
|
||||
|
||||
LR_text_outputs = LR_text_outputs + LR_position_embeddings
|
||||
|
||||
return (LR_text_outputs, LR_emo_outputs, LR_spk_outputs,
|
||||
LR_length_rounded, log_duration_predictions, pitch_predictions,
|
||||
energy_predictions)
|
||||
|
||||
|
||||
class MelPNCADecoder(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super(MelPNCADecoder, self).__init__()
|
||||
|
||||
prenet_units = config['am']['decoder_prenet_units']
|
||||
nb_layers = config['am']['decoder_num_layers']
|
||||
nb_heads = config['am']['decoder_num_heads']
|
||||
d_model = config['am']['decoder_num_units']
|
||||
d_head = d_model // nb_heads
|
||||
d_inner = config['am']['decoder_ffn_inner_dim']
|
||||
dropout = config['am']['decoder_dropout']
|
||||
dropout_attn = config['am']['decoder_attention_dropout']
|
||||
dropout_relu = config['am']['decoder_relu_dropout']
|
||||
outputs_per_step = config['am']['outputs_per_step']
|
||||
|
||||
d_mem = config['am'][
|
||||
'encoder_projection_units'] * outputs_per_step + config['am'][
|
||||
'emotion_units'] + config['am']['speaker_units']
|
||||
d_mel = config['am']['num_mels']
|
||||
|
||||
self.d_mel = d_mel
|
||||
self.r = outputs_per_step
|
||||
self.nb_layers = nb_layers
|
||||
|
||||
self.mel_dec = HybridAttentionDecoder(d_mel, prenet_units, nb_layers,
|
||||
d_model, d_mem, nb_heads, d_head,
|
||||
d_inner, dropout, dropout_attn,
|
||||
dropout_relu,
|
||||
d_mel * outputs_per_step)
|
||||
|
||||
def forward(self,
|
||||
memory,
|
||||
x_band_width,
|
||||
h_band_width,
|
||||
target=None,
|
||||
mask=None,
|
||||
return_attns=False):
|
||||
batch_size = memory.size(0)
|
||||
go_frame = torch.zeros((batch_size, 1, self.d_mel)).to(memory.device)
|
||||
|
||||
if target is not None:
|
||||
self.mel_dec.reset_state()
|
||||
input = target[:, self.r - 1::self.r, :]
|
||||
input = torch.cat([go_frame, input], dim=1)[:, :-1, :]
|
||||
dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list = self.mel_dec(
|
||||
input,
|
||||
memory,
|
||||
x_band_width,
|
||||
h_band_width,
|
||||
mask=mask,
|
||||
return_attns=return_attns)
|
||||
|
||||
else:
|
||||
dec_output = []
|
||||
dec_pnca_attn_x_list = [[] for _ in range(self.nb_layers)]
|
||||
dec_pnca_attn_h_list = [[] for _ in range(self.nb_layers)]
|
||||
self.mel_dec.reset_state()
|
||||
input = go_frame
|
||||
for step in range(memory.size(1)):
|
||||
dec_output_step, dec_pnca_attn_x_step, dec_pnca_attn_h_step = self.mel_dec.infer(
|
||||
step,
|
||||
input,
|
||||
memory,
|
||||
x_band_width,
|
||||
h_band_width,
|
||||
mask=mask,
|
||||
return_attns=return_attns)
|
||||
input = dec_output_step[:, :, -self.d_mel:]
|
||||
|
||||
dec_output.append(dec_output_step)
|
||||
for layer_id, (pnca_x_attn, pnca_h_attn) in enumerate(
|
||||
zip(dec_pnca_attn_x_step, dec_pnca_attn_h_step)):
|
||||
left = memory.size(1) - pnca_x_attn.size(-1)
|
||||
if (left > 0):
|
||||
padding = torch.zeros(
|
||||
(pnca_x_attn.size(0), 1, left)).to(pnca_x_attn)
|
||||
pnca_x_attn = torch.cat([pnca_x_attn, padding], dim=-1)
|
||||
dec_pnca_attn_x_list[layer_id].append(pnca_x_attn)
|
||||
dec_pnca_attn_h_list[layer_id].append(pnca_h_attn)
|
||||
|
||||
dec_output = torch.cat(dec_output, dim=1)
|
||||
for layer_id in range(self.nb_layers):
|
||||
dec_pnca_attn_x_list[layer_id] = torch.cat(
|
||||
dec_pnca_attn_x_list[layer_id], dim=1)
|
||||
dec_pnca_attn_h_list[layer_id] = torch.cat(
|
||||
dec_pnca_attn_h_list[layer_id], dim=1)
|
||||
|
||||
return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list
|
||||
|
||||
|
||||
class PostNet(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super(PostNet, self).__init__()
|
||||
|
||||
self.filter_size = config['am']['postnet_filter_size']
|
||||
self.fsmn_num_layers = config['am']['postnet_fsmn_num_layers']
|
||||
self.num_memory_units = config['am']['postnet_num_memory_units']
|
||||
self.ffn_inner_dim = config['am']['postnet_ffn_inner_dim']
|
||||
self.dropout = config['am']['postnet_dropout']
|
||||
self.shift = config['am']['postnet_shift']
|
||||
self.lstm_units = config['am']['postnet_lstm_units']
|
||||
self.num_mels = config['am']['num_mels']
|
||||
|
||||
self.fsmn = FsmnEncoderV2(self.filter_size, self.fsmn_num_layers,
|
||||
self.num_mels, self.num_memory_units,
|
||||
self.ffn_inner_dim, self.dropout, self.shift)
|
||||
self.lstm = nn.LSTM(
|
||||
self.num_memory_units,
|
||||
self.lstm_units,
|
||||
num_layers=1,
|
||||
batch_first=True)
|
||||
self.fc = nn.Linear(self.lstm_units, self.num_mels)
|
||||
|
||||
def forward(self, x, mask=None):
|
||||
postnet_fsmn_output = self.fsmn(x, mask)
|
||||
# The input can also be a packed variable length sequence,
|
||||
# here we just omit it for simplicity due to the mask and uni-directional lstm.
|
||||
postnet_lstm_output, _ = self.lstm(postnet_fsmn_output)
|
||||
mel_residual_output = self.fc(postnet_lstm_output)
|
||||
|
||||
return mel_residual_output
|
||||
|
||||
|
||||
def mel_recon_loss_fn(output_lengths,
|
||||
mel_targets,
|
||||
dec_outputs,
|
||||
postnet_outputs=None):
|
||||
mae_loss = nn.L1Loss(reduction='none')
|
||||
|
||||
output_masks = get_mask_from_lengths(
|
||||
output_lengths, max_len=mel_targets.size(1))
|
||||
output_masks = ~output_masks
|
||||
valid_outputs = output_masks.sum()
|
||||
|
||||
mel_loss_ = torch.sum(
|
||||
mae_loss(mel_targets, dec_outputs) * output_masks.unsqueeze(-1)) / (
|
||||
valid_outputs * mel_targets.size(-1))
|
||||
|
||||
if postnet_outputs is not None:
|
||||
mel_loss = torch.sum(
|
||||
mae_loss(mel_targets, postnet_outputs)
|
||||
* output_masks.unsqueeze(-1)) / (
|
||||
valid_outputs * mel_targets.size(-1))
|
||||
else:
|
||||
mel_loss = 0.0
|
||||
|
||||
return mel_loss_, mel_loss
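# Both terms are length-masked mean absolute errors: the first compares the
# decoder output to the target mel, the second (when postnet outputs are given)
# compares the postnet-refined mel. A hedged sketch with random stand-in tensors.
mel_targets = torch.randn(2, 100, 80)              # (B, frames, num_mels)
dec_outputs = torch.randn(2, 100, 80)
postnet_outputs = torch.randn(2, 100, 80)
output_lengths = torch.tensor([100, 80])           # second utterance is padded
dec_loss, postnet_loss = mel_recon_loss_fn(output_lengths, mel_targets,
                                           dec_outputs, postnet_outputs)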
def prosody_recon_loss_fn(input_lengths, duration_targets, pitch_targets,
|
||||
energy_targets, log_duration_predictions,
|
||||
pitch_predictions, energy_predictions):
|
||||
mae_loss = nn.L1Loss(reduction='none')
|
||||
|
||||
input_masks = get_mask_from_lengths(
|
||||
input_lengths, max_len=duration_targets.size(1))
|
||||
input_masks = ~input_masks
|
||||
valid_inputs = input_masks.sum()
|
||||
|
||||
dur_loss = torch.sum(
|
||||
mae_loss(
|
||||
torch.log(duration_targets.float() + 1), log_duration_predictions)
|
||||
* input_masks) / valid_inputs
|
||||
pitch_loss = torch.sum(
|
||||
mae_loss(pitch_targets, pitch_predictions)
|
||||
* input_masks) / valid_inputs
|
||||
energy_loss = torch.sum(
|
||||
mae_loss(energy_targets, energy_predictions)
|
||||
* input_masks) / valid_inputs
|
||||
|
||||
return dur_loss, pitch_loss, energy_loss
|
||||
|
||||
|
||||
class KanTtsSAMBERT(nn.Module):

    def __init__(self, config, ling_unit_size):
        super(KanTtsSAMBERT, self).__init__()

        self.text_encoder = TextFftEncoder(config, ling_unit_size)
        self.spk_tokenizer = nn.Embedding(ling_unit_size['speaker'],
                                          config['am']['speaker_units'])
        self.emo_tokenizer = nn.Embedding(ling_unit_size['emotion'],
                                          config['am']['emotion_units'])
        self.variance_adaptor = VarianceAdaptor(config)
        self.mel_decoder = MelPNCADecoder(config)
        self.mel_postnet = PostNet(config)

    def get_lfr_mask_from_lengths(self, lengths, max_len):
        batch_size = lengths.size(0)
        # Round each length up to a multiple of outputs_per_step (the LFR
        # factor), then express it in low-frame-rate steps.
        padded_lr_lengths = torch.zeros_like(lengths)
        for i in range(batch_size):
            len_item = int(lengths[i].item())
            padding = self.mel_decoder.r - len_item % self.mel_decoder.r
            if padding < self.mel_decoder.r:
                padded_lr_lengths[i] = (len_item
                                        + padding) // self.mel_decoder.r
            else:
                padded_lr_lengths[i] = len_item // self.mel_decoder.r

        return get_mask_from_lengths(
            padded_lr_lengths, max_len=max_len // self.mel_decoder.r)

    def forward(self,
                inputs_ling,
                inputs_emotion,
                inputs_speaker,
                input_lengths,
                output_lengths=None,
                mel_targets=None,
                duration_targets=None,
                pitch_targets=None,
                energy_targets=None):

        batch_size = inputs_ling.size(0)

        input_masks = get_mask_from_lengths(
            input_lengths, max_len=inputs_ling.size(1))

        text_hid, enc_slf_attn_lst = self.text_encoder(
            inputs_ling, input_masks, return_attns=True)

        emo_hid = self.emo_tokenizer(inputs_emotion)
        spk_hid = self.spk_tokenizer(inputs_speaker)

        if output_lengths is not None:
            output_masks = get_mask_from_lengths(
                output_lengths, max_len=mel_targets.size(1))
        else:
            output_masks = None

        (LR_text_outputs, LR_emo_outputs, LR_spk_outputs, LR_length_rounded,
         log_duration_predictions, pitch_predictions,
         energy_predictions) = self.variance_adaptor(
             text_hid,
             emo_hid,
             spk_hid,
             masks=input_masks,
             output_masks=output_masks,
             duration_targets=duration_targets,
             pitch_targets=pitch_targets,
             energy_targets=energy_targets)

        if output_lengths is not None:
            lfr_masks = self.get_lfr_mask_from_lengths(
                output_lengths, max_len=LR_text_outputs.size(1))
        else:
            output_masks = get_mask_from_lengths(
                LR_length_rounded, max_len=LR_text_outputs.size(1))
            lfr_masks = None

        # LFR with the factor of outputs_per_step
        LFR_text_inputs = LR_text_outputs.contiguous().view(
            batch_size, -1, self.mel_decoder.r * text_hid.shape[-1])
        LFR_emo_inputs = LR_emo_outputs.contiguous().view(
            batch_size, -1,
            self.mel_decoder.r * emo_hid.shape[-1])[:, :, :emo_hid.shape[-1]]
        LFR_spk_inputs = LR_spk_outputs.contiguous().view(
            batch_size, -1,
            self.mel_decoder.r * spk_hid.shape[-1])[:, :, :spk_hid.shape[-1]]

        memory = torch.cat([LFR_text_inputs, LFR_spk_inputs, LFR_emo_inputs],
                           dim=-1)

        # The PNCA attention band width is derived from the longest duration,
        # expressed in low-frame-rate steps.
        if duration_targets is not None:
            x_band_width = int(
                duration_targets.float().masked_fill(input_masks, 0).max()
                / self.mel_decoder.r + 0.5)
            h_band_width = x_band_width
        else:
            x_band_width = int((torch.exp(log_duration_predictions) - 1).max()
                               / self.mel_decoder.r + 0.5)
            h_band_width = x_band_width

        dec_outputs, pnca_x_attn_lst, pnca_h_attn_lst = self.mel_decoder(
            memory,
            x_band_width,
            h_band_width,
            target=mel_targets,
            mask=lfr_masks,
            return_attns=True)

        # De-LFR with the factor of outputs_per_step
        dec_outputs = dec_outputs.contiguous().view(batch_size, -1,
                                                    self.mel_decoder.d_mel)

        if output_masks is not None:
            dec_outputs = dec_outputs.masked_fill(
                output_masks.unsqueeze(-1), 0)

        postnet_outputs = self.mel_postnet(dec_outputs,
                                           output_masks) + dec_outputs
        if output_masks is not None:
            postnet_outputs = postnet_outputs.masked_fill(
                output_masks.unsqueeze(-1), 0)

        res = {
            'x_band_width': x_band_width,
            'h_band_width': h_band_width,
            'enc_slf_attn_lst': enc_slf_attn_lst,
            'pnca_x_attn_lst': pnca_x_attn_lst,
            'pnca_h_attn_lst': pnca_h_attn_lst,
            'dec_outputs': dec_outputs,
            'postnet_outputs': postnet_outputs,
            'LR_length_rounded': LR_length_rounded,
            'log_duration_predictions': log_duration_predictions,
            'pitch_predictions': pitch_predictions,
            'energy_predictions': energy_predictions
        }

        res['LR_text_outputs'] = LR_text_outputs
        res['LR_emo_outputs'] = LR_emo_outputs
        res['LR_spk_outputs'] = LR_spk_outputs

        return res
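Illustrative sketch (not part of the original commit) of the LFR grouping performed in forward(): with outputs_per_step r, every r consecutive length-regulated frames are packed into one decoder step, and the decoder output is unpacked back to frame rate afterwards ("de-LFR"). The shapes below are made up.

batch, frames, dim, r = 2, 12, 16, 4
LR_text_outputs = torch.randn(batch, frames, dim)

# LFR: [batch, frames, dim] -> [batch, frames // r, r * dim]
LFR_text_inputs = LR_text_outputs.contiguous().view(batch, -1, r * dim)
assert LFR_text_inputs.shape == (batch, frames // r, r * dim)

# De-LFR: the decoder emits r * num_mels values per step, reshaped back to frames
num_mels = 80
dec_steps = torch.randn(batch, frames // r, r * num_mels)
dec_outputs = dec_steps.contiguous().view(batch, -1, num_mels)
assert dec_outputs.shape == (batch, frames, num_mels)
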
101
modelscope/models/audio/tts/models/models/sambert/positions.py
Normal file
@@ -0,0 +1,101 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class SinusoidalPositionEncoder(nn.Module):

    def __init__(self, max_len, depth):
        super(SinusoidalPositionEncoder, self).__init__()

        self.max_len = max_len
        self.depth = depth
        self.position_enc = nn.Parameter(
            self.get_sinusoid_encoding_table(max_len, depth).unsqueeze(0),
            requires_grad=False)

    def forward(self, input):
        bz_in, len_in, _ = input.size()
        # Regrow the cached table on the fly if the input is longer than max_len.
        if len_in > self.max_len:
            self.max_len = len_in
            self.position_enc.data = self.get_sinusoid_encoding_table(
                self.max_len, self.depth).unsqueeze(0).to(input.device)

        output = input + self.position_enc[:, :len_in, :].expand(bz_in, -1, -1)

        return output

    @staticmethod
    def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
        """Sinusoid position encoding table."""

        def cal_angle(position, hid_idx):
            return position / np.power(10000, hid_idx / float(d_hid / 2 - 1))

        def get_posi_angle_vec(position):
            return [cal_angle(position, hid_j) for hid_j in range(d_hid // 2)]

        scaled_time_table = np.array(
            [get_posi_angle_vec(pos_i + 1) for pos_i in range(n_position)])

        sinusoid_table = np.zeros((n_position, d_hid))
        sinusoid_table[:, :d_hid // 2] = np.sin(scaled_time_table)
        sinusoid_table[:, d_hid // 2:] = np.cos(scaled_time_table)

        if padding_idx is not None:
            # zero vector for padding dimension
            sinusoid_table[padding_idx] = 0.0

        return torch.FloatTensor(sinusoid_table)

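Brief usage sketch (illustrative only): the encoder adds a fixed table to its input and regrows the table when a longer sequence arrives; note that, unlike the interleaved Transformer layout, the first depth/2 channels are sine and the last depth/2 are cosine.

enc = SinusoidalPositionEncoder(max_len=100, depth=64)
x = torch.randn(2, 80, 64)
y = enc(x)                 # same shape, position encodings added
assert y.shape == x.shape

table = SinusoidalPositionEncoder.get_sinusoid_encoding_table(100, 64)
assert table.shape == (100, 64)
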
class DurSinusoidalPositionEncoder(nn.Module):

    def __init__(self, depth, outputs_per_step):
        super(DurSinusoidalPositionEncoder, self).__init__()

        self.depth = depth
        self.outputs_per_step = outputs_per_step

        inv_timescales = [
            np.power(10000, 2 * (hid_idx // 2) / depth)
            for hid_idx in range(depth)
        ]
        self.inv_timescales = nn.Parameter(
            torch.FloatTensor(inv_timescales), requires_grad=False)

    def forward(self, durations, masks=None):
        # Each unit contributes round(duration) frames; the position index
        # restarts from 1 at every unit boundary.
        reps = (durations + 0.5).long()
        output_lens = reps.sum(dim=1)
        max_len = output_lens.max()
        reps_cumsum = torch.cumsum(
            F.pad(reps.float(), (1, 0, 0, 0), value=0.0), dim=1)[:, None, :]
        range_ = torch.arange(max_len).to(durations.device)[None, :, None]
        mult = ((reps_cumsum[:, :, :-1] <= range_)
                & (reps_cumsum[:, :, 1:] > range_))  # yapf:disable
        mult = mult.float()
        offsets = torch.matmul(mult,
                               reps_cumsum[:,
                                           0, :-1].unsqueeze(-1)).squeeze(-1)
        dur_pos = range_[:, :, 0] - offsets + 1

        if masks is not None:
            assert masks.size(1) == dur_pos.size(1)
            dur_pos = dur_pos.masked_fill(masks, 0.0)

        # Pad the frame axis up to a multiple of outputs_per_step.
        seq_len = dur_pos.size(1)
        padding = self.outputs_per_step - int(seq_len) % self.outputs_per_step
        if padding < self.outputs_per_step:
            dur_pos = F.pad(dur_pos, (0, padding, 0, 0), value=0.0)

        position_embedding = dur_pos[:, :, None] / self.inv_timescales[None,
                                                                       None, :]
        position_embedding[:, :, 0::2] = torch.sin(position_embedding[:, :,
                                                                      0::2])
        position_embedding[:, :, 1::2] = torch.cos(position_embedding[:, :,
                                                                      1::2])

        return position_embedding
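Another illustrative sketch (not part of the commit): for durations [2, 3] the frame-level position index restarts at each unit (1, 2, 1, 2, 3), and the sequence is padded up to a multiple of outputs_per_step before the sinusoidal projection.

dur_enc = DurSinusoidalPositionEncoder(depth=8, outputs_per_step=3)
durations = torch.tensor([[2.0, 3.0]])   # one utterance, two units
pos_emb = dur_enc(durations)
# 5 frames, padded to 6 (a multiple of outputs_per_step), depth 8
assert pos_emb.shape == (1, 6, 8)
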
@@ -1,174 +0,0 @@
|
||||
"""Define position encoder classes."""
|
||||
|
||||
import abc
|
||||
import math
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from .reducer import SumReducer
|
||||
|
||||
|
||||
class PositionEncoder(tf.keras.layers.Layer):
|
||||
"""Base class for position encoders."""
|
||||
|
||||
def __init__(self, reducer=None, **kwargs):
|
||||
"""Initializes the position encoder.
|
||||
Args:
|
||||
reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position
|
||||
encodings. Defaults to :class:`opennmt.layers.SumReducer`.
|
||||
**kwargs: Additional layer keyword arguments.
|
||||
"""
|
||||
super(PositionEncoder, self).__init__(**kwargs)
|
||||
if reducer is None:
|
||||
reducer = SumReducer(dtype=kwargs.get('dtype'))
|
||||
self.reducer = reducer
|
||||
|
||||
def call(self, inputs, position=None): # pylint: disable=arguments-differ
|
||||
"""Add position encodings to :obj:`inputs`.
|
||||
Args:
|
||||
inputs: The inputs to encode.
|
||||
position: The single position to encode, to use when this layer is called
|
||||
step by step.
|
||||
Returns:
|
||||
A ``tf.Tensor`` whose shape depends on the configured ``reducer``.
|
||||
"""
|
||||
batch_size = tf.shape(inputs)[0]
|
||||
timesteps = tf.shape(inputs)[1]
|
||||
input_dim = inputs.shape[-1].value
|
||||
positions = tf.range(timesteps) + 1 if position is None else [position]
|
||||
position_encoding = self._encode([positions], input_dim)
|
||||
position_encoding = tf.tile(position_encoding, [batch_size, 1, 1])
|
||||
return self.reducer([inputs, position_encoding])
|
||||
|
||||
@abc.abstractmethod
|
||||
def _encode(self, positions, depth):
|
||||
"""Creates position encodings.
|
||||
Args:
|
||||
positions: The positions to encode of shape :math:`[B, ...]`.
|
||||
depth: The encoding depth :math:`D`.
|
||||
Returns:
|
||||
A ``tf.Tensor`` of shape :math:`[B, ..., D]`.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
class PositionEmbedder(PositionEncoder):
|
||||
"""Encodes position with a lookup table."""
|
||||
|
||||
def __init__(self, maximum_position=128, reducer=None, **kwargs):
|
||||
"""Initializes the position encoder.
|
||||
Args:
|
||||
maximum_position: The maximum position to embed. Positions greater
|
||||
than this value will be set to :obj:`maximum_position`.
|
||||
reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position
|
||||
encodings. Defaults to :class:`opennmt.layers.SumReducer`.
|
||||
**kwargs: Additional layer keyword arguments.
|
||||
"""
|
||||
super(PositionEmbedder, self).__init__(reducer=reducer, **kwargs)
|
||||
self.maximum_position = maximum_position
|
||||
self.embedding = None
|
||||
|
||||
def build(self, input_shape):
|
||||
shape = [self.maximum_position + 1, input_shape[-1]]
|
||||
self.embedding = self.add_weight('position_embedding', shape)
|
||||
super(PositionEmbedder, self).build(input_shape)
|
||||
|
||||
def _encode(self, positions, depth):
|
||||
positions = tf.minimum(positions, self.maximum_position)
|
||||
return tf.nn.embedding_lookup(self.embedding, positions)
|
||||
|
||||
|
||||
class SinusoidalPositionEncoder(PositionEncoder):
|
||||
"""Encodes positions with sine waves as described in
|
||||
https://arxiv.org/abs/1706.03762.
|
||||
"""
|
||||
|
||||
def _encode(self, positions, depth):
|
||||
if depth % 2 != 0:
|
||||
raise ValueError(
|
||||
'SinusoidalPositionEncoder expects the depth to be divisible '
|
||||
'by 2 but got %d' % depth)
|
||||
|
||||
batch_size = tf.shape(positions)[0]
|
||||
positions = tf.cast(positions, tf.float32)
|
||||
|
||||
log_timescale_increment = math.log(10000) / (depth / 2 - 1)
|
||||
inv_timescales = tf.exp(
|
||||
tf.range(depth / 2, dtype=tf.float32) * -log_timescale_increment)
|
||||
inv_timescales = tf.reshape(
|
||||
tf.tile(inv_timescales, [batch_size]), [batch_size, depth // 2])
|
||||
scaled_time = tf.expand_dims(positions, -1) * tf.expand_dims(
|
||||
inv_timescales, 1)
|
||||
encoding = tf.concat(
|
||||
[tf.sin(scaled_time), tf.cos(scaled_time)], axis=2)
|
||||
return tf.cast(encoding, self.dtype)
|
||||
|
||||
|
||||
class SinusodalPositionalEncoding(tf.keras.layers.Layer):
|
||||
|
||||
def __init__(self, name='SinusodalPositionalEncoding'):
|
||||
super(SinusodalPositionalEncoding, self).__init__(name=name)
|
||||
|
||||
@staticmethod
|
||||
def positional_encoding(len, dim, step=1.):
|
||||
"""
|
||||
:param len: int scalar
|
||||
:param dim: int scalar
|
||||
:param step:
|
||||
:return: position embedding
|
||||
"""
|
||||
pos_mat = tf.tile(
|
||||
tf.expand_dims(
|
||||
tf.range(0, tf.cast(len, dtype=tf.float32), dtype=tf.float32)
|
||||
* step,
|
||||
axis=-1), [1, dim])
|
||||
dim_mat = tf.tile(
|
||||
tf.expand_dims(
|
||||
tf.range(0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32),
|
||||
axis=0), [len, 1])
|
||||
dim_mat_int = tf.cast(dim_mat, dtype=tf.int32)
|
||||
pos_encoding = tf.where( # [time, dims]
|
||||
tf.math.equal(tf.math.mod(dim_mat_int, 2), 0),
|
||||
x=tf.math.sin(
|
||||
pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))),
|
||||
y=tf.math.cos(pos_mat
|
||||
/ tf.pow(10000.,
|
||||
(dim_mat - 1) / tf.cast(dim, tf.float32))))
|
||||
return pos_encoding
|
||||
|
||||
|
||||
class BatchSinusodalPositionalEncoding(tf.keras.layers.Layer):
|
||||
|
||||
def __init__(self, name='BatchSinusodalPositionalEncoding'):
|
||||
super(BatchSinusodalPositionalEncoding, self).__init__(name=name)
|
||||
|
||||
@staticmethod
|
||||
def positional_encoding(batch_size, len, dim, pos_mat, step=1.):
|
||||
"""
|
||||
:param len: int scalar
|
||||
:param dim: int scalar
|
||||
:param step:
|
||||
:param pos_mat: [B, len] = [len, 1] * dim
|
||||
:return: position embedding
|
||||
"""
|
||||
pos_mat = tf.tile(
|
||||
tf.expand_dims(tf.cast(pos_mat, dtype=tf.float32) * step, axis=-1),
|
||||
[1, 1, dim]) # [B, len, dim]
|
||||
|
||||
dim_mat = tf.tile(
|
||||
tf.expand_dims(
|
||||
tf.expand_dims(
|
||||
tf.range(
|
||||
0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32),
|
||||
axis=0),
|
||||
axis=0), [batch_size, len, 1]) # [B, len, dim]
|
||||
|
||||
dim_mat_int = tf.cast(dim_mat, dtype=tf.int32)
|
||||
pos_encoding = tf.where( # [B, time, dims]
|
||||
tf.math.equal(tf.mod(dim_mat_int, 2), 0),
|
||||
x=tf.math.sin(
|
||||
pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))),
|
||||
y=tf.math.cos(pos_mat
|
||||
/ tf.pow(10000.,
|
||||
(dim_mat - 1) / tf.cast(dim, tf.float32))))
|
||||
return pos_encoding
|
||||
@@ -1,155 +0,0 @@
|
||||
"""Define reducers: objects that merge inputs."""
|
||||
|
||||
import abc
|
||||
import functools
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
def pad_in_time(x, padding_length):
|
||||
"""Helper function to pad a tensor in the time dimension and retain the static depth dimension."""
|
||||
return tf.pad(x, [[0, 0], [0, padding_length], [0, 0]])
|
||||
|
||||
|
||||
def align_in_time(x, length):
|
||||
"""Aligns the time dimension of :obj:`x` with :obj:`length`."""
|
||||
time_dim = tf.shape(x)[1]
|
||||
return tf.cond(
|
||||
tf.less(time_dim, length),
|
||||
true_fn=lambda: pad_in_time(x, length - time_dim),
|
||||
false_fn=lambda: x[:, :length])
|
||||
|
||||
|
||||
def pad_with_identity(x,
|
||||
sequence_length,
|
||||
max_sequence_length,
|
||||
identity_values=0,
|
||||
maxlen=None):
|
||||
"""Pads a tensor with identity values up to :obj:`max_sequence_length`.
|
||||
Args:
|
||||
x: A ``tf.Tensor`` of shape ``[batch_size, time, depth]``.
|
||||
sequence_length: The true sequence length of :obj:`x`.
|
||||
max_sequence_length: The sequence length up to which the tensor must contain
|
||||
:obj:`identity values`.
|
||||
identity_values: The identity value.
|
||||
maxlen: Size of the output time dimension. Default is the maximum value in
|
||||
obj:`max_sequence_length`.
|
||||
Returns:
|
||||
A ``tf.Tensor`` of shape ``[batch_size, maxlen, depth]``.
|
||||
"""
|
||||
if maxlen is None:
|
||||
maxlen = tf.reduce_max(max_sequence_length)
|
||||
|
||||
mask = tf.sequence_mask(sequence_length, maxlen=maxlen, dtype=x.dtype)
|
||||
mask = tf.expand_dims(mask, axis=-1)
|
||||
mask_combined = tf.sequence_mask(
|
||||
max_sequence_length, maxlen=maxlen, dtype=x.dtype)
|
||||
mask_combined = tf.expand_dims(mask_combined, axis=-1)
|
||||
|
||||
identity_mask = mask_combined * (1.0 - mask)
|
||||
|
||||
x = pad_in_time(x, maxlen - tf.shape(x)[1])
|
||||
x = x * mask + (identity_mask * identity_values)
|
||||
|
||||
return x
|
||||
|
||||
|
||||
def pad_n_with_identity(inputs, sequence_lengths, identity_values=0):
|
||||
"""Pads each input tensors with identity values up to
|
||||
``max(sequence_lengths)`` for each batch.
|
||||
Args:
|
||||
inputs: A list of ``tf.Tensor``.
|
||||
sequence_lengths: A list of sequence length.
|
||||
identity_values: The identity value.
|
||||
Returns:
|
||||
A tuple ``(padded, max_sequence_length)`` which are respectively a list of
|
||||
``tf.Tensor`` where each tensor are padded with identity and the combined
|
||||
sequence length.
|
||||
"""
|
||||
max_sequence_length = tf.reduce_max(sequence_lengths, axis=0)
|
||||
maxlen = tf.reduce_max([tf.shape(x)[1] for x in inputs])
|
||||
padded = [
|
||||
pad_with_identity(
|
||||
x,
|
||||
length,
|
||||
max_sequence_length,
|
||||
identity_values=identity_values,
|
||||
maxlen=maxlen) for x, length in zip(inputs, sequence_lengths)
|
||||
]
|
||||
return padded, max_sequence_length
|
||||
|
||||
|
||||
class Reducer(tf.keras.layers.Layer):
|
||||
"""Base class for reducers."""
|
||||
|
||||
def zip_and_reduce(self, x, y):
|
||||
"""Zips the :obj:`x` with :obj:`y` structures together and reduces all
|
||||
elements. If the structures are nested, they will be flattened first.
|
||||
Args:
|
||||
x: The first structure.
|
||||
y: The second structure.
|
||||
Returns:
|
||||
The same structure as :obj:`x` and :obj:`y` where each element from
|
||||
:obj:`x` is reduced with the correspond element from :obj:`y`.
|
||||
Raises:
|
||||
ValueError: if the two structures are not the same.
|
||||
"""
|
||||
tf.nest.assert_same_structure(x, y)
|
||||
x_flat = tf.nest.flatten(x)
|
||||
y_flat = tf.nest.flatten(y)
|
||||
reduced = list(map(self, zip(x_flat, y_flat)))
|
||||
return tf.nest.pack_sequence_as(x, reduced)
|
||||
|
||||
def call(self, inputs, sequence_length=None): # pylint: disable=arguments-differ
|
||||
"""Reduces all input elements.
|
||||
Args:
|
||||
inputs: A list of ``tf.Tensor``.
|
||||
sequence_length: The length of each input, if reducing sequences.
|
||||
Returns:
|
||||
If :obj:`sequence_length` is set, a tuple
|
||||
``(reduced_input, reduced_length)``, otherwise a reduced ``tf.Tensor``
|
||||
only.
|
||||
"""
|
||||
if sequence_length is None:
|
||||
return self.reduce(inputs)
|
||||
else:
|
||||
return self.reduce_sequence(
|
||||
inputs, sequence_lengths=sequence_length)
|
||||
|
||||
@abc.abstractmethod
|
||||
def reduce(self, inputs):
|
||||
"""See :meth:`opennmt.layers.Reducer.__call__`."""
|
||||
raise NotImplementedError()
|
||||
|
||||
@abc.abstractmethod
|
||||
def reduce_sequence(self, inputs, sequence_lengths):
|
||||
"""See :meth:`opennmt.layers.Reducer.__call__`."""
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
class SumReducer(Reducer):
|
||||
"""A reducer that sums the inputs."""
|
||||
|
||||
def reduce(self, inputs):
|
||||
if len(inputs) == 1:
|
||||
return inputs[0]
|
||||
if len(inputs) == 2:
|
||||
return inputs[0] + inputs[1]
|
||||
return tf.add_n(inputs)
|
||||
|
||||
def reduce_sequence(self, inputs, sequence_lengths):
|
||||
padded, combined_length = pad_n_with_identity(
|
||||
inputs, sequence_lengths, identity_values=0)
|
||||
return self.reduce(padded), combined_length
|
||||
|
||||
|
||||
class MultiplyReducer(Reducer):
|
||||
"""A reducer that multiplies the inputs."""
|
||||
|
||||
def reduce(self, inputs):
|
||||
return functools.reduce(lambda a, x: a * x, inputs)
|
||||
|
||||
def reduce_sequence(self, inputs, sequence_lengths):
|
||||
padded, combined_length = pad_n_with_identity(
|
||||
inputs, sequence_lengths, identity_values=1)
|
||||
return self.reduce(padded), combined_length
|
||||
@@ -1,237 +0,0 @@
|
||||
import tensorflow as tf
|
||||
from tensorflow.python.ops import rnn_cell_impl
|
||||
|
||||
from .am_models import prenet
|
||||
|
||||
|
||||
class VarPredictorCell(tf.contrib.rnn.RNNCell):
|
||||
"""Wrapper wrapper knock knock."""
|
||||
|
||||
def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
|
||||
super(VarPredictorCell, self).__init__()
|
||||
self._var_predictor_cell = var_predictor_cell
|
||||
self._is_training = is_training
|
||||
self._dim = dim
|
||||
self._prenet_units = prenet_units
|
||||
|
||||
@property
|
||||
def state_size(self):
|
||||
return tuple([self.output_size, self._var_predictor_cell.state_size])
|
||||
|
||||
@property
|
||||
def output_size(self):
|
||||
return self._dim
|
||||
|
||||
def zero_state(self, batch_size, dtype):
|
||||
return tuple([
|
||||
rnn_cell_impl._zero_state_tensors(self.output_size, batch_size,
|
||||
dtype),
|
||||
self._var_predictor_cell.zero_state(batch_size, dtype)
|
||||
])
|
||||
|
||||
def call(self, inputs, state):
|
||||
"""Run the Tacotron2 super decoder cell."""
|
||||
super_cell_out, decoder_state = state
|
||||
|
||||
# split
|
||||
prenet_input = inputs[:, 0:self._dim]
|
||||
encoder_output = inputs[:, self._dim:]
|
||||
|
||||
# prenet and concat
|
||||
prenet_output = prenet(
|
||||
prenet_input,
|
||||
self._prenet_units,
|
||||
self._is_training,
|
||||
scope='var_prenet')
|
||||
decoder_input = tf.concat([prenet_output, encoder_output], axis=-1)
|
||||
|
||||
# decoder LSTM/GRU
|
||||
new_super_cell_out, new_decoder_state = self._var_predictor_cell(
|
||||
decoder_input, decoder_state)
|
||||
|
||||
# projection
|
||||
new_super_cell_out = tf.layers.dense(
|
||||
new_super_cell_out, units=self._dim)
|
||||
|
||||
new_states = tuple([new_super_cell_out, new_decoder_state])
|
||||
|
||||
return new_super_cell_out, new_states
|
||||
|
||||
|
||||
class DurPredictorCell(tf.contrib.rnn.RNNCell):
|
||||
"""Wrapper wrapper knock knock."""
|
||||
|
||||
def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
|
||||
super(DurPredictorCell, self).__init__()
|
||||
self._var_predictor_cell = var_predictor_cell
|
||||
self._is_training = is_training
|
||||
self._dim = dim
|
||||
self._prenet_units = prenet_units
|
||||
|
||||
@property
|
||||
def state_size(self):
|
||||
return tuple([self.output_size, self._var_predictor_cell.state_size])
|
||||
|
||||
@property
|
||||
def output_size(self):
|
||||
return self._dim
|
||||
|
||||
def zero_state(self, batch_size, dtype):
|
||||
return tuple([
|
||||
rnn_cell_impl._zero_state_tensors(self.output_size, batch_size,
|
||||
dtype),
|
||||
self._var_predictor_cell.zero_state(batch_size, dtype)
|
||||
])
|
||||
|
||||
def call(self, inputs, state):
|
||||
"""Run the Tacotron2 super decoder cell."""
|
||||
super_cell_out, decoder_state = state
|
||||
|
||||
# split
|
||||
prenet_input = inputs[:, 0:self._dim]
|
||||
encoder_output = inputs[:, self._dim:]
|
||||
|
||||
# prenet and concat
|
||||
prenet_output = prenet(
|
||||
prenet_input,
|
||||
self._prenet_units,
|
||||
self._is_training,
|
||||
scope='dur_prenet')
|
||||
decoder_input = tf.concat([prenet_output, encoder_output], axis=-1)
|
||||
|
||||
# decoder LSTM/GRU
|
||||
new_super_cell_out, new_decoder_state = self._var_predictor_cell(
|
||||
decoder_input, decoder_state)
|
||||
|
||||
# projection
|
||||
new_super_cell_out = tf.layers.dense(
|
||||
new_super_cell_out, units=self._dim)
|
||||
new_super_cell_out = tf.nn.relu(new_super_cell_out)
|
||||
# new_super_cell_out = tf.log(tf.cast(tf.round(tf.exp(new_super_cell_out) - 1), tf.float32) + 1)
|
||||
|
||||
new_states = tuple([new_super_cell_out, new_decoder_state])
|
||||
|
||||
return new_super_cell_out, new_states
|
||||
|
||||
|
||||
class DurPredictorCECell(tf.contrib.rnn.RNNCell):
|
||||
"""Wrapper wrapper knock knock."""
|
||||
|
||||
def __init__(self, var_predictor_cell, is_training, dim, prenet_units,
|
||||
max_dur, dur_embedding_dim):
|
||||
super(DurPredictorCECell, self).__init__()
|
||||
self._var_predictor_cell = var_predictor_cell
|
||||
self._is_training = is_training
|
||||
self._dim = dim
|
||||
self._prenet_units = prenet_units
|
||||
self._max_dur = max_dur
|
||||
self._dur_embedding_dim = dur_embedding_dim
|
||||
|
||||
@property
|
||||
def state_size(self):
|
||||
return tuple([self.output_size, self._var_predictor_cell.state_size])
|
||||
|
||||
@property
|
||||
def output_size(self):
|
||||
return self._max_dur
|
||||
|
||||
def zero_state(self, batch_size, dtype):
|
||||
return tuple([
|
||||
rnn_cell_impl._zero_state_tensors(self.output_size, batch_size,
|
||||
dtype),
|
||||
self._var_predictor_cell.zero_state(batch_size, dtype)
|
||||
])
|
||||
|
||||
def call(self, inputs, state):
|
||||
"""Run the Tacotron2 super decoder cell."""
|
||||
super_cell_out, decoder_state = state
|
||||
|
||||
# split
|
||||
prenet_input = tf.squeeze(
|
||||
tf.cast(inputs[:, 0:self._dim], tf.int32), axis=-1) # [N]
|
||||
prenet_input = tf.one_hot(
|
||||
prenet_input, self._max_dur, on_value=1.0, off_value=0.0,
|
||||
axis=-1) # [N, 120]
|
||||
prenet_input = tf.layers.dense(
|
||||
prenet_input, units=self._dur_embedding_dim)
|
||||
encoder_output = inputs[:, self._dim:]
|
||||
|
||||
# prenet and concat
|
||||
prenet_output = prenet(
|
||||
prenet_input,
|
||||
self._prenet_units,
|
||||
self._is_training,
|
||||
scope='dur_prenet')
|
||||
decoder_input = tf.concat([prenet_output, encoder_output], axis=-1)
|
||||
|
||||
# decoder LSTM/GRU
|
||||
new_super_cell_out, new_decoder_state = self._var_predictor_cell(
|
||||
decoder_input, decoder_state)
|
||||
|
||||
# projection
|
||||
new_super_cell_out = tf.layers.dense(
|
||||
new_super_cell_out, units=self._max_dur) # [N, 120]
|
||||
new_super_cell_out = tf.nn.softmax(new_super_cell_out) # [N, 120]
|
||||
|
||||
new_states = tuple([new_super_cell_out, new_decoder_state])
|
||||
|
||||
return new_super_cell_out, new_states
|
||||
|
||||
|
||||
class VarPredictorCell2(tf.contrib.rnn.RNNCell):
|
||||
"""Wrapper wrapper knock knock."""
|
||||
|
||||
def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
|
||||
super(VarPredictorCell2, self).__init__()
|
||||
self._var_predictor_cell = var_predictor_cell
|
||||
self._is_training = is_training
|
||||
self._dim = dim
|
||||
self._prenet_units = prenet_units
|
||||
|
||||
@property
|
||||
def state_size(self):
|
||||
return tuple([self.output_size, self._var_predictor_cell.state_size])
|
||||
|
||||
@property
|
||||
def output_size(self):
|
||||
return self._dim
|
||||
|
||||
def zero_state(self, batch_size, dtype):
|
||||
return tuple([
|
||||
rnn_cell_impl._zero_state_tensors(self.output_size, batch_size,
|
||||
dtype),
|
||||
self._var_predictor_cell.zero_state(batch_size, dtype)
|
||||
])
|
||||
|
||||
def call(self, inputs, state):
|
||||
'''Run the Tacotron2 super decoder cell.'''
|
||||
super_cell_out, decoder_state = state
|
||||
|
||||
# split
|
||||
prenet_input = inputs[:, 0:self._dim]
|
||||
encoder_output = inputs[:, self._dim:]
|
||||
|
||||
# prenet and concat
|
||||
prenet_output = prenet(
|
||||
prenet_input,
|
||||
self._prenet_units,
|
||||
self._is_training,
|
||||
scope='var_prenet')
|
||||
decoder_input = tf.concat([prenet_output, encoder_output], axis=-1)
|
||||
|
||||
# decoder LSTM/GRU
|
||||
new_super_cell_out, new_decoder_state = self._var_predictor_cell(
|
||||
decoder_input, decoder_state)
|
||||
|
||||
# projection
|
||||
new_super_cell_out = tf.layers.dense(
|
||||
new_super_cell_out, units=self._dim)
|
||||
|
||||
# split and relu
|
||||
new_super_cell_out = tf.concat([
|
||||
tf.nn.relu(new_super_cell_out[:, 0:1]), new_super_cell_out[:, 1:]
|
||||
], axis=-1) # yapf:disable
|
||||
|
||||
new_states = tuple([new_super_cell_out, new_decoder_state])
|
||||
|
||||
return new_super_cell_out, new_states
|
||||
@@ -1,760 +0,0 @@
|
||||
import tensorflow as tf
|
||||
from tensorflow.python.ops.ragged.ragged_util import repeat
|
||||
|
||||
from .fsmn_encoder import FsmnEncoderV2
|
||||
from .position import BatchSinusodalPositionalEncoding
|
||||
from .self_attention_decoder import SelfAttentionDecoder
|
||||
from .self_attention_encoder import SelfAttentionEncoder
|
||||
|
||||
|
||||
class RobuTrans():
|
||||
|
||||
def __init__(self, hparams):
|
||||
self._hparams = hparams
|
||||
|
||||
def initialize(self,
|
||||
inputs,
|
||||
inputs_emotion,
|
||||
inputs_speaker,
|
||||
input_lengths,
|
||||
output_lengths=None,
|
||||
mel_targets=None,
|
||||
durations=None,
|
||||
pitch_contours=None,
|
||||
uv_masks=None,
|
||||
pitch_scales=None,
|
||||
duration_scales=None,
|
||||
energy_contours=None,
|
||||
energy_scales=None):
|
||||
"""Initializes the model for inference.
|
||||
|
||||
Sets "mel_outputs", "linear_outputs", "stop_token_outputs", and "alignments" fields.
|
||||
|
||||
Args:
|
||||
inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
|
||||
steps in the input time series, and values are character IDs
|
||||
input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
|
||||
of each sequence in inputs.
|
||||
output_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
|
||||
of each sequence in outputs.
|
||||
mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
|
||||
of steps in the output time series, M is num_mels, and values are entries in the mel
|
||||
spectrogram. Only needed for training.
|
||||
"""
|
||||
from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell
|
||||
from tensorflow.contrib.seq2seq import BasicDecoder
|
||||
|
||||
with tf.variable_scope('inference') as _:
|
||||
is_training = mel_targets is not None
|
||||
batch_size = tf.shape(inputs)[0]
|
||||
hp = self._hparams
|
||||
|
||||
input_mask = None
|
||||
if input_lengths is not None and is_training:
|
||||
input_mask = tf.sequence_mask(
|
||||
input_lengths, tf.shape(inputs)[1], dtype=tf.float32)
|
||||
|
||||
if input_mask is not None:
|
||||
inputs = inputs * tf.expand_dims(input_mask, -1)
|
||||
|
||||
# speaker embedding
|
||||
embedded_inputs_speaker = tf.layers.dense(
|
||||
inputs_speaker,
|
||||
32,
|
||||
activation=None,
|
||||
use_bias=False,
|
||||
kernel_initializer=tf.truncated_normal_initializer(stddev=0.5))
|
||||
|
||||
# emotion embedding
|
||||
embedded_inputs_emotion = tf.layers.dense(
|
||||
inputs_emotion,
|
||||
32,
|
||||
activation=None,
|
||||
use_bias=False,
|
||||
kernel_initializer=tf.truncated_normal_initializer(stddev=0.5))
|
||||
|
||||
# symbol embedding
|
||||
with tf.variable_scope('Embedding'):
|
||||
embedded_inputs = tf.layers.dense(
|
||||
inputs,
|
||||
hp.embedding_dim,
|
||||
activation=None,
|
||||
use_bias=False,
|
||||
kernel_initializer=tf.truncated_normal_initializer(
|
||||
stddev=0.5))
|
||||
|
||||
# Encoder
|
||||
with tf.variable_scope('Encoder'):
|
||||
Encoder = SelfAttentionEncoder(
|
||||
num_layers=hp.encoder_num_layers,
|
||||
num_units=hp.encoder_num_units,
|
||||
num_heads=hp.encoder_num_heads,
|
||||
ffn_inner_dim=hp.encoder_ffn_inner_dim,
|
||||
dropout=hp.encoder_dropout,
|
||||
attention_dropout=hp.encoder_attention_dropout,
|
||||
relu_dropout=hp.encoder_relu_dropout)
|
||||
encoder_outputs, state_mo, sequence_length_mo, attns = Encoder.encode(
|
||||
embedded_inputs,
|
||||
sequence_length=input_lengths,
|
||||
mode=is_training)
|
||||
encoder_outputs = tf.layers.dense(
|
||||
encoder_outputs,
|
||||
hp.encoder_projection_units,
|
||||
activation=None,
|
||||
use_bias=False,
|
||||
kernel_initializer=tf.truncated_normal_initializer(
|
||||
stddev=0.5))
|
||||
|
||||
# pitch and energy
|
||||
var_inputs = tf.concat([
|
||||
encoder_outputs, embedded_inputs_speaker,
|
||||
embedded_inputs_emotion
|
||||
], 2)
|
||||
if input_mask is not None:
|
||||
var_inputs = var_inputs * tf.expand_dims(input_mask, -1)
|
||||
|
||||
with tf.variable_scope('Pitch_Predictor'):
|
||||
Pitch_Predictor_FSMN = FsmnEncoderV2(
|
||||
filter_size=hp.predictor_filter_size,
|
||||
fsmn_num_layers=hp.predictor_fsmn_num_layers,
|
||||
dnn_num_layers=hp.predictor_dnn_num_layers,
|
||||
num_memory_units=hp.predictor_num_memory_units,
|
||||
ffn_inner_dim=hp.predictor_ffn_inner_dim,
|
||||
dropout=hp.predictor_dropout,
|
||||
shift=hp.predictor_shift,
|
||||
position_encoder=None)
|
||||
pitch_contour_outputs, _, _ = Pitch_Predictor_FSMN.encode(
|
||||
tf.concat([
|
||||
encoder_outputs, embedded_inputs_speaker,
|
||||
embedded_inputs_emotion
|
||||
], 2),
|
||||
sequence_length=input_lengths,
|
||||
mode=is_training)
|
||||
pitch_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
|
||||
LSTMBlockCell(hp.predictor_lstm_units),
|
||||
LSTMBlockCell(hp.predictor_lstm_units),
|
||||
pitch_contour_outputs,
|
||||
sequence_length=input_lengths,
|
||||
dtype=tf.float32)
|
||||
pitch_contour_outputs = tf.concat(
|
||||
pitch_contour_outputs, axis=-1)
|
||||
pitch_contour_outputs = tf.layers.dense(
|
||||
pitch_contour_outputs, units=1) # [N, T_in, 1]
|
||||
pitch_contour_outputs = tf.squeeze(
|
||||
pitch_contour_outputs, axis=2) # [N, T_in]
|
||||
|
||||
with tf.variable_scope('Energy_Predictor'):
|
||||
Energy_Predictor_FSMN = FsmnEncoderV2(
|
||||
filter_size=hp.predictor_filter_size,
|
||||
fsmn_num_layers=hp.predictor_fsmn_num_layers,
|
||||
dnn_num_layers=hp.predictor_dnn_num_layers,
|
||||
num_memory_units=hp.predictor_num_memory_units,
|
||||
ffn_inner_dim=hp.predictor_ffn_inner_dim,
|
||||
dropout=hp.predictor_dropout,
|
||||
shift=hp.predictor_shift,
|
||||
position_encoder=None)
|
||||
energy_contour_outputs, _, _ = Energy_Predictor_FSMN.encode(
|
||||
tf.concat([
|
||||
encoder_outputs, embedded_inputs_speaker,
|
||||
embedded_inputs_emotion
|
||||
], 2),
|
||||
sequence_length=input_lengths,
|
||||
mode=is_training)
|
||||
energy_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
|
||||
LSTMBlockCell(hp.predictor_lstm_units),
|
||||
LSTMBlockCell(hp.predictor_lstm_units),
|
||||
energy_contour_outputs,
|
||||
sequence_length=input_lengths,
|
||||
dtype=tf.float32)
|
||||
energy_contour_outputs = tf.concat(
|
||||
energy_contour_outputs, axis=-1)
|
||||
energy_contour_outputs = tf.layers.dense(
|
||||
energy_contour_outputs, units=1) # [N, T_in, 1]
|
||||
energy_contour_outputs = tf.squeeze(
|
||||
energy_contour_outputs, axis=2) # [N, T_in]
|
||||
|
||||
if is_training:
|
||||
pitch_embeddings = tf.expand_dims(
|
||||
pitch_contours, axis=2) # [N, T_in, 1]
|
||||
pitch_embeddings = tf.layers.conv1d(
|
||||
pitch_embeddings,
|
||||
filters=hp.encoder_projection_units,
|
||||
kernel_size=9,
|
||||
padding='same',
|
||||
name='pitch_embeddings') # [N, T_in, 32]
|
||||
|
||||
energy_embeddings = tf.expand_dims(
|
||||
energy_contours, axis=2) # [N, T_in, 1]
|
||||
energy_embeddings = tf.layers.conv1d(
|
||||
energy_embeddings,
|
||||
filters=hp.encoder_projection_units,
|
||||
kernel_size=9,
|
||||
padding='same',
|
||||
name='energy_embeddings') # [N, T_in, 32]
|
||||
else:
|
||||
pitch_contour_outputs *= pitch_scales
|
||||
pitch_embeddings = tf.expand_dims(
|
||||
pitch_contour_outputs, axis=2) # [N, T_in, 1]
|
||||
pitch_embeddings = tf.layers.conv1d(
|
||||
pitch_embeddings,
|
||||
filters=hp.encoder_projection_units,
|
||||
kernel_size=9,
|
||||
padding='same',
|
||||
name='pitch_embeddings') # [N, T_in, 32]
|
||||
|
||||
energy_contour_outputs *= energy_scales
|
||||
energy_embeddings = tf.expand_dims(
|
||||
energy_contour_outputs, axis=2) # [N, T_in, 1]
|
||||
energy_embeddings = tf.layers.conv1d(
|
||||
energy_embeddings,
|
||||
filters=hp.encoder_projection_units,
|
||||
kernel_size=9,
|
||||
padding='same',
|
||||
name='energy_embeddings') # [N, T_in, 32]
|
||||
|
||||
encoder_outputs_ = encoder_outputs + pitch_embeddings + energy_embeddings
|
||||
|
||||
# duration
|
||||
dur_inputs = tf.concat([
|
||||
encoder_outputs_, embedded_inputs_speaker,
|
||||
embedded_inputs_emotion
|
||||
], 2)
|
||||
if input_mask is not None:
|
||||
dur_inputs = dur_inputs * tf.expand_dims(input_mask, -1)
|
||||
with tf.variable_scope('Duration_Predictor'):
|
||||
duration_predictor_cell = MultiRNNCell([
|
||||
LSTMBlockCell(hp.predictor_lstm_units),
|
||||
LSTMBlockCell(hp.predictor_lstm_units)
|
||||
], state_is_tuple=True) # yapf:disable
|
||||
from .rnn_wrappers import DurPredictorCell
|
||||
duration_output_cell = DurPredictorCell(
|
||||
duration_predictor_cell, is_training, 1,
|
||||
hp.predictor_prenet_units)
|
||||
duration_predictor_init_state = duration_output_cell.zero_state(
|
||||
batch_size=batch_size, dtype=tf.float32)
|
||||
if is_training:
|
||||
from .helpers import VarTrainingHelper
|
||||
duration_helper = VarTrainingHelper(
|
||||
tf.expand_dims(
|
||||
tf.log(tf.cast(durations, tf.float32) + 1),
|
||||
axis=2), dur_inputs, 1)
|
||||
else:
|
||||
from .helpers import VarTestHelper
|
||||
duration_helper = VarTestHelper(batch_size, dur_inputs, 1)
|
||||
(
|
||||
duration_outputs, _
|
||||
), final_duration_predictor_state, _ = tf.contrib.seq2seq.dynamic_decode(
|
||||
BasicDecoder(duration_output_cell, duration_helper,
|
||||
duration_predictor_init_state),
|
||||
maximum_iterations=1000)
|
||||
duration_outputs = tf.squeeze(
|
||||
duration_outputs, axis=2) # [N, T_in]
|
||||
if input_mask is not None:
|
||||
duration_outputs = duration_outputs * input_mask
|
||||
duration_outputs_ = tf.exp(duration_outputs) - 1
|
||||
|
||||
# Length Regulator
|
||||
with tf.variable_scope('Length_Regulator'):
|
||||
if is_training:
|
||||
i = tf.constant(1)
|
||||
# position embedding
|
||||
j = tf.constant(1)
|
||||
dur_len = tf.shape(durations)[-1]
|
||||
embedded_position_i = tf.range(1, durations[0, 0] + 1)
|
||||
|
||||
def condition_pos(j, e):
|
||||
return tf.less(j, dur_len)
|
||||
|
||||
def loop_body_pos(j, embedded_position_i):
|
||||
embedded_position_i = tf.concat([
|
||||
embedded_position_i,
|
||||
tf.range(1, durations[0, j] + 1)
|
||||
], axis=0) # yapf:disable
|
||||
return [j + 1, embedded_position_i]
|
||||
|
||||
j, embedded_position_i = tf.while_loop(
|
||||
condition_pos,
|
||||
loop_body_pos, [j, embedded_position_i],
|
||||
shape_invariants=[
|
||||
j.get_shape(),
|
||||
tf.TensorShape([None])
|
||||
])
|
||||
embedded_position = tf.reshape(embedded_position_i,
|
||||
(1, -1))
|
||||
|
||||
# others
|
||||
LR_outputs = repeat(
|
||||
encoder_outputs_[0:1, :, :], durations[0, :], axis=1)
|
||||
embedded_outputs_speaker = repeat(
|
||||
embedded_inputs_speaker[0:1, :, :],
|
||||
durations[0, :],
|
||||
axis=1)
|
||||
embedded_outputs_emotion = repeat(
|
||||
embedded_inputs_emotion[0:1, :, :],
|
||||
durations[0, :],
|
||||
axis=1)
|
||||
|
||||
def condition(i, pos, layer, s, e):
|
||||
return tf.less(i, tf.shape(mel_targets)[0])
|
||||
|
||||
def loop_body(i, embedded_position, LR_outputs,
|
||||
embedded_outputs_speaker,
|
||||
embedded_outputs_emotion):
|
||||
# position embedding
|
||||
jj = tf.constant(1)
|
||||
embedded_position_i = tf.range(1, durations[i, 0] + 1)
|
||||
|
||||
def condition_pos_i(j, e):
|
||||
return tf.less(j, dur_len)
|
||||
|
||||
def loop_body_pos_i(j, embedded_position_i):
|
||||
embedded_position_i = tf.concat([
|
||||
embedded_position_i,
|
||||
tf.range(1, durations[i, j] + 1)
|
||||
], axis=0) # yapf:disable
|
||||
return [j + 1, embedded_position_i]
|
||||
|
||||
jj, embedded_position_i = tf.while_loop(
|
||||
condition_pos_i,
|
||||
loop_body_pos_i, [jj, embedded_position_i],
|
||||
shape_invariants=[
|
||||
jj.get_shape(),
|
||||
tf.TensorShape([None])
|
||||
])
|
||||
embedded_position = tf.concat([
|
||||
embedded_position,
|
||||
tf.reshape(embedded_position_i, (1, -1))
|
||||
], 0)
|
||||
|
||||
# others
|
||||
LR_outputs = tf.concat([
|
||||
LR_outputs,
|
||||
repeat(
|
||||
encoder_outputs_[i:i + 1, :, :],
|
||||
durations[i, :],
|
||||
axis=1)
|
||||
], 0)
|
||||
embedded_outputs_speaker = tf.concat([
|
||||
embedded_outputs_speaker,
|
||||
repeat(
|
||||
embedded_inputs_speaker[i:i + 1, :, :],
|
||||
durations[i, :],
|
||||
axis=1)
|
||||
], 0)
|
||||
embedded_outputs_emotion = tf.concat([
|
||||
embedded_outputs_emotion,
|
||||
repeat(
|
||||
embedded_inputs_emotion[i:i + 1, :, :],
|
||||
durations[i, :],
|
||||
axis=1)
|
||||
], 0)
|
||||
return [
|
||||
i + 1, embedded_position, LR_outputs,
|
||||
embedded_outputs_speaker, embedded_outputs_emotion
|
||||
]
|
||||
|
||||
i, embedded_position, LR_outputs,
|
||||
embedded_outputs_speaker,
|
||||
embedded_outputs_emotion = tf.while_loop(
|
||||
condition,
|
||||
loop_body, [
|
||||
i, embedded_position, LR_outputs,
|
||||
embedded_outputs_speaker, embedded_outputs_emotion
|
||||
],
|
||||
shape_invariants=[
|
||||
i.get_shape(),
|
||||
tf.TensorShape([None, None]),
|
||||
tf.TensorShape([None, None, None]),
|
||||
tf.TensorShape([None, None, None]),
|
||||
tf.TensorShape([None, None, None])
|
||||
],
|
||||
parallel_iterations=hp.batch_size)
|
||||
|
||||
ori_framenum = tf.shape(mel_targets)[1]
|
||||
else:
|
||||
# position
|
||||
j = tf.constant(1)
|
||||
dur_len = tf.shape(duration_outputs_)[-1]
|
||||
embedded_position_i = tf.range(
|
||||
1,
|
||||
tf.cast(tf.round(duration_outputs_)[0, 0], tf.int32)
|
||||
+ 1)
|
||||
|
||||
def condition_pos(j, e):
|
||||
return tf.less(j, dur_len)
|
||||
|
||||
def loop_body_pos(j, embedded_position_i):
|
||||
embedded_position_i = tf.concat([
|
||||
embedded_position_i,
|
||||
tf.range(
|
||||
1,
|
||||
tf.cast(
|
||||
tf.round(duration_outputs_)[0, j],
|
||||
tf.int32) + 1)
|
||||
], axis=0) # yapf:disable
|
||||
return [j + 1, embedded_position_i]
|
||||
|
||||
j, embedded_position_i = tf.while_loop(
|
||||
condition_pos,
|
||||
loop_body_pos, [j, embedded_position_i],
|
||||
shape_invariants=[
|
||||
j.get_shape(),
|
||||
tf.TensorShape([None])
|
||||
])
|
||||
embedded_position = tf.reshape(embedded_position_i,
|
||||
(1, -1))
|
||||
# others
|
||||
duration_outputs_ *= duration_scales
|
||||
LR_outputs = repeat(
|
||||
encoder_outputs_[0:1, :, :],
|
||||
tf.cast(tf.round(duration_outputs_)[0, :], tf.int32),
|
||||
axis=1)
|
||||
embedded_outputs_speaker = repeat(
|
||||
embedded_inputs_speaker[0:1, :, :],
|
||||
tf.cast(tf.round(duration_outputs_)[0, :], tf.int32),
|
||||
axis=1)
|
||||
embedded_outputs_emotion = repeat(
|
||||
embedded_inputs_emotion[0:1, :, :],
|
||||
tf.cast(tf.round(duration_outputs_)[0, :], tf.int32),
|
||||
axis=1)
|
||||
ori_framenum = tf.shape(LR_outputs)[1]
|
||||
|
||||
left = hp.outputs_per_step - tf.mod(
|
||||
ori_framenum, hp.outputs_per_step)
|
||||
LR_outputs = tf.cond(
|
||||
tf.equal(left,
|
||||
hp.outputs_per_step), lambda: LR_outputs,
|
||||
lambda: tf.pad(LR_outputs, [[0, 0], [0, left], [0, 0]],
|
||||
'CONSTANT'))
|
||||
embedded_outputs_speaker = tf.cond(
|
||||
tf.equal(left, hp.outputs_per_step),
|
||||
lambda: embedded_outputs_speaker, lambda: tf.pad(
|
||||
embedded_outputs_speaker, [[0, 0], [0, left],
|
||||
[0, 0]], 'CONSTANT'))
|
||||
embedded_outputs_emotion = tf.cond(
|
||||
tf.equal(left, hp.outputs_per_step),
|
||||
lambda: embedded_outputs_emotion, lambda: tf.pad(
|
||||
embedded_outputs_emotion, [[0, 0], [0, left],
|
||||
[0, 0]], 'CONSTANT'))
|
||||
embedded_position = tf.cond(
|
||||
tf.equal(left, hp.outputs_per_step),
|
||||
lambda: embedded_position,
|
||||
lambda: tf.pad(embedded_position, [[0, 0], [0, left]],
|
||||
'CONSTANT'))
|
||||
|
||||
# Pos_Embedding
|
||||
with tf.variable_scope('Position_Embedding'):
|
||||
Pos_Embedding = BatchSinusodalPositionalEncoding()
|
||||
position_embeddings = Pos_Embedding.positional_encoding(
|
||||
batch_size,
|
||||
tf.shape(LR_outputs)[1], hp.encoder_projection_units,
|
||||
embedded_position)
|
||||
LR_outputs += position_embeddings
|
||||
|
||||
# multi-frame
|
||||
LR_outputs = tf.reshape(LR_outputs, [
|
||||
batch_size, -1,
|
||||
hp.outputs_per_step * hp.encoder_projection_units
|
||||
])
|
||||
embedded_outputs_speaker = tf.reshape(
|
||||
embedded_outputs_speaker,
|
||||
[batch_size, -1, hp.outputs_per_step * 32])[:, :, :32]
|
||||
embedded_outputs_emotion = tf.reshape(
|
||||
embedded_outputs_emotion,
|
||||
[batch_size, -1, hp.outputs_per_step * 32])[:, :, :32]
|
||||
# [N, T_out, D_LR_outputs] (D_LR_outputs = hp.outputs_per_step * hp.encoder_projection_units + 64)
|
||||
LR_outputs = tf.concat([
|
||||
LR_outputs, embedded_outputs_speaker, embedded_outputs_emotion
|
||||
], -1)
|
||||
|
||||
# auto bandwidth
|
||||
if is_training:
|
||||
durations_mask = tf.cast(durations,
|
||||
tf.float32) * input_mask # [N, T_in]
|
||||
else:
|
||||
durations_mask = duration_outputs_
|
||||
X_band_width = tf.cast(
|
||||
tf.round(tf.reduce_max(durations_mask) / hp.outputs_per_step),
|
||||
tf.int32)
|
||||
H_band_width = X_band_width
|
||||
|
||||
with tf.variable_scope('Decoder'):
|
||||
Decoder = SelfAttentionDecoder(
|
||||
num_layers=hp.decoder_num_layers,
|
||||
num_units=hp.decoder_num_units,
|
||||
num_heads=hp.decoder_num_heads,
|
||||
ffn_inner_dim=hp.decoder_ffn_inner_dim,
|
||||
dropout=hp.decoder_dropout,
|
||||
attention_dropout=hp.decoder_attention_dropout,
|
||||
relu_dropout=hp.decoder_relu_dropout,
|
||||
prenet_units=hp.prenet_units,
|
||||
dense_units=hp.prenet_proj_units,
|
||||
num_mels=hp.num_mels,
|
||||
outputs_per_step=hp.outputs_per_step,
|
||||
X_band_width=X_band_width,
|
||||
H_band_width=H_band_width,
|
||||
position_encoder=None)
|
||||
if is_training:
|
||||
if hp.free_run:
|
||||
r = hp.outputs_per_step
|
||||
init_decoder_input = tf.expand_dims(
|
||||
tf.tile([[0.0]], [batch_size, hp.num_mels]),
|
||||
axis=1) # [N, 1, hp.num_mels]
|
||||
decoder_input_lengths = tf.cast(
|
||||
output_lengths / r, tf.int32)
|
||||
decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search(
|
||||
init_decoder_input,
|
||||
maximum_iterations=tf.shape(LR_outputs)[1],
|
||||
mode=is_training,
|
||||
memory=LR_outputs,
|
||||
memory_sequence_length=decoder_input_lengths)
|
||||
else:
|
||||
r = hp.outputs_per_step
|
||||
decoder_input = mel_targets[:, r - 1::
|
||||
r, :] # [N, T_out / r, hp.num_mels]
|
||||
init_decoder_input = tf.expand_dims(
|
||||
tf.tile([[0.0]], [batch_size, hp.num_mels]),
|
||||
axis=1) # [N, 1, hp.num_mels]
|
||||
decoder_input = tf.concat(
|
||||
[init_decoder_input, decoder_input],
|
||||
axis=1) # [N, T_out / r + 1, hp.num_mels]
|
||||
decoder_input = decoder_input[:, :
|
||||
-1, :] # [N, T_out / r, hp.num_mels]
|
||||
decoder_input_lengths = tf.cast(
|
||||
output_lengths / r, tf.int32)
|
||||
decoder_outputs, attention_x, attention_h = Decoder.decode_from_inputs(
|
||||
decoder_input,
|
||||
decoder_input_lengths,
|
||||
mode=is_training,
|
||||
memory=LR_outputs,
|
||||
memory_sequence_length=decoder_input_lengths)
|
||||
else:
|
||||
init_decoder_input = tf.expand_dims(
|
||||
tf.tile([[0.0]], [batch_size, hp.num_mels]),
|
||||
axis=1) # [N, 1, hp.num_mels]
|
||||
decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search(
|
||||
init_decoder_input,
|
||||
maximum_iterations=tf.shape(LR_outputs)[1],
|
||||
mode=is_training,
|
||||
memory=LR_outputs,
|
||||
memory_sequence_length=tf.expand_dims(
|
||||
tf.shape(LR_outputs)[1], axis=0))
|
||||
|
||||
if is_training:
|
||||
mel_outputs_ = tf.reshape(decoder_outputs,
|
||||
[batch_size, -1, hp.num_mels])
|
||||
else:
|
||||
mel_outputs_ = tf.reshape(
|
||||
decoder_outputs,
|
||||
[batch_size, -1, hp.num_mels])[:, :ori_framenum, :]
|
||||
mel_outputs = mel_outputs_
|
||||
|
||||
with tf.variable_scope('Postnet'):
|
||||
Postnet_FSMN = FsmnEncoderV2(
|
||||
filter_size=hp.postnet_filter_size,
|
||||
fsmn_num_layers=hp.postnet_fsmn_num_layers,
|
||||
dnn_num_layers=hp.postnet_dnn_num_layers,
|
||||
num_memory_units=hp.postnet_num_memory_units,
|
||||
ffn_inner_dim=hp.postnet_ffn_inner_dim,
|
||||
dropout=hp.postnet_dropout,
|
||||
shift=hp.postnet_shift,
|
||||
position_encoder=None)
|
||||
if is_training:
|
||||
postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode(
|
||||
mel_outputs,
|
||||
sequence_length=output_lengths,
|
||||
mode=is_training)
|
||||
hidden_lstm_outputs, _ = tf.nn.dynamic_rnn(
|
||||
LSTMBlockCell(hp.postnet_lstm_units),
|
||||
postnet_fsmn_outputs,
|
||||
sequence_length=output_lengths,
|
||||
dtype=tf.float32)
|
||||
else:
|
||||
postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode(
|
||||
mel_outputs,
|
||||
sequence_length=[tf.shape(mel_outputs_)[1]],
|
||||
mode=is_training)
|
||||
hidden_lstm_outputs, _ = tf.nn.dynamic_rnn(
|
||||
LSTMBlockCell(hp.postnet_lstm_units),
|
||||
postnet_fsmn_outputs,
|
||||
sequence_length=[tf.shape(mel_outputs_)[1]],
|
||||
dtype=tf.float32)
|
||||
|
||||
mel_residual_outputs = tf.layers.dense(
|
||||
hidden_lstm_outputs, units=hp.num_mels)
|
||||
mel_outputs += mel_residual_outputs
|
||||
|
||||
self.inputs = inputs
|
||||
self.inputs_speaker = inputs_speaker
|
||||
self.inputs_emotion = inputs_emotion
|
||||
self.input_lengths = input_lengths
|
||||
self.durations = durations
|
||||
self.output_lengths = output_lengths
|
||||
self.mel_outputs_ = mel_outputs_
|
||||
self.mel_outputs = mel_outputs
|
||||
self.mel_targets = mel_targets
|
||||
self.duration_outputs = duration_outputs
|
||||
self.duration_outputs_ = duration_outputs_
|
||||
self.duration_scales = duration_scales
|
||||
self.pitch_contour_outputs = pitch_contour_outputs
|
||||
self.pitch_contours = pitch_contours
|
||||
self.pitch_scales = pitch_scales
|
||||
self.energy_contour_outputs = energy_contour_outputs
|
||||
self.energy_contours = energy_contours
|
||||
self.energy_scales = energy_scales
|
||||
self.uv_masks_ = uv_masks
|
||||
|
||||
self.embedded_inputs_emotion = embedded_inputs_emotion
|
||||
self.embedding_fsmn_outputs = embedded_inputs
|
||||
self.encoder_outputs = encoder_outputs
|
||||
self.encoder_outputs_ = encoder_outputs_
|
||||
self.LR_outputs = LR_outputs
|
||||
self.postnet_fsmn_outputs = postnet_fsmn_outputs
|
||||
|
||||
self.pitch_embeddings = pitch_embeddings
|
||||
self.energy_embeddings = energy_embeddings
|
||||
|
||||
self.attns = attns
|
||||
self.attention_x = attention_x
|
||||
self.attention_h = attention_h
|
||||
self.X_band_width = X_band_width
|
||||
self.H_band_width = H_band_width
|
||||
|
||||
def add_loss(self):
|
||||
'''Adds loss to the model. Sets "loss" field. initialize must have been called.'''
|
||||
with tf.variable_scope('loss') as _:
|
||||
hp = self._hparams
|
||||
mask = tf.sequence_mask(
|
||||
self.output_lengths,
|
||||
tf.shape(self.mel_targets)[1],
|
||||
dtype=tf.float32)
|
||||
valid_outputs = tf.reduce_sum(mask)
|
||||
|
||||
mask_input = tf.sequence_mask(
|
||||
self.input_lengths,
|
||||
tf.shape(self.durations)[1],
|
||||
dtype=tf.float32)
|
||||
valid_inputs = tf.reduce_sum(mask_input)
|
||||
|
||||
# mel loss
|
||||
if self.uv_masks_ is not None:
|
||||
valid_outputs_mask = tf.reduce_sum(
|
||||
tf.expand_dims(mask, -1) * self.uv_masks_)
|
||||
self.mel_loss_ = tf.reduce_sum(
|
||||
tf.abs(self.mel_targets - self.mel_outputs_)
|
||||
* tf.expand_dims(mask, -1) * self.uv_masks_) / (
|
||||
valid_outputs_mask * hp.num_mels)
|
||||
self.mel_loss = tf.reduce_sum(
|
||||
tf.abs(self.mel_targets - self.mel_outputs)
|
||||
* tf.expand_dims(mask, -1) * self.uv_masks_) / (
|
||||
valid_outputs_mask * hp.num_mels)
|
||||
else:
|
||||
self.mel_loss_ = tf.reduce_sum(
|
||||
tf.abs(self.mel_targets - self.mel_outputs_)
|
||||
* tf.expand_dims(mask, -1)) / (
|
||||
valid_outputs * hp.num_mels)
|
||||
self.mel_loss = tf.reduce_sum(
|
||||
tf.abs(self.mel_targets - self.mel_outputs)
|
||||
* tf.expand_dims(mask, -1)) / (
|
||||
valid_outputs * hp.num_mels)
|
||||
|
||||
# duration loss
|
||||
self.duration_loss = tf.reduce_sum(
|
||||
tf.abs(
|
||||
tf.log(tf.cast(self.durations, tf.float32) + 1)
|
||||
- self.duration_outputs) * mask_input) / valid_inputs
|
||||
|
||||
# pitch contour loss
|
||||
self.pitch_contour_loss = tf.reduce_sum(
|
||||
tf.abs(self.pitch_contours - self.pitch_contour_outputs)
|
||||
* mask_input) / valid_inputs
|
||||
|
||||
# energy contour loss
|
||||
self.energy_contour_loss = tf.reduce_sum(
|
||||
tf.abs(self.energy_contours - self.energy_contour_outputs)
|
||||
* mask_input) / valid_inputs
|
||||
|
||||
# final loss
|
||||
self.loss = self.mel_loss_ + self.mel_loss + self.duration_loss \
|
||||
+ self.pitch_contour_loss + self.energy_contour_loss
|
||||
|
||||
# guided attention loss
|
||||
self.guided_attention_loss = tf.constant(0.0)
|
||||
if hp.guided_attention:
|
||||
i0 = tf.constant(0)
|
||||
loss0 = tf.constant(0.0)
|
||||
|
||||
def c(i, _):
|
||||
return tf.less(i, tf.shape(self.mel_targets)[0])
|
||||
|
||||
def loop_body(i, loss):
|
||||
decoder_input_lengths = tf.cast(
|
||||
self.output_lengths / hp.outputs_per_step, tf.int32)
|
||||
input_len = decoder_input_lengths[i]
|
||||
output_len = decoder_input_lengths[i]
|
||||
input_w = tf.expand_dims(
|
||||
tf.range(tf.cast(input_len, dtype=tf.float32)),
|
||||
axis=1) / tf.cast(
|
||||
input_len, dtype=tf.float32) # [T_in, 1]
|
||||
output_w = tf.expand_dims(
|
||||
tf.range(tf.cast(output_len, dtype=tf.float32)),
|
||||
axis=0) / tf.cast(
|
||||
output_len, dtype=tf.float32) # [1, T_out]
|
||||
guided_attention_w = 1.0 - tf.exp(
|
||||
-(1 / hp.guided_attention_2g_squared)
|
||||
* tf.square(input_w - output_w)) # [T_in, T_out]
|
||||
guided_attention_w = tf.expand_dims(
|
||||
guided_attention_w, axis=0) # [1, T_in, T_out]
|
||||
# [hp.decoder_num_heads, T_in, T_out]
|
||||
guided_attention_w = tf.tile(guided_attention_w,
|
||||
[hp.decoder_num_heads, 1, 1])
|
||||
loss_i = tf.constant(0.0)
|
||||
for j in range(hp.decoder_num_layers):
|
||||
loss_i += tf.reduce_mean(
|
||||
self.attention_h[j][i, :, :input_len, :output_len]
|
||||
* guided_attention_w)
|
||||
|
||||
return [tf.add(i, 1), tf.add(loss, loss_i)]
|
||||
|
||||
_, loss = tf.while_loop(
|
||||
c,
|
||||
loop_body,
|
||||
loop_vars=[i0, loss0],
|
||||
parallel_iterations=hp.batch_size)
|
||||
self.guided_attention_loss = loss / hp.batch_size
|
||||
self.loss += hp.guided_attention_loss_weight * self.guided_attention_loss
|
||||
|
||||
def add_optimizer(self, global_step):
|
||||
'''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called.
|
||||
|
||||
Args:
|
||||
global_step: int32 scalar Tensor representing current global step in training
|
||||
'''
|
||||
with tf.variable_scope('optimizer') as _:
|
||||
hp = self._hparams
|
||||
if hp.decay_learning_rate:
|
||||
self.learning_rate = _learning_rate_decay(
|
||||
hp.initial_learning_rate, global_step)
|
||||
else:
|
||||
self.learning_rate = tf.convert_to_tensor(
|
||||
hp.initial_learning_rate)
|
||||
optimizer = tf.train.AdamOptimizer(self.learning_rate,
|
||||
hp.adam_beta1, hp.adam_beta2)
|
||||
gradients, variables = zip(*optimizer.compute_gradients(self.loss))
|
||||
self.gradients = gradients
|
||||
clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)
|
||||
|
||||
# Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See:
|
||||
# https://github.com/tensorflow/tensorflow/issues/1122
|
||||
with tf.control_dependencies(
|
||||
tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
|
||||
self.optimize = optimizer.apply_gradients(
|
||||
zip(clipped_gradients, variables), global_step=global_step)
|
||||
|
||||
|
||||
def _learning_rate_decay(init_lr, global_step):
|
||||
# Noam scheme from tensor2tensor:
|
||||
warmup_steps = 4000.0
|
||||
step = tf.cast(global_step + 1, dtype=tf.float32)
|
||||
return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5,
|
||||
step**-0.5)
|
||||
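For reference, a plain-Python sketch (illustrative) of the Noam schedule implemented above: the learning rate ramps up linearly for warmup_steps and then decays with the inverse square root of the step.

def noam_lr(init_lr, step, warmup_steps=4000.0):
    step = float(step + 1)
    return init_lr * warmup_steps**0.5 * min(step * warmup_steps**-1.5,
                                             step**-0.5)

# With init_lr = 1e-3: ~2.5e-7 at step 0 (ramp-up), ~1e-3 at step 3999 (peak),
# ~5e-4 at step 15999 (inverse-sqrt decay).
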
@@ -1,817 +0,0 @@
|
||||
"""Define self-attention decoder."""
|
||||
|
||||
import sys
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from . import compat, transformer
|
||||
from .am_models import decoder_prenet
|
||||
from .position import SinusoidalPositionEncoder
|
||||
|
||||
|
||||
class SelfAttentionDecoder():
|
||||
"""Decoder using self-attention as described in
|
||||
https://arxiv.org/abs/1706.03762.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
num_layers,
|
||||
num_units=512,
|
||||
num_heads=8,
|
||||
ffn_inner_dim=2048,
|
||||
dropout=0.1,
|
||||
attention_dropout=0.1,
|
||||
relu_dropout=0.1,
|
||||
prenet_units=256,
|
||||
dense_units=128,
|
||||
num_mels=80,
|
||||
outputs_per_step=3,
|
||||
X_band_width=None,
|
||||
H_band_width=None,
|
||||
position_encoder=SinusoidalPositionEncoder(),
|
||||
self_attention_type='scaled_dot'):
|
||||
"""Initializes the parameters of the decoder.
|
||||
|
||||
Args:
|
||||
num_layers: The number of layers.
|
||||
num_units: The number of hidden units.
|
||||
num_heads: The number of heads in the multi-head attention.
|
||||
ffn_inner_dim: The number of units of the inner linear transformation
|
||||
in the feed forward layer.
|
||||
dropout: The probability to drop units from the outputs.
|
||||
attention_dropout: The probability to drop units from the attention.
|
||||
relu_dropout: The probability to drop units from the ReLU activation in
|
||||
the feed forward layer.
|
||||
position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to
|
||||
apply on inputs or ``None``.
|
||||
self_attention_type: Type of self attention, "scaled_dot" or "average" (case
|
||||
insensitive).
|
||||
|
||||
Raises:
|
||||
ValueError: if :obj:`self_attention_type` is invalid.
|
||||
"""
|
||||
super(SelfAttentionDecoder, self).__init__()
|
||||
self.num_layers = num_layers
|
||||
self.num_units = num_units
|
||||
self.num_heads = num_heads
|
||||
self.ffn_inner_dim = ffn_inner_dim
|
||||
self.dropout = dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.relu_dropout = relu_dropout
|
||||
self.position_encoder = position_encoder
|
||||
self.self_attention_type = self_attention_type.lower()
|
||||
if self.self_attention_type not in ('scaled_dot', 'average'):
|
||||
raise ValueError('invalid attention type %s'
|
||||
% self.self_attention_type)
|
||||
if self.self_attention_type == 'average':
|
||||
tf.logging.warning(
|
||||
'Support for average attention network is experimental '
|
||||
'and may change in future versions.')
|
||||
self.prenet_units = prenet_units
|
||||
self.dense_units = dense_units
|
||||
self.num_mels = num_mels
|
||||
self.outputs_per_step = outputs_per_step
|
||||
self.X_band_width = X_band_width
|
||||
self.H_band_width = H_band_width
|
||||
|
||||
@property
|
||||
def output_size(self):
|
||||
"""Returns the decoder output size."""
|
||||
return self.num_units
|
||||
|
||||
@property
|
||||
def support_alignment_history(self):
|
||||
return True
|
||||
|
||||
@property
|
||||
def support_multi_source(self):
|
||||
return True
|
||||
|
||||
def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1):
|
||||
cache = {}
|
||||
|
||||
for layer in range(self.num_layers):
|
||||
proj_cache_shape = [
|
||||
batch_size, self.num_heads, 0, self.num_units // self.num_heads
|
||||
]
|
||||
layer_cache = {}
|
||||
layer_cache['memory'] = [{
|
||||
'memory_keys':
|
||||
tf.zeros(proj_cache_shape, dtype=dtype),
|
||||
'memory_values':
|
||||
tf.zeros(proj_cache_shape, dtype=dtype)
|
||||
} for _ in range(num_sources)]
|
||||
if self.self_attention_type == 'scaled_dot':
|
||||
layer_cache['self_keys'] = tf.zeros(
|
||||
proj_cache_shape, dtype=dtype)
|
||||
layer_cache['self_values'] = tf.zeros(
|
||||
proj_cache_shape, dtype=dtype)
|
||||
elif self.self_attention_type == 'average':
|
||||
layer_cache['prev_g'] = tf.zeros(
|
||||
[batch_size, 1, self.num_units], dtype=dtype)
|
||||
cache['layer_{}'.format(layer)] = layer_cache
|
||||
|
||||
return cache
|
||||
|
||||
def _init_attn(self, dtype=tf.float32):
|
||||
attn = []
|
||||
for layer in range(self.num_layers):
|
||||
attn.append(tf.TensorArray(tf.float32, size=0, dynamic_size=True))
|
||||
return attn
|
||||
|
||||
def _self_attention_stack(self,
|
||||
inputs,
|
||||
sequence_length=None,
|
||||
mode=True,
|
||||
cache=None,
|
||||
memory=None,
|
||||
memory_sequence_length=None,
|
||||
step=None):
|
||||
|
||||
# [N, T_out, self.dense_units] or [N, 1, self.dense_units]
|
||||
prenet_outputs = decoder_prenet(inputs, self.prenet_units,
|
||||
self.dense_units, mode)
|
||||
if step is None:
|
||||
decoder_inputs = tf.concat(
|
||||
[memory, prenet_outputs],
|
||||
axis=-1) # [N, T_out, memory_size + self.dense_units]
|
||||
else:
|
||||
decoder_inputs = tf.concat(
|
||||
[memory[:, step:step + 1, :], prenet_outputs],
|
||||
axis=-1) # [N, 1, memory_size + self.dense_units]
|
||||
decoder_inputs = tf.layers.dense(
|
||||
decoder_inputs, units=self.dense_units)
|
||||
|
||||
inputs = decoder_inputs
|
||||
inputs *= self.num_units**0.5
|
||||
if self.position_encoder is not None:
|
||||
inputs = self.position_encoder(
|
||||
inputs, position=step + 1 if step is not None else None)
|
||||
|
||||
inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
|
||||
|
||||
decoder_mask = None
|
||||
memory_mask = None
|
||||
# last_attention = None
|
||||
|
||||
X_band_width_tmp = -1
|
||||
H_band_width_tmp = -1
|
||||
if self.X_band_width is not None:
|
||||
X_band_width_tmp = tf.cast(
|
||||
tf.cond(
|
||||
tf.less(tf.shape(memory)[1], self.X_band_width),
|
||||
lambda: -1, lambda: self.X_band_width),
|
||||
dtype=tf.int64)
|
||||
if self.H_band_width is not None:
|
||||
H_band_width_tmp = tf.cast(
|
||||
tf.cond(
|
||||
tf.less(tf.shape(memory)[1], self.H_band_width),
|
||||
lambda: -1, lambda: self.H_band_width),
|
||||
dtype=tf.int64)
|
||||
|
||||
if self.self_attention_type == 'scaled_dot':
|
||||
if sequence_length is not None:
|
||||
decoder_mask = transformer.build_future_mask(
|
||||
sequence_length,
|
||||
num_heads=self.num_heads,
|
||||
maximum_length=tf.shape(inputs)[1],
|
||||
band=X_band_width_tmp) # [N, 1, T_out, T_out]
|
||||
elif self.self_attention_type == 'average':
|
||||
if cache is None:
|
||||
if sequence_length is None:
|
||||
sequence_length = tf.fill([tf.shape(inputs)[0]],
|
||||
tf.shape(inputs)[1])
|
||||
decoder_mask = transformer.cumulative_average_mask(
|
||||
sequence_length,
|
||||
maximum_length=tf.shape(inputs)[1],
|
||||
dtype=inputs.dtype)
|
||||
|
||||
if memory is not None and not tf.contrib.framework.nest.is_sequence(
|
||||
memory):
|
||||
memory = (memory, )
|
||||
if memory_sequence_length is not None:
|
||||
if not tf.contrib.framework.nest.is_sequence(
|
||||
memory_sequence_length):
|
||||
memory_sequence_length = (memory_sequence_length, )
|
||||
if step is None:
|
||||
memory_mask = [
|
||||
transformer.build_history_mask(
|
||||
length,
|
||||
num_heads=self.num_heads,
|
||||
maximum_length=tf.shape(m)[1],
|
||||
band=H_band_width_tmp)
|
||||
for m, length in zip(memory, memory_sequence_length)
|
||||
]
|
||||
else:
|
||||
memory_mask = [
|
||||
transformer.build_history_mask(
|
||||
length,
|
||||
num_heads=self.num_heads,
|
||||
maximum_length=tf.shape(m)[1],
|
||||
band=H_band_width_tmp)[:, :, step:step + 1, :]
|
||||
for m, length in zip(memory, memory_sequence_length)
|
||||
]
|
||||
|
||||
# last_attention = None
|
||||
attns_x = []
|
||||
attns_h = []
|
||||
for layer in range(self.num_layers):
|
||||
layer_name = 'layer_{}'.format(layer)
|
||||
layer_cache = cache[layer_name] if cache is not None else None
|
||||
with tf.variable_scope(layer_name):
|
||||
if memory is not None:
|
||||
for i, (mem, mask) in enumerate(zip(memory, memory_mask)):
|
||||
memory_cache = None
|
||||
if layer_cache is not None:
|
||||
memory_cache = layer_cache['memory'][i]
|
||||
scope_name = 'multi_head_{}'.format(i)
|
||||
if i == 0:
|
||||
scope_name = 'multi_head'
|
||||
with tf.variable_scope(scope_name):
|
||||
encoded, attn_x, attn_h = transformer.multi_head_attention_PNCA(
|
||||
self.num_heads,
|
||||
transformer.norm(inputs),
|
||||
mem,
|
||||
mode,
|
||||
num_units=self.num_units,
|
||||
mask=decoder_mask,
|
||||
mask_h=mask,
|
||||
cache=layer_cache,
|
||||
cache_h=memory_cache,
|
||||
dropout=self.attention_dropout,
|
||||
return_attention=True,
|
||||
layer_name=layer_name,
|
||||
X_band_width=self.X_band_width)
|
||||
attns_x.append(attn_x)
|
||||
attns_h.append(attn_h)
|
||||
context = transformer.drop_and_add(
|
||||
inputs, encoded, mode, dropout=self.dropout)
|
||||
|
||||
with tf.variable_scope('ffn'):
|
||||
transformed = transformer.feed_forward_ori(
|
||||
transformer.norm(context),
|
||||
self.ffn_inner_dim,
|
||||
mode,
|
||||
dropout=self.relu_dropout)
|
||||
transformed = transformer.drop_and_add(
|
||||
context, transformed, mode, dropout=self.dropout)
|
||||
|
||||
inputs = transformed
|
||||
|
||||
outputs = transformer.norm(inputs)
|
||||
outputs = tf.layers.dense(
|
||||
outputs, units=self.num_mels * self.outputs_per_step)
|
||||
return outputs, attns_x, attns_h
|
||||
|
||||
def decode_from_inputs(self,
|
||||
inputs,
|
||||
sequence_length,
|
||||
initial_state=None,
|
||||
mode=True,
|
||||
memory=None,
|
||||
memory_sequence_length=None):
|
||||
outputs, attention_x, attention_h = self._self_attention_stack(
|
||||
inputs,
|
||||
sequence_length=sequence_length,
|
||||
mode=mode,
|
||||
memory=memory,
|
||||
memory_sequence_length=memory_sequence_length)
|
||||
return outputs, attention_x, attention_h
|
||||
|
||||
def step_fn(self,
|
||||
mode,
|
||||
batch_size,
|
||||
initial_state=None,
|
||||
memory=None,
|
||||
memory_sequence_length=None,
|
||||
dtype=tf.float32):
|
||||
if memory is None:
|
||||
num_sources = 0
|
||||
elif tf.contrib.framework.nest.is_sequence(memory):
|
||||
num_sources = len(memory)
|
||||
else:
|
||||
num_sources = 1
|
||||
cache = self._init_cache(
|
||||
batch_size, dtype=dtype, num_sources=num_sources)
|
||||
attention_x = self._init_attn(dtype=dtype)
|
||||
attention_h = self._init_attn(dtype=dtype)
|
||||
|
||||
def _fn(step, inputs, cache):
|
||||
outputs, attention_x, attention_h = self._self_attention_stack(
|
||||
inputs,
|
||||
mode=mode,
|
||||
cache=cache,
|
||||
memory=memory,
|
||||
memory_sequence_length=memory_sequence_length,
|
||||
step=step)
|
||||
attention_x_tmp = []
|
||||
for layer in range(len(attention_h)):
|
||||
attention_x_tmp_l = tf.zeros_like(attention_h[layer])
|
||||
if self.X_band_width is not None:
|
||||
pred = tf.less(step, self.X_band_width + 1)
|
||||
attention_x_tmp_l_1 = tf.cond(pred, # yapf:disable
|
||||
lambda: attention_x_tmp_l[:, :, :, :step + 1] + attention_x[layer],
|
||||
lambda: tf.concat([
|
||||
attention_x_tmp_l[:, :, :,
|
||||
:step - self.X_band_width],
|
||||
attention_x_tmp_l[:, :, :,
|
||||
step - self.X_band_width:step + 1]
|
||||
+ attention_x[layer]],
|
||||
axis=-1)) # yapf:disable
|
||||
attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:]
|
||||
attention_x_tmp.append(
|
||||
tf.concat([attention_x_tmp_l_1, attention_x_tmp_l_2],
|
||||
axis=-1))
|
||||
else:
|
||||
attention_x_tmp_l_1 = attention_x_tmp_l[:, :, :, :step + 1]
|
||||
attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:]
|
||||
attention_x_tmp.append(
|
||||
tf.concat([
|
||||
attention_x_tmp_l_1 + attention_x[layer],
|
||||
attention_x_tmp_l_2
|
||||
], axis=-1)) # yapf:disable
|
||||
attention_x = attention_x_tmp
|
||||
return outputs, cache, attention_x, attention_h
|
||||
|
||||
return _fn, cache, attention_x, attention_h
|
||||
|
||||
def dynamic_decode_and_search(self, init_decoder_input, maximum_iterations,
|
||||
mode, memory, memory_sequence_length):
|
||||
batch_size = tf.shape(init_decoder_input)[0]
|
||||
step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn(
|
||||
mode,
|
||||
batch_size,
|
||||
memory=memory,
|
||||
memory_sequence_length=memory_sequence_length)
|
||||
|
||||
outputs, attention_x, attention_h, cache = self.dynamic_decode(
|
||||
step_fn,
|
||||
init_decoder_input,
|
||||
init_cache=init_cache,
|
||||
init_attn_x=init_attn_x,
|
||||
init_attn_h=init_attn_h,
|
||||
maximum_iterations=maximum_iterations,
|
||||
batch_size=batch_size)
|
||||
return outputs, attention_x, attention_h
|
||||
|
||||
def dynamic_decode_and_search_teacher_forcing(self, decoder_input,
|
||||
maximum_iterations, mode,
|
||||
memory,
|
||||
memory_sequence_length):
|
||||
batch_size = tf.shape(decoder_input)[0]
|
||||
step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn(
|
||||
mode,
|
||||
batch_size,
|
||||
memory=memory,
|
||||
memory_sequence_length=memory_sequence_length)
|
||||
|
||||
outputs, attention_x, attention_h, cache = self.dynamic_decode_teacher_forcing(
|
||||
step_fn,
|
||||
decoder_input,
|
||||
init_cache=init_cache,
|
||||
init_attn_x=init_attn_x,
|
||||
init_attn_h=init_attn_h,
|
||||
maximum_iterations=maximum_iterations,
|
||||
batch_size=batch_size)
|
||||
return outputs, attention_x, attention_h
|
||||
|
||||
def dynamic_decode(self,
|
||||
step_fn,
|
||||
init_decoder_input,
|
||||
init_cache=None,
|
||||
init_attn_x=None,
|
||||
init_attn_h=None,
|
||||
maximum_iterations=None,
|
||||
batch_size=None):
|
||||
|
||||
def _cond(step, cache, inputs, outputs, attention_x, attention_h): # pylint: disable=unused-argument
|
||||
return tf.less(step, maximum_iterations)
|
||||
|
||||
def _body(step, cache, inputs, outputs, attention_x, attention_h):
|
||||
# output: [1, 1, num_mels * r]
|
||||
# attn: [1, 1, T_out]
|
||||
output, cache, attn_x, attn_h = step_fn(
|
||||
step, inputs, cache) # outputs, cache, attention, attns
|
||||
for layer in range(len(attention_x)):
|
||||
attention_x[layer] = attention_x[layer].write(
|
||||
step, tf.cast(attn_x[layer], tf.float32))
|
||||
|
||||
for layer in range(len(attention_h)):
|
||||
attention_h[layer] = attention_h[layer].write(
|
||||
step, tf.cast(attn_h[layer], tf.float32))
|
||||
|
||||
outputs = outputs.write(step, tf.cast(output, tf.float32))
|
||||
return step + 1, cache, output[:, :, -self.
|
||||
num_mels:], outputs, attention_x, attention_h
|
||||
|
||||
step = tf.constant(0, dtype=tf.int32)
|
||||
outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
|
||||
|
||||
_, cache, _, outputs, attention_x, attention_h = tf.while_loop(
|
||||
_cond,
|
||||
_body,
|
||||
loop_vars=(step, init_cache, init_decoder_input, outputs,
|
||||
init_attn_x, init_attn_h),
|
||||
shape_invariants=(step.shape,
|
||||
compat.nest.map_structure(
|
||||
self._get_shape_invariants, init_cache),
|
||||
compat.nest.map_structure(
|
||||
self._get_shape_invariants,
|
||||
init_decoder_input), tf.TensorShape(None),
|
||||
compat.nest.map_structure(
|
||||
self._get_shape_invariants, init_attn_x),
|
||||
compat.nest.map_structure(
|
||||
self._get_shape_invariants, init_attn_h)),
|
||||
parallel_iterations=1,
|
||||
back_prop=False,
|
||||
maximum_iterations=maximum_iterations)
|
||||
# element of outputs: [N, 1, num_mels * r]
|
||||
outputs_stack = outputs.stack() # [T_out, N, 1, num_mels * r]
|
||||
outputs_stack = tf.transpose(
|
||||
outputs_stack, perm=[2, 1, 0, 3]) # [1, N, T_out, num_mels * r]
|
||||
outputs_stack = tf.squeeze(
|
||||
outputs_stack, axis=0) # [N, T_out, num_mels * r]
|
||||
|
||||
attention_x_stack = []
|
||||
for layer in range(len(attention_x)):
|
||||
attention_x_stack_tmp = attention_x[layer].stack(
|
||||
) # [T_out, N, H, 1, T_out]
|
||||
attention_x_stack_tmp = tf.transpose(
|
||||
attention_x_stack_tmp, perm=[3, 1, 2, 0,
|
||||
4]) # [1, N, H, T_out, T_out]
|
||||
attention_x_stack_tmp = tf.squeeze(
|
||||
attention_x_stack_tmp, axis=0) # [N, H, T_out, T_out]
|
||||
attention_x_stack.append(attention_x_stack_tmp)
|
||||
|
||||
attention_h_stack = []
|
||||
for layer in range(len(attention_h)):
|
||||
attention_h_stack_tmp = attention_h[layer].stack(
|
||||
) # [T_out, N, H, 1, T_out]
|
||||
attention_h_stack_tmp = tf.transpose(
|
||||
attention_h_stack_tmp, perm=[3, 1, 2, 0,
|
||||
4]) # [1, N, H, T_out, T_out]
|
||||
attention_h_stack_tmp = tf.squeeze(
|
||||
attention_h_stack_tmp, axis=0) # [N, H, T_out, T_out]
|
||||
attention_h_stack.append(attention_h_stack_tmp)
|
||||
|
||||
return outputs_stack, attention_x_stack, attention_h_stack, cache
|
||||
|
||||
def dynamic_decode_teacher_forcing(self,
|
||||
step_fn,
|
||||
decoder_input,
|
||||
init_cache=None,
|
||||
init_attn_x=None,
|
||||
init_attn_h=None,
|
||||
maximum_iterations=None,
|
||||
batch_size=None):
|
||||
|
||||
def _cond(step, cache, inputs, outputs, attention_x, attention_h): # pylint: disable=unused-argument
|
||||
return tf.less(step, maximum_iterations)
|
||||
|
||||
def _body(step, cache, inputs, outputs, attention_x, attention_h):
|
||||
# output: [1, 1, num_mels * r]
|
||||
# attn: [1, 1, T_out]
|
||||
output, cache, attn_x, attn_h = step_fn(
|
||||
step, inputs[:, step:step + 1, :],
|
||||
cache) # outputs, cache, attention, attns
|
||||
for layer in range(len(attention_x)):
|
||||
attention_x[layer] = attention_x[layer].write(
|
||||
step, tf.cast(attn_x[layer], tf.float32))
|
||||
|
||||
for layer in range(len(attention_h)):
|
||||
attention_h[layer] = attention_h[layer].write(
|
||||
step, tf.cast(attn_h[layer], tf.float32))
|
||||
outputs = outputs.write(step, tf.cast(output, tf.float32))
|
||||
return step + 1, cache, inputs, outputs, attention_x, attention_h
|
||||
|
||||
step = tf.constant(0, dtype=tf.int32)
|
||||
outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
|
||||
|
||||
_, cache, _, outputs, attention_x, attention_h = tf.while_loop(
|
||||
_cond,
|
||||
_body,
|
||||
loop_vars=(step, init_cache, decoder_input, outputs, init_attn_x,
|
||||
init_attn_h),
|
||||
shape_invariants=(step.shape,
|
||||
compat.nest.map_structure(
|
||||
self._get_shape_invariants,
|
||||
init_cache), decoder_input.shape,
|
||||
tf.TensorShape(None),
|
||||
compat.nest.map_structure(
|
||||
self._get_shape_invariants, init_attn_x),
|
||||
compat.nest.map_structure(
|
||||
self._get_shape_invariants, init_attn_h)),
|
||||
parallel_iterations=1,
|
||||
back_prop=False,
|
||||
maximum_iterations=maximum_iterations)
|
||||
# element of outputs: [N, 1, num_mels * r]
|
||||
outputs_stack = outputs.stack() # [T_out, N, 1, num_mels * r]
|
||||
outputs_stack = tf.transpose(
|
||||
outputs_stack, perm=[2, 1, 0, 3]) # [1, N, T_out, num_mels * r]
|
||||
outputs_stack = tf.squeeze(
|
||||
outputs_stack, axis=0) # [N, T_out, num_mels * r]
|
||||
|
||||
attention_x_stack = []
|
||||
for layer in range(len(attention_x)):
|
||||
attention_x_stack_tmp = attention_x[layer].stack(
|
||||
) # [T_out, N, H, 1, T_out]
|
||||
attention_x_stack_tmp = tf.transpose(
|
||||
attention_x_stack_tmp, perm=[3, 1, 2, 0,
|
||||
4]) # [1, N, H, T_out, T_out]
|
||||
attention_x_stack_tmp = tf.squeeze(
|
||||
attention_x_stack_tmp, axis=0) # [N, H, T_out, T_out]
|
||||
attention_x_stack.append(attention_x_stack_tmp)
|
||||
|
||||
attention_h_stack = []
|
||||
for layer in range(len(attention_h)):
|
||||
attention_h_stack_tmp = attention_h[layer].stack(
|
||||
) # [T_out, N, H, 1, T_out]
|
||||
attention_h_stack_tmp = tf.transpose(
|
||||
attention_h_stack_tmp, perm=[3, 1, 2, 0,
|
||||
4]) # [1, N, H, T_out, T_out]
|
||||
attention_h_stack_tmp = tf.squeeze(
|
||||
attention_h_stack_tmp, axis=0) # [N, H, T_out, T_out]
|
||||
attention_h_stack.append(attention_h_stack_tmp)
|
||||
|
||||
return outputs_stack, attention_x_stack, attention_h_stack, cache
|
||||
|
||||
def _get_shape_invariants(self, tensor):
|
||||
"""Returns the shape of the tensor but sets middle dims to None."""
|
||||
if isinstance(tensor, tf.TensorArray):
|
||||
shape = None
|
||||
else:
|
||||
shape = tensor.shape.as_list()
|
||||
for i in range(1, len(shape) - 1):
|
||||
shape[i] = None
|
||||
return tf.TensorShape(shape)
|
||||
|
||||
|
||||
class SelfAttentionDecoderOri():
|
||||
"""Decoder using self-attention as described in
|
||||
https://arxiv.org/abs/1706.03762.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
num_layers,
|
||||
num_units=512,
|
||||
num_heads=8,
|
||||
ffn_inner_dim=2048,
|
||||
dropout=0.1,
|
||||
attention_dropout=0.1,
|
||||
relu_dropout=0.1,
|
||||
position_encoder=SinusoidalPositionEncoder(),
|
||||
self_attention_type='scaled_dot'):
|
||||
"""Initializes the parameters of the decoder.
|
||||
|
||||
Args:
|
||||
num_layers: The number of layers.
|
||||
num_units: The number of hidden units.
|
||||
num_heads: The number of heads in the multi-head attention.
|
||||
ffn_inner_dim: The number of units of the inner linear transformation
|
||||
in the feed forward layer.
|
||||
dropout: The probability to drop units from the outputs.
|
||||
attention_dropout: The probability to drop units from the attention.
|
||||
relu_dropout: The probability to drop units from the ReLU activation in
|
||||
the feed forward layer.
|
||||
position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to
|
||||
apply on inputs or ``None``.
|
||||
self_attention_type: Type of self attention, "scaled_dot" or "average" (case
|
||||
insensitive).
|
||||
|
||||
Raises:
|
||||
ValueError: if :obj:`self_attention_type` is invalid.
|
||||
"""
|
||||
super(SelfAttentionDecoderOri, self).__init__()
|
||||
self.num_layers = num_layers
|
||||
self.num_units = num_units
|
||||
self.num_heads = num_heads
|
||||
self.ffn_inner_dim = ffn_inner_dim
|
||||
self.dropout = dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.relu_dropout = relu_dropout
|
||||
self.position_encoder = position_encoder
|
||||
self.self_attention_type = self_attention_type.lower()
|
||||
if self.self_attention_type not in ('scaled_dot', 'average'):
|
||||
raise ValueError('invalid attention type %s'
|
||||
% self.self_attention_type)
|
||||
if self.self_attention_type == 'average':
|
||||
tf.logging.warning(
|
||||
'Support for average attention network is experimental '
|
||||
'and may change in future versions.')
|
||||
|
||||
@property
|
||||
def output_size(self):
|
||||
"""Returns the decoder output size."""
|
||||
return self.num_units
|
||||
|
||||
@property
|
||||
def support_alignment_history(self):
|
||||
return True
|
||||
|
||||
@property
|
||||
def support_multi_source(self):
|
||||
return True
|
||||
|
||||
def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1):
|
||||
cache = {}
|
||||
|
||||
for layer in range(self.num_layers):
|
||||
proj_cache_shape = [
|
||||
batch_size, self.num_heads, 0, self.num_units // self.num_heads
|
||||
]
|
||||
layer_cache = {}
|
||||
layer_cache['memory'] = [{
|
||||
'memory_keys':
|
||||
tf.zeros(proj_cache_shape, dtype=dtype),
|
||||
'memory_values':
|
||||
tf.zeros(proj_cache_shape, dtype=dtype)
|
||||
} for _ in range(num_sources)]
|
||||
if self.self_attention_type == 'scaled_dot':
|
||||
layer_cache['self_keys'] = tf.zeros(
|
||||
proj_cache_shape, dtype=dtype)
|
||||
layer_cache['self_values'] = tf.zeros(
|
||||
proj_cache_shape, dtype=dtype)
|
||||
elif self.self_attention_type == 'average':
|
||||
layer_cache['prev_g'] = tf.zeros(
|
||||
[batch_size, 1, self.num_units], dtype=dtype)
|
||||
cache['layer_{}'.format(layer)] = layer_cache
|
||||
|
||||
return cache
|
||||
|
||||
def _self_attention_stack(self,
|
||||
inputs,
|
||||
sequence_length=None,
|
||||
mode=True,
|
||||
cache=None,
|
||||
memory=None,
|
||||
memory_sequence_length=None,
|
||||
step=None):
|
||||
inputs *= self.num_units**0.5
|
||||
if self.position_encoder is not None:
|
||||
inputs = self.position_encoder(
|
||||
inputs, position=step + 1 if step is not None else None)
|
||||
|
||||
inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
|
||||
|
||||
decoder_mask = None
|
||||
memory_mask = None
|
||||
last_attention = None
|
||||
|
||||
if self.self_attention_type == 'scaled_dot':
|
||||
if sequence_length is not None:
|
||||
decoder_mask = transformer.build_future_mask(
|
||||
sequence_length,
|
||||
num_heads=self.num_heads,
|
||||
maximum_length=tf.shape(inputs)[1])
|
||||
elif self.self_attention_type == 'average':
|
||||
if cache is None:
|
||||
if sequence_length is None:
|
||||
sequence_length = tf.fill([tf.shape(inputs)[0]],
|
||||
tf.shape(inputs)[1])
|
||||
decoder_mask = transformer.cumulative_average_mask(
|
||||
sequence_length,
|
||||
maximum_length=tf.shape(inputs)[1],
|
||||
dtype=inputs.dtype)
|
||||
|
||||
if memory is not None and not tf.contrib.framework.nest.is_sequence(
|
||||
memory):
|
||||
memory = (memory, )
|
||||
if memory_sequence_length is not None:
|
||||
if not tf.contrib.framework.nest.is_sequence(
|
||||
memory_sequence_length):
|
||||
memory_sequence_length = (memory_sequence_length, )
|
||||
memory_mask = [
|
||||
transformer.build_sequence_mask(
|
||||
length,
|
||||
num_heads=self.num_heads,
|
||||
maximum_length=tf.shape(m)[1])
|
||||
for m, length in zip(memory, memory_sequence_length)
|
||||
]
|
||||
|
||||
for layer in range(self.num_layers):
|
||||
layer_name = 'layer_{}'.format(layer)
|
||||
layer_cache = cache[layer_name] if cache is not None else None
|
||||
with tf.variable_scope(layer_name):
|
||||
if self.self_attention_type == 'scaled_dot':
|
||||
with tf.variable_scope('masked_multi_head'):
|
||||
encoded = transformer.multi_head_attention(
|
||||
self.num_heads,
|
||||
transformer.norm(inputs),
|
||||
None,
|
||||
mode,
|
||||
num_units=self.num_units,
|
||||
mask=decoder_mask,
|
||||
cache=layer_cache,
|
||||
dropout=self.attention_dropout)
|
||||
last_context = transformer.drop_and_add(
|
||||
inputs, encoded, mode, dropout=self.dropout)
|
||||
elif self.self_attention_type == 'average':
|
||||
with tf.variable_scope('average_attention'):
|
||||
# Cumulative average.
|
||||
x = transformer.norm(inputs)
|
||||
y = transformer.cumulative_average(
|
||||
x,
|
||||
decoder_mask if cache is None else step,
|
||||
cache=layer_cache)
|
||||
# FFN.
|
||||
y = transformer.feed_forward(
|
||||
y,
|
||||
self.ffn_inner_dim,
|
||||
mode,
|
||||
dropout=self.relu_dropout)
|
||||
# Gating layer.
|
||||
z = tf.layers.dense(
|
||||
tf.concat([x, y], -1), self.num_units * 2)
|
||||
i, f = tf.split(z, 2, axis=-1)
|
||||
y = tf.sigmoid(i) * x + tf.sigmoid(f) * y
|
||||
last_context = transformer.drop_and_add(
|
||||
inputs, y, mode, dropout=self.dropout)
|
||||
|
||||
if memory is not None:
|
||||
for i, (mem, mask) in enumerate(zip(memory, memory_mask)):
|
||||
memory_cache = layer_cache['memory'][i] if layer_cache is not None else None # yapf:disable
|
||||
with tf.variable_scope('multi_head' if i
|
||||
== 0 else 'multi_head_%d' % i): # yapf:disable
|
||||
context, last_attention = transformer.multi_head_attention(
|
||||
self.num_heads,
|
||||
transformer.norm(last_context),
|
||||
mem,
|
||||
mode,
|
||||
mask=mask,
|
||||
cache=memory_cache,
|
||||
dropout=self.attention_dropout,
|
||||
return_attention=True)
|
||||
last_context = transformer.drop_and_add(
|
||||
last_context,
|
||||
context,
|
||||
mode,
|
||||
dropout=self.dropout)
|
||||
if i > 0: # Do not return attention in case of multi source.
|
||||
last_attention = None
|
||||
|
||||
with tf.variable_scope('ffn'):
|
||||
transformed = transformer.feed_forward_ori(
|
||||
transformer.norm(last_context),
|
||||
self.ffn_inner_dim,
|
||||
mode,
|
||||
dropout=self.relu_dropout)
|
||||
transformed = transformer.drop_and_add(
|
||||
last_context, transformed, mode, dropout=self.dropout)
|
||||
|
||||
inputs = transformed
|
||||
|
||||
if last_attention is not None:
|
||||
# The first head of the last layer is returned.
|
||||
first_head_attention = last_attention[:, 0]
|
||||
else:
|
||||
first_head_attention = None
|
||||
|
||||
outputs = transformer.norm(inputs)
|
||||
return outputs, first_head_attention
|
||||
|
||||
def decode_from_inputs(self,
|
||||
inputs,
|
||||
sequence_length,
|
||||
initial_state=None,
|
||||
mode=True,
|
||||
memory=None,
|
||||
memory_sequence_length=None):
|
||||
outputs, attention = self._self_attention_stack(
|
||||
inputs,
|
||||
sequence_length=sequence_length,
|
||||
mode=mode,
|
||||
memory=memory,
|
||||
memory_sequence_length=memory_sequence_length)
|
||||
return outputs, None, attention
|
||||
|
||||
def step_fn(self,
|
||||
mode,
|
||||
batch_size,
|
||||
initial_state=None,
|
||||
memory=None,
|
||||
memory_sequence_length=None,
|
||||
dtype=tf.float32):
|
||||
if memory is None:
|
||||
num_sources = 0
|
||||
elif tf.contrib.framework.nest.is_sequence(memory):
|
||||
num_sources = len(memory)
|
||||
else:
|
||||
num_sources = 1
|
||||
cache = self._init_cache(
|
||||
batch_size, dtype=dtype, num_sources=num_sources)
|
||||
|
||||
def _fn(step, inputs, cache, mode):
|
||||
inputs = tf.expand_dims(inputs, 1)
|
||||
outputs, attention = self._self_attention_stack(
|
||||
inputs,
|
||||
mode=mode,
|
||||
cache=cache,
|
||||
memory=memory,
|
||||
memory_sequence_length=memory_sequence_length,
|
||||
step=step)
|
||||
outputs = tf.squeeze(outputs, axis=1)
|
||||
if attention is not None:
|
||||
attention = tf.squeeze(attention, axis=1)
|
||||
return outputs, cache, attention
|
||||
|
||||
return _fn, cache
|
||||
@@ -1,182 +0,0 @@
|
||||
"""Define the self-attention encoder."""
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from . import transformer
|
||||
from .position import SinusoidalPositionEncoder
|
||||
|
||||
|
||||
class SelfAttentionEncoder():
|
||||
"""Encoder using self-attention as described in
|
||||
https://arxiv.org/abs/1706.03762.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
num_layers,
|
||||
num_units=512,
|
||||
num_heads=8,
|
||||
ffn_inner_dim=2048,
|
||||
dropout=0.1,
|
||||
attention_dropout=0.1,
|
||||
relu_dropout=0.1,
|
||||
position_encoder=SinusoidalPositionEncoder()):
|
||||
"""Initializes the parameters of the encoder.
|
||||
|
||||
Args:
|
||||
num_layers: The number of layers.
|
||||
num_units: The number of hidden units.
|
||||
num_heads: The number of heads in the multi-head attention.
|
||||
ffn_inner_dim: The number of units of the inner linear transformation
|
||||
in the feed forward layer.
|
||||
dropout: The probability to drop units from the outputs.
|
||||
attention_dropout: The probability to drop units from the attention.
|
||||
relu_dropout: The probability to drop units from the ReLU activation in
|
||||
the feed forward layer.
|
||||
position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
|
||||
apply on inputs or ``None``.
|
||||
"""
|
||||
super(SelfAttentionEncoder, self).__init__()
|
||||
self.num_layers = num_layers
|
||||
self.num_units = num_units
|
||||
self.num_heads = num_heads
|
||||
self.ffn_inner_dim = ffn_inner_dim
|
||||
self.dropout = dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.relu_dropout = relu_dropout
|
||||
self.position_encoder = position_encoder
|
||||
|
||||
def encode(self, inputs, sequence_length=None, mode=True):
|
||||
inputs *= self.num_units**0.5
|
||||
if self.position_encoder is not None:
|
||||
inputs = self.position_encoder(inputs)
|
||||
|
||||
inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
|
||||
mask = transformer.build_sequence_mask(
|
||||
sequence_length,
|
||||
num_heads=self.num_heads,
|
||||
maximum_length=tf.shape(inputs)[1])
|
||||
|
||||
mask_FF = tf.squeeze(
|
||||
transformer.build_sequence_mask(
|
||||
sequence_length, maximum_length=tf.shape(inputs)[1]),
|
||||
axis=1)
|
||||
|
||||
state = ()
|
||||
|
||||
attns = []
|
||||
for layer in range(self.num_layers):
|
||||
with tf.variable_scope('layer_{}'.format(layer)):
|
||||
with tf.variable_scope('multi_head'):
|
||||
context, attn = transformer.multi_head_attention(
|
||||
self.num_heads,
|
||||
transformer.norm(inputs),
|
||||
None,
|
||||
mode,
|
||||
num_units=self.num_units,
|
||||
mask=mask,
|
||||
dropout=self.attention_dropout,
|
||||
return_attention=True)
|
||||
attns.append(attn)
|
||||
context = transformer.drop_and_add(
|
||||
inputs, context, mode, dropout=self.dropout)
|
||||
|
||||
with tf.variable_scope('ffn'):
|
||||
transformed = transformer.feed_forward(
|
||||
transformer.norm(context),
|
||||
self.ffn_inner_dim,
|
||||
mode,
|
||||
dropout=self.relu_dropout,
|
||||
mask=mask_FF)
|
||||
transformed = transformer.drop_and_add(
|
||||
context, transformed, mode, dropout=self.dropout)
|
||||
|
||||
inputs = transformed
|
||||
state += (tf.reduce_mean(inputs, axis=1), )
|
||||
|
||||
outputs = transformer.norm(inputs)
|
||||
return (outputs, state, sequence_length, attns)
|
||||
|
||||
|
||||
class SelfAttentionEncoderOri():
|
||||
"""Encoder using self-attention as described in
|
||||
https://arxiv.org/abs/1706.03762.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
num_layers,
|
||||
num_units=512,
|
||||
num_heads=8,
|
||||
ffn_inner_dim=2048,
|
||||
dropout=0.1,
|
||||
attention_dropout=0.1,
|
||||
relu_dropout=0.1,
|
||||
position_encoder=SinusoidalPositionEncoder()):
|
||||
"""Initializes the parameters of the encoder.
|
||||
|
||||
Args:
|
||||
num_layers: The number of layers.
|
||||
num_units: The number of hidden units.
|
||||
num_heads: The number of heads in the multi-head attention.
|
||||
ffn_inner_dim: The number of units of the inner linear transformation
|
||||
in the feed forward layer.
|
||||
dropout: The probability to drop units from the outputs.
|
||||
attention_dropout: The probability to drop units from the attention.
|
||||
relu_dropout: The probability to drop units from the ReLU activation in
|
||||
the feed forward layer.
|
||||
position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
|
||||
apply on inputs or ``None``.
|
||||
"""
|
||||
super(SelfAttentionEncoderOri, self).__init__()
|
||||
self.num_layers = num_layers
|
||||
self.num_units = num_units
|
||||
self.num_heads = num_heads
|
||||
self.ffn_inner_dim = ffn_inner_dim
|
||||
self.dropout = dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.relu_dropout = relu_dropout
|
||||
self.position_encoder = position_encoder
|
||||
|
||||
def encode(self, inputs, sequence_length=None, mode=True):
|
||||
inputs *= self.num_units**0.5
|
||||
if self.position_encoder is not None:
|
||||
inputs = self.position_encoder(inputs)
|
||||
|
||||
inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
|
||||
mask = transformer.build_sequence_mask(
|
||||
sequence_length,
|
||||
num_heads=self.num_heads,
|
||||
maximum_length=tf.shape(inputs)[1]) # [N, 1, 1, T_out]
|
||||
|
||||
state = ()
|
||||
|
||||
attns = []
|
||||
for layer in range(self.num_layers):
|
||||
with tf.variable_scope('layer_{}'.format(layer)):
|
||||
with tf.variable_scope('multi_head'):
|
||||
context, attn = transformer.multi_head_attention(
|
||||
self.num_heads,
|
||||
transformer.norm(inputs),
|
||||
None,
|
||||
mode,
|
||||
num_units=self.num_units,
|
||||
mask=mask,
|
||||
dropout=self.attention_dropout,
|
||||
return_attention=True)
|
||||
attns.append(attn)
|
||||
context = transformer.drop_and_add(
|
||||
inputs, context, mode, dropout=self.dropout)
|
||||
|
||||
with tf.variable_scope('ffn'):
|
||||
transformed = transformer.feed_forward_ori(
|
||||
transformer.norm(context),
|
||||
self.ffn_inner_dim,
|
||||
mode,
|
||||
dropout=self.relu_dropout)
|
||||
transformed = transformer.drop_and_add(
|
||||
context, transformed, mode, dropout=self.dropout)
|
||||
|
||||
inputs = transformed
|
||||
state += (tf.reduce_mean(inputs, axis=1), )
|
||||
|
||||
outputs = transformer.norm(inputs)
|
||||
return (outputs, state, sequence_length, attns)
|
||||
File diff suppressed because it is too large
@@ -1,59 +0,0 @@
import glob
import os

import matplotlib
import matplotlib.pylab as plt
import torch
from torch.nn.utils import weight_norm

matplotlib.use('Agg')


def plot_spectrogram(spectrogram):
    fig, ax = plt.subplots(figsize=(10, 2))
    im = ax.imshow(
        spectrogram, aspect='auto', origin='lower', interpolation='none')
    plt.colorbar(im, ax=ax)

    fig.canvas.draw()
    plt.close()

    return fig


def init_weights(m, mean=0.0, std=0.01):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        m.weight.data.normal_(mean, std)


def apply_weight_norm(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        weight_norm(m)


def get_padding(kernel_size, dilation=1):
    return int((kernel_size * dilation - dilation) / 2)


def load_checkpoint(filepath, device):
    assert os.path.isfile(filepath)
    print("Loading '{}'".format(filepath))
    checkpoint_dict = torch.load(filepath, map_location=device)
    print('Complete.')
    return checkpoint_dict


def save_checkpoint(filepath, obj):
    print('Saving checkpoint to {}'.format(filepath))
    torch.save(obj, filepath)
    print('Complete.')


def scan_checkpoint(cp_dir, prefix):
    pattern = os.path.join(cp_dir, prefix + '????????')
    cp_list = glob.glob(pattern)
    if len(cp_list) == 0:
        return None
    return sorted(cp_list)[-1]
3
modelscope/models/audio/tts/models/utils/__init__.py
Normal file
@@ -0,0 +1,3 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from .utils import *  # noqa F403
136
modelscope/models/audio/tts/models/utils/utils.py
Executable file
@@ -0,0 +1,136 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import glob
import os
import shutil

import matplotlib
import matplotlib.pylab as plt
import torch

matplotlib.use('Agg')


class AttrDict(dict):

    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


def build_env(config, config_name, path):
    t_path = os.path.join(path, config_name)
    if config != t_path:
        os.makedirs(path, exist_ok=True)
        shutil.copyfile(config, os.path.join(path, config_name))


def plot_spectrogram(spectrogram):
    fig, ax = plt.subplots(figsize=(10, 2))
    im = ax.imshow(
        spectrogram, aspect='auto', origin='lower', interpolation='none')
    plt.colorbar(im, ax=ax)

    fig.canvas.draw()
    plt.close()

    return fig


def plot_alignment(alignment, info=None):
    fig, ax = plt.subplots()
    im = ax.imshow(
        alignment, aspect='auto', origin='lower', interpolation='none')
    fig.colorbar(im, ax=ax)
    xlabel = 'Input timestep'
    if info is not None:
        xlabel += '\t' + info
    plt.xlabel(xlabel)
    plt.ylabel('Output timestep')
    fig.canvas.draw()
    plt.close()

    return fig


def load_checkpoint(filepath, device):
    assert os.path.isfile(filepath)
    checkpoint_dict = torch.load(filepath, map_location=device)
    return checkpoint_dict


def save_checkpoint(filepath, obj):
    torch.save(obj, filepath)


def scan_checkpoint(cp_dir, prefix):
    pattern = os.path.join(cp_dir, prefix + '????????.pkl')
    cp_list = glob.glob(pattern)
    if len(cp_list) == 0:
        return None
    return sorted(cp_list)[-1]


def get_padding(kernel_size, dilation=1):
    return int((kernel_size * dilation - dilation) / 2)
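As a quick sanity check of get_padding above (illustrative arithmetic only, not part of the added file): get_padding(7) returns (7 * 1 - 1) / 2 = 3, the symmetric padding that keeps a stride-1 Conv1d length-preserving, and get_padding(3, dilation=5) returns (15 - 5) / 2 = 5.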
class ValueWindow():

    def __init__(self, window_size=100):
        self._window_size = window_size
        self._values = []

    def append(self, x):
        self._values = self._values[-(self._window_size - 1):] + [x]

    @property
    def sum(self):
        return sum(self._values)

    @property
    def count(self):
        return len(self._values)

    @property
    def average(self):
        return self.sum / max(1, self.count)

    def reset(self):
        self._values = []


def get_model_size(model):
    param_num = sum([p.numel() for p in model.parameters() if p.requires_grad])
    param_size = param_num * 4 / 1024 / 1024
    return param_size


def get_grad_norm(model):
    total_norm = 0
    params = [
        p for p in model.parameters() if p.grad is not None and p.requires_grad
    ]
    for p in params:
        param_norm = p.grad.detach().data.norm(2)
        total_norm += param_norm.item()**2
    total_norm = total_norm**0.5
    return total_norm


def init_weights(m, mean=0.0, std=0.01):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        m.weight.data.normal_(mean, std)


def get_mask_from_lengths(lengths, max_len=None):
    batch_size = lengths.shape[0]
    if max_len is None:
        max_len = torch.max(lengths).item()

    ids = torch.arange(0, max_len).unsqueeze(0).expand(batch_size,
                                                       -1).to(lengths.device)
    mask = ids >= lengths.unsqueeze(1).expand(-1, max_len)

    return mask
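A short usage sketch for get_mask_from_lengths (illustrative only, not part of the added file):

import torch
lengths = torch.tensor([3, 5])
mask = get_mask_from_lengths(lengths)
# mask.shape == (2, 5); mask[0] is [False, False, False, True, True],
# i.e. positions at or beyond each sequence length are flagged as padding.
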
@@ -1,516 +0,0 @@
from distutils.version import LooseVersion

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm

from .utils import get_padding, init_weights

is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion('1.7')


def stft(x, fft_size, hop_size, win_length, window):
    """Perform STFT and convert to magnitude spectrogram.

    Args:
        x (Tensor): Input signal tensor (B, T).
        fft_size (int): FFT size.
        hop_size (int): Hop size.
        win_length (int): Window length.
        window (str): Window function type.

    Returns:
        Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).

    """
    if is_pytorch_17plus:
        x_stft = torch.stft(
            x, fft_size, hop_size, win_length, window, return_complex=False)
    else:
        x_stft = torch.stft(x, fft_size, hop_size, win_length, window)
    real = x_stft[..., 0]
    imag = x_stft[..., 1]

    # NOTE(kan-bayashi): clamp is needed to avoid nan or inf
    return torch.sqrt(torch.clamp(real**2 + imag**2, min=1e-7)).transpose(2, 1)
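For orientation, the stft helper above can be exercised on a dummy batch as follows (illustrative only; the window tensor is built the same way DiscriminatorSTFT registers it further down):

import torch
x = torch.randn(2, 16000)  # two 1-second dummy waveforms at 16 kHz
window = torch.hann_window(600)
mag = stft(x, fft_size=1024, hop_size=120, win_length=600, window=window)
# mag.shape -> (2, num_frames, 513): per-frame magnitude spectra
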
LRELU_SLOPE = 0.1


def get_padding_casual(kernel_size, dilation=1):
    return int(kernel_size * dilation - dilation)


class Conv1dCasual(torch.nn.Module):

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 padding_mode='zeros'):
        super(Conv1dCasual, self).__init__()
        self.pad = padding
        self.conv1d = weight_norm(
            Conv1d(
                in_channels,
                out_channels,
                kernel_size,
                stride,
                padding=0,
                dilation=dilation,
                groups=groups,
                bias=bias,
                padding_mode=padding_mode))
        self.conv1d.apply(init_weights)

    def forward(self, x):  # bdt
        # described starting from the last dimension and moving forward.
        x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), 'constant')
        x = self.conv1d(x)
        return x

    def remove_weight_norm(self):
        remove_weight_norm(self.conv1d)


class ConvTranspose1dCausal(torch.nn.Module):
    """CausalConvTranspose1d module with customized initialization."""

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding=0):
        """Initialize CausalConvTranspose1d module."""
        super(ConvTranspose1dCausal, self).__init__()
        self.deconv = weight_norm(
            ConvTranspose1d(in_channels, out_channels, kernel_size, stride))
        self.stride = stride
        self.deconv.apply(init_weights)
        self.pad = kernel_size - stride

    def forward(self, x):
        """Calculate forward propagation.
        Args:
            x (Tensor): Input tensor (B, in_channels, T_in).
        Returns:
            Tensor: Output tensor (B, out_channels, T_out).
        """
        # x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), "constant")
        return self.deconv(x)[:, :, :-self.pad]

    def remove_weight_norm(self):
        remove_weight_norm(self.deconv)


class ResBlock1(torch.nn.Module):

    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock1, self).__init__()
        self.h = h
        self.convs1 = nn.ModuleList([
            Conv1dCasual(
                channels,
                channels,
                kernel_size,
                1,
                dilation=dilation[i],
                padding=get_padding_casual(kernel_size, dilation[i]))
            for i in range(len(dilation))
        ])

        self.convs2 = nn.ModuleList([
            Conv1dCasual(
                channels,
                channels,
                kernel_size,
                1,
                dilation=1,
                padding=get_padding_casual(kernel_size, 1))
            for i in range(len(dilation))
        ])

    def forward(self, x):
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, LRELU_SLOPE)
            xt = c1(xt)
            xt = F.leaky_relu(xt, LRELU_SLOPE)
            xt = c2(xt)
            x = xt + x
        return x

    def remove_weight_norm(self):
        for layer in self.convs1:
            layer.remove_weight_norm()
        for layer in self.convs2:
            layer.remove_weight_norm()


class Generator(torch.nn.Module):

    def __init__(self, h):
        super(Generator, self).__init__()
        self.h = h
        self.num_kernels = len(h.resblock_kernel_sizes)
        self.num_upsamples = len(h.upsample_rates)
        print('num_kernels={}, num_upsamples={}'.format(
            self.num_kernels, self.num_upsamples))
        self.conv_pre = Conv1dCasual(
            80, h.upsample_initial_channel, 7, 1, padding=7 - 1)
        resblock = ResBlock1 if h.resblock == '1' else ResBlock2

        self.ups = nn.ModuleList()
        self.repeat_ups = nn.ModuleList()
        for i, (u, k) in enumerate(
                zip(h.upsample_rates, h.upsample_kernel_sizes)):
            upsample = nn.Sequential(
                nn.Upsample(mode='nearest', scale_factor=u),
                nn.LeakyReLU(LRELU_SLOPE),
                Conv1dCasual(
                    h.upsample_initial_channel // (2**i),
                    h.upsample_initial_channel // (2**(i + 1)),
                    kernel_size=7,
                    stride=1,
                    padding=7 - 1))
            self.repeat_ups.append(upsample)
            self.ups.append(
                ConvTranspose1dCausal(
                    h.upsample_initial_channel // (2**i),
                    h.upsample_initial_channel // (2**(i + 1)),
                    k,
                    u,
                    padding=(k - u) // 2))

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = h.upsample_initial_channel // (2**(i + 1))
            for j, (k, d) in enumerate(
                    zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
                self.resblocks.append(resblock(h, ch, k, d))

        self.conv_post = Conv1dCasual(ch, 1, 7, 1, padding=7 - 1)

    def forward(self, x):
        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            x = torch.sin(x) + x
            # transconv
            x1 = F.leaky_relu(x, LRELU_SLOPE)
            x1 = self.ups[i](x1)
            # repeat
            x2 = self.repeat_ups[i](x)
            x = x1 + x2
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)
        return x

    def remove_weight_norm(self):
        print('Removing weight norm...')
        for layer in self.ups:
            layer.remove_weight_norm()
        for layer in self.repeat_ups:
            layer[-1].remove_weight_norm()
        for layer in self.resblocks:
            layer.remove_weight_norm()
        self.conv_pre.remove_weight_norm()
        self.conv_post.remove_weight_norm()

class DiscriminatorP(torch.nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
period,
|
||||
kernel_size=5,
|
||||
stride=3,
|
||||
use_spectral_norm=False):
|
||||
super(DiscriminatorP, self).__init__()
|
||||
self.period = period
|
||||
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
|
||||
self.convs = nn.ModuleList([
|
||||
norm_f(
|
||||
Conv2d(
|
||||
1,
|
||||
32, (kernel_size, 1), (stride, 1),
|
||||
padding=(get_padding(5, 1), 0))),
|
||||
norm_f(
|
||||
Conv2d(
|
||||
32,
|
||||
128, (kernel_size, 1), (stride, 1),
|
||||
padding=(get_padding(5, 1), 0))),
|
||||
norm_f(
|
||||
Conv2d(
|
||||
128,
|
||||
512, (kernel_size, 1), (stride, 1),
|
||||
padding=(get_padding(5, 1), 0))),
|
||||
norm_f(
|
||||
Conv2d(
|
||||
512,
|
||||
1024, (kernel_size, 1), (stride, 1),
|
||||
padding=(get_padding(5, 1), 0))),
|
||||
norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
|
||||
])
|
||||
self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
|
||||
|
||||
def forward(self, x):
|
||||
fmap = []
|
||||
|
||||
# 1d to 2d
|
||||
b, c, t = x.shape
|
||||
if t % self.period != 0: # pad first
|
||||
n_pad = self.period - (t % self.period)
|
||||
x = F.pad(x, (0, n_pad), 'reflect')
|
||||
t = t + n_pad
|
||||
x = x.view(b, c, t // self.period, self.period)
|
||||
|
||||
for layer in self.convs:
|
||||
x = layer(x)
|
||||
x = F.leaky_relu(x, LRELU_SLOPE)
|
||||
fmap.append(x)
|
||||
x = self.conv_post(x)
|
||||
fmap.append(x)
|
||||
x = torch.flatten(x, 1, -1)
|
||||
|
||||
return x, fmap
|
||||
|
||||
|
||||
class MultiPeriodDiscriminator(torch.nn.Module):
|
||||
|
||||
def __init__(self):
|
||||
super(MultiPeriodDiscriminator, self).__init__()
|
||||
self.discriminators = nn.ModuleList([
|
||||
DiscriminatorP(2),
|
||||
DiscriminatorP(3),
|
||||
DiscriminatorP(5),
|
||||
DiscriminatorP(7),
|
||||
DiscriminatorP(11),
|
||||
])
|
||||
|
||||
def forward(self, y, y_hat):
|
||||
y_d_rs = []
|
||||
y_d_gs = []
|
||||
fmap_rs = []
|
||||
fmap_gs = []
|
||||
for i, d in enumerate(self.discriminators):
|
||||
y_d_r, fmap_r = d(y)
|
||||
y_d_g, fmap_g = d(y_hat)
|
||||
y_d_rs.append(y_d_r)
|
||||
fmap_rs.append(fmap_r)
|
||||
y_d_gs.append(y_d_g)
|
||||
fmap_gs.append(fmap_g)
|
||||
|
||||
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
||||
|
||||
|
||||
class DiscriminatorS(torch.nn.Module):
|
||||
|
||||
def __init__(self, use_spectral_norm=False):
|
||||
super(DiscriminatorS, self).__init__()
|
||||
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
|
||||
self.convs = nn.ModuleList([
|
||||
norm_f(Conv1d(1, 128, 15, 1, padding=7)),
|
||||
norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
|
||||
norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
|
||||
norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
|
||||
norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
|
||||
norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
|
||||
norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
|
||||
])
|
||||
self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
|
||||
|
||||
def forward(self, x):
|
||||
fmap = []
|
||||
for layer in self.convs:
|
||||
x = layer(x)
|
||||
x = F.leaky_relu(x, LRELU_SLOPE)
|
||||
fmap.append(x)
|
||||
x = self.conv_post(x)
|
||||
fmap.append(x)
|
||||
x = torch.flatten(x, 1, -1)
|
||||
|
||||
return x, fmap
|
||||
|
||||
|
||||
class MultiScaleDiscriminator(torch.nn.Module):
|
||||
|
||||
def __init__(self):
|
||||
super(MultiScaleDiscriminator, self).__init__()
|
||||
self.discriminators = nn.ModuleList([
|
||||
DiscriminatorS(use_spectral_norm=True),
|
||||
DiscriminatorS(),
|
||||
DiscriminatorS(),
|
||||
])
|
||||
from pytorch_wavelets import DWT1DForward
|
||||
self.meanpools = nn.ModuleList(
|
||||
[DWT1DForward(wave='db3', J=1),
|
||||
DWT1DForward(wave='db3', J=1)])
|
||||
self.convs = nn.ModuleList([
|
||||
weight_norm(Conv1d(2, 1, 15, 1, padding=7)),
|
||||
weight_norm(Conv1d(2, 1, 15, 1, padding=7))
|
||||
])
|
||||
|
||||
def forward(self, y, y_hat):
|
||||
y_d_rs = []
|
||||
y_d_gs = []
|
||||
fmap_rs = []
|
||||
fmap_gs = []
|
||||
for i, d in enumerate(self.discriminators):
|
||||
if i != 0:
|
||||
yl, yh = self.meanpools[i - 1](y)
|
||||
y = torch.cat([yl, yh[0]], dim=1)
|
||||
y = self.convs[i - 1](y)
|
||||
y = F.leaky_relu(y, LRELU_SLOPE)
|
||||
|
||||
yl_hat, yh_hat = self.meanpools[i - 1](y_hat)
|
||||
y_hat = torch.cat([yl_hat, yh_hat[0]], dim=1)
|
||||
y_hat = self.convs[i - 1](y_hat)
|
||||
y_hat = F.leaky_relu(y_hat, LRELU_SLOPE)
|
||||
|
||||
y_d_r, fmap_r = d(y)
|
||||
y_d_g, fmap_g = d(y_hat)
|
||||
y_d_rs.append(y_d_r)
|
||||
fmap_rs.append(fmap_r)
|
||||
y_d_gs.append(y_d_g)
|
||||
fmap_gs.append(fmap_g)
|
||||
|
||||
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
||||
|
||||
|
||||
class DiscriminatorSTFT(torch.nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
kernel_size=11,
|
||||
stride=2,
|
||||
use_spectral_norm=False,
|
||||
fft_size=1024,
|
||||
shift_size=120,
|
||||
win_length=600,
|
||||
window='hann_window'):
|
||||
super(DiscriminatorSTFT, self).__init__()
|
||||
self.fft_size = fft_size
|
||||
self.shift_size = shift_size
|
||||
self.win_length = win_length
|
||||
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
|
||||
self.convs = nn.ModuleList([
|
||||
norm_f(
|
||||
Conv2d(
|
||||
fft_size // 2 + 1,
|
||||
32, (15, 1), (1, 1),
|
||||
padding=(get_padding(15, 1), 0))),
|
||||
norm_f(
|
||||
Conv2d(
|
||||
32,
|
||||
32, (kernel_size, 1), (stride, 1),
|
||||
padding=(get_padding(9, 1), 0))),
|
||||
norm_f(
|
||||
Conv2d(
|
||||
32,
|
||||
32, (kernel_size, 1), (stride, 1),
|
||||
padding=(get_padding(9, 1), 0))),
|
||||
norm_f(
|
||||
Conv2d(
|
||||
32,
|
||||
32, (kernel_size, 1), (stride, 1),
|
||||
padding=(get_padding(9, 1), 0))),
|
||||
norm_f(Conv2d(32, 32, (5, 1), (1, 1), padding=(2, 0))),
|
||||
])
|
||||
self.conv_post = norm_f(Conv2d(32, 1, (3, 1), (1, 1), padding=(1, 0)))
|
||||
self.register_buffer('window', getattr(torch, window)(win_length))
|
||||
|
||||
def forward(self, wav):
|
||||
wav = torch.squeeze(wav, 1)
|
||||
x_mag = stft(wav, self.fft_size, self.shift_size, self.win_length,
|
||||
self.window)
|
||||
x = torch.transpose(x_mag, 2, 1).unsqueeze(-1)
|
||||
fmap = []
|
||||
for layer in self.convs:
|
||||
x = layer(x)
|
||||
x = F.leaky_relu(x, LRELU_SLOPE)
|
||||
fmap.append(x)
|
||||
x = self.conv_post(x)
|
||||
fmap.append(x)
|
||||
x = x.squeeze(-1)
|
||||
|
||||
return x, fmap
|
||||
|
||||
|
||||
class MultiSTFTDiscriminator(torch.nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fft_sizes=[1024, 2048, 512],
|
||||
hop_sizes=[120, 240, 50],
|
||||
win_lengths=[600, 1200, 240],
|
||||
window='hann_window',
|
||||
):
|
||||
super(MultiSTFTDiscriminator, self).__init__()
|
||||
self.discriminators = nn.ModuleList()
|
||||
for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths):
|
||||
self.discriminators += [
|
||||
DiscriminatorSTFT(fft_size=fs, shift_size=ss, win_length=wl)
|
||||
]
|
||||
|
||||
def forward(self, y, y_hat):
|
||||
y_d_rs = []
|
||||
y_d_gs = []
|
||||
fmap_rs = []
|
||||
fmap_gs = []
|
||||
for i, d in enumerate(self.discriminators):
|
||||
y_d_r, fmap_r = d(y)
|
||||
y_d_g, fmap_g = d(y_hat)
|
||||
y_d_rs.append(y_d_r)
|
||||
fmap_rs.append(fmap_r)
|
||||
y_d_gs.append(y_d_g)
|
||||
fmap_gs.append(fmap_g)
|
||||
|
||||
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
||||
|
||||
|
||||
def feature_loss(fmap_r, fmap_g):
|
||||
loss = 0
|
||||
for dr, dg in zip(fmap_r, fmap_g):
|
||||
for rl, gl in zip(dr, dg):
|
||||
loss += torch.mean(torch.abs(rl - gl))
|
||||
|
||||
return loss * 2
|
||||
|
||||
|
||||
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
|
||||
loss = 0
|
||||
r_losses = []
|
||||
g_losses = []
|
||||
for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
|
||||
r_loss = torch.mean((1 - dr)**2)
|
||||
g_loss = torch.mean(dg**2)
|
||||
loss += (r_loss + g_loss)
|
||||
r_losses.append(r_loss.item())
|
||||
g_losses.append(g_loss.item())
|
||||
|
||||
return loss, r_losses, g_losses
|
||||
|
||||
|
||||
def generator_loss(disc_outputs):
|
||||
loss = 0
|
||||
gen_losses = []
|
||||
for dg in disc_outputs:
|
||||
temp_loss = torch.mean((1 - dg)**2)
|
||||
gen_losses.append(temp_loss)
|
||||
loss += temp_loss
|
||||
|
||||
return loss, gen_losses
|
||||
@@ -1,3 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import os
@@ -11,13 +13,11 @@ from modelscope.models.base import Model
from modelscope.models.builder import MODELS
from modelscope.utils.audio.tts_exceptions import (
    TtsFrontendInitializeFailedException,
    TtsFrontendLanguageTypeInvalidException, TtsModelConfigurationExcetion,
    TtsFrontendLanguageTypeInvalidException, TtsModelConfigurationException,
    TtsVoiceNotExistsException)
from modelscope.utils.constant import Tasks
from .voice import Voice

import tensorflow as tf  # isort:skip

__all__ = ['SambertHifigan']


@@ -28,14 +28,15 @@ class SambertHifigan(Model):
    def __init__(self, model_dir, *args, **kwargs):
        super().__init__(model_dir, *args, **kwargs)
        if 'am' not in kwargs:
            raise TtsModelConfigurationExcetion(
                'configuration model field missing am!')
            raise TtsModelConfigurationException(
                'modelscope error: configuration model field missing am!')
        if 'vocoder' not in kwargs:
            raise TtsModelConfigurationExcetion(
                'configuration model field missing vocoder!')
            raise TtsModelConfigurationException(
                'modelscope error: configuration model field missing vocoder!')
        if 'lang_type' not in kwargs:
            raise TtsModelConfigurationExcetion(
                'configuration model field missing lang_type!')
            raise TtsModelConfigurationException(
                'modelscope error: configuration model field missing lang_type!'
            )
        am_cfg = kwargs['am']
        voc_cfg = kwargs['vocoder']
        # initialize frontend
@@ -47,10 +48,12 @@ class SambertHifigan(Model):
            zip_ref.extractall(model_dir)
        if not frontend.initialize(self.__res_path):
            raise TtsFrontendInitializeFailedException(
                'resource invalid: {}'.format(self.__res_path))
                'modelscope error: resource invalid: {}'.format(
                    self.__res_path))
        if not frontend.set_lang_type(kwargs['lang_type']):
            raise TtsFrontendLanguageTypeInvalidException(
                'language type invalid: {}'.format(kwargs['lang_type']))
                'modelscope error: language type invalid: {}'.format(
                    kwargs['lang_type']))
        self.__frontend = frontend
        zip_file = os.path.join(model_dir, 'voices.zip')
        self.__voice_path = os.path.join(model_dir, 'voices')
@@ -60,7 +63,8 @@ class SambertHifigan(Model):
        with open(voice_cfg_path, 'r') as f:
            voice_cfg = json.load(f)
        if 'voices' not in voice_cfg:
            raise TtsModelConfigurationExcetion('voices invalid')
            raise TtsModelConfigurationException(
                'modelscope error: voices invalid')
        self.__voice = {}
        for name in voice_cfg['voices']:
            voice_path = os.path.join(self.__voice_path, name)
@@ -70,11 +74,13 @@ class SambertHifigan(Model):
        if voice_cfg['voices']:
            self.__default_voice_name = voice_cfg['voices'][0]
        else:
            raise TtsVoiceNotExistsException('voices is empty in voices.json')
            raise TtsVoiceNotExistsException(
                'modelscope error: voices is empty in voices.json')

    def __synthesis_one_sentences(self, voice_name, text):
        if voice_name not in self.__voice:
            raise TtsVoiceNotExistsException(f'Voice {voice_name} not exists')
            raise TtsVoiceNotExistsException(
                f'modelscope error: Voice {voice_name} not exists')
        return self.__voice[voice_name].forward(text)

    def forward(self, text: str, voice_name: str = None):
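The constructor above only checks that three top-level fields are present before building the frontend and loading the voices. A minimal sketch of the expected kwargs shape; the values below are hypothetical, the real ones come from the model's configuration file.

# Hedged sketch: field names taken from the checks above, values hypothetical.
sambert_kwargs = {
    'am': {},              # acoustic-model (SAMBERT) configuration block
    'vocoder': {},         # HiFi-GAN vocoder configuration block
    'lang_type': 'PinYin'  # hypothetical value; must be accepted by frontend.set_lang_type()
}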
@@ -1,89 +0,0 @@
'''
Cleaners are transformations that run over the input text at both training and eval time.

Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
  1. "english_cleaners" for English text
  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
     the symbols in symbols.py to match your data).
'''

import re

from unidecode import unidecode

from .numbers import normalize_numbers

# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')

# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
                  for x in [
                      ('mrs', 'misess'),
                      ('mr', 'mister'),
                      ('dr', 'doctor'),
                      ('st', 'saint'),
                      ('co', 'company'),
                      ('jr', 'junior'),
                      ('maj', 'major'),
                      ('gen', 'general'),
                      ('drs', 'doctors'),
                      ('rev', 'reverend'),
                      ('lt', 'lieutenant'),
                      ('hon', 'honorable'),
                      ('sgt', 'sergeant'),
                      ('capt', 'captain'),
                      ('esq', 'esquire'),
                      ('ltd', 'limited'),
                      ('col', 'colonel'),
                      ('ft', 'fort'), ]]  # yapf:disable


def expand_abbreviations(text):
    for regex, replacement in _abbreviations:
        text = re.sub(regex, replacement, text)
    return text


def expand_numbers(text):
    return normalize_numbers(text)


def lowercase(text):
    return text.lower()


def collapse_whitespace(text):
    return re.sub(_whitespace_re, ' ', text)


def convert_to_ascii(text):
    return unidecode(text)


def basic_cleaners(text):
    '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def transliteration_cleaners(text):
    '''Pipeline for non-English text that transliterates to ASCII.'''
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def english_cleaners(text):
    '''Pipeline for English text, including number and abbreviation expansion.'''
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = expand_numbers(text)
    text = expand_abbreviations(text)
    text = collapse_whitespace(text)
    return text
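For context on the module removed above, a short usage sketch of the cleaner pipeline; the exact number expansion depends on numbers.normalize_numbers.

# Hedged sketch of the removed cleaner pipeline in use.
print(english_cleaners('Dr. Smith lives on St. Mary Ave.'))
# -> roughly: 'doctor smith lives on saint mary ave.'
print(basic_cleaners('Hello   World'))
# -> 'hello world'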
@@ -1,64 +0,0 @@
import re

valid_symbols = [
    'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
    'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0',
    'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0',
    'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0',
    'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG',
    'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH',
    'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W',
    'Y', 'Z', 'ZH'
]

_valid_symbol_set = set(valid_symbols)


class CMUDict:
    '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''

    def __init__(self, file_or_path, keep_ambiguous=True):
        if isinstance(file_or_path, str):
            with open(file_or_path, encoding='latin-1') as f:
                entries = _parse_cmudict(f)
        else:
            entries = _parse_cmudict(file_or_path)
        if not keep_ambiguous:
            entries = {
                word: pron
                for word, pron in entries.items() if len(pron) == 1
            }
        self._entries = entries

    def __len__(self):
        return len(self._entries)

    def lookup(self, word):
        '''Returns list of ARPAbet pronunciations of the given word.'''
        return self._entries.get(word.upper())


_alt_re = re.compile(r'\([0-9]+\)')


def _parse_cmudict(file):
    cmudict = {}
    for line in file:
        if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
            # CMUDict entries separate word and pronunciation with two spaces.
            parts = line.split('  ')
            word = re.sub(_alt_re, '', parts[0])
            pronunciation = _get_pronunciation(parts[1])
            if pronunciation:
                if word in cmudict:
                    cmudict[word].append(pronunciation)
                else:
                    cmudict[word] = [pronunciation]
    return cmudict


def _get_pronunciation(s):
    parts = s.strip().split(' ')
    for part in parts:
        if part not in _valid_symbol_set:
            return None
    return ' '.join(parts)
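Likewise for the removed CMUDict wrapper, a brief usage sketch; the dictionary path and the printed pronunciation are illustrative only.

# Hedged sketch: 'cmudict-0.7b' is a hypothetical local copy of the CMU dictionary.
cmu = CMUDict('cmudict-0.7b', keep_ambiguous=False)
print(len(cmu))              # number of unambiguous entries
print(cmu.lookup('speech'))  # e.g. ['S P IY1 CH']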
@@ -1,105 +0,0 @@
'''
Defines the set of symbols used in text input to the model.

The default is a set of ASCII characters that works well for English or text that has been run
through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
'''
import codecs
import os

_pad = '_'
_eos = '~'
_mask = '@[MASK]'


def load_symbols(dict_path, has_mask=True):
    _characters = ''
    _ch_symbols = []
    sy_dict_name = 'sy_dict.txt'
    sy_dict_path = os.path.join(dict_path, sy_dict_name)
    f = codecs.open(sy_dict_path, 'r')
    for line in f:
        line = line.strip('\r\n')
        _ch_symbols.append(line)

    _arpabet = ['@' + s for s in _ch_symbols]

    # Export all symbols:
    sy = list(_characters) + _arpabet + [_pad, _eos]
    if has_mask:
        sy.append(_mask)

    _characters = ''

    _ch_tones = []
    tone_dict_name = 'tone_dict.txt'
    tone_dict_path = os.path.join(dict_path, tone_dict_name)
    f = codecs.open(tone_dict_path, 'r')
    for line in f:
        line = line.strip('\r\n')
        _ch_tones.append(line)

    # Export all tones:
    tone = list(_characters) + _ch_tones + [_pad, _eos]
    if has_mask:
        tone.append(_mask)

    _characters = ''

    _ch_syllable_flags = []
    syllable_flag_name = 'syllable_flag_dict.txt'
    syllable_flag_path = os.path.join(dict_path, syllable_flag_name)
    f = codecs.open(syllable_flag_path, 'r')
    for line in f:
        line = line.strip('\r\n')
        _ch_syllable_flags.append(line)

    # Export all syllable_flags:
    syllable_flag = list(_characters) + _ch_syllable_flags + [_pad, _eos]
    if has_mask:
        syllable_flag.append(_mask)

    _characters = ''

    _ch_word_segments = []
    word_segment_name = 'word_segment_dict.txt'
    word_segment_path = os.path.join(dict_path, word_segment_name)
    f = codecs.open(word_segment_path, 'r')
    for line in f:
        line = line.strip('\r\n')
        _ch_word_segments.append(line)

    # Export all word_segments:
    word_segment = list(_characters) + _ch_word_segments + [_pad, _eos]
    if has_mask:
        word_segment.append(_mask)

    _characters = ''

    _ch_emo_types = []
    emo_category_name = 'emo_category_dict.txt'
    emo_category_path = os.path.join(dict_path, emo_category_name)
    f = codecs.open(emo_category_path, 'r')
    for line in f:
        line = line.strip('\r\n')
        _ch_emo_types.append(line)

    # Export all emotion categories:
    emo_category = list(_characters) + _ch_emo_types + [_pad, _eos]
    if has_mask:
        emo_category.append(_mask)

    _characters = ''

    _ch_speakers = []
    speaker_name = 'speaker_dict.txt'
    speaker_path = os.path.join(dict_path, speaker_name)
    f = codecs.open(speaker_path, 'r')
    for line in f:
        line = line.strip('\r\n')
        _ch_speakers.append(line)

    # Export all speakers:
    speaker = list(_characters) + _ch_speakers + [_pad, _eos]
    if has_mask:
        speaker.append(_mask)
    return sy, tone, syllable_flag, word_segment, emo_category, speaker
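The six inventories returned by load_symbols feed the one-hot input dimensions used elsewhere in this commit. A minimal sketch mirroring the set-up in the old voice.py; the dicts directory path is hypothetical.

# Hedged sketch: build the per-stream input dimensions from the symbol inventories.
sy, tone, syllable_flag, word_segment, emo_category, speaker = load_symbols(
    '/path/to/voice/dicts', has_mask=True)
inputs_dim = {
    'sy': len(sy),
    'tone': len(tone),
    'syllable_flag': len(syllable_flag),
    'word_segment': len(word_segment),
    'emo_category': len(emo_category),
    'speaker': len(speaker),
}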
@@ -1,200 +0,0 @@
import re
import sys

from .cleaners import (basic_cleaners, english_cleaners,
                       transliteration_cleaners)


class SymbolsDict:

    def __init__(self, sy, tone, syllable_flag, word_segment, emo_category,
                 speaker, inputs_dim, lfeat_type_list):
        self._inputs_dim = inputs_dim
        self._lfeat_type_list = lfeat_type_list
        self._sy_to_id = {s: i for i, s in enumerate(sy)}
        self._id_to_sy = {i: s for i, s in enumerate(sy)}
        self._tone_to_id = {s: i for i, s in enumerate(tone)}
        self._id_to_tone = {i: s for i, s in enumerate(tone)}
        self._syllable_flag_to_id = {s: i for i, s in enumerate(syllable_flag)}
        self._id_to_syllable_flag = {i: s for i, s in enumerate(syllable_flag)}
        self._word_segment_to_id = {s: i for i, s in enumerate(word_segment)}
        self._id_to_word_segment = {i: s for i, s in enumerate(word_segment)}
        self._emo_category_to_id = {s: i for i, s in enumerate(emo_category)}
        self._id_to_emo_category = {i: s for i, s in enumerate(emo_category)}
        self._speaker_to_id = {s: i for i, s in enumerate(speaker)}
        self._id_to_speaker = {i: s for i, s in enumerate(speaker)}
        print('_sy_to_id: ')
        print(self._sy_to_id)
        print('_tone_to_id: ')
        print(self._tone_to_id)
        print('_syllable_flag_to_id: ')
        print(self._syllable_flag_to_id)
        print('_word_segment_to_id: ')
        print(self._word_segment_to_id)
        print('_emo_category_to_id: ')
        print(self._emo_category_to_id)
        print('_speaker_to_id: ')
        print(self._speaker_to_id)
        self._curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
        self._cleaners = {
            basic_cleaners.__name__: basic_cleaners,
            transliteration_cleaners.__name__: transliteration_cleaners,
            english_cleaners.__name__: english_cleaners
        }

    def _clean_text(self, text, cleaner_names):
        for name in cleaner_names:
            cleaner = self._cleaners.get(name)
            if not cleaner:
                raise Exception('Unknown cleaner: %s' % name)
            text = cleaner(text)
        return text

    def _sy_to_sequence(self, sy):
        return [self._sy_to_id[s] for s in sy if self._should_keep_sy(s)]

    def _arpabet_to_sequence(self, text):
        return self._sy_to_sequence(['@' + s for s in text.split()])

    def _should_keep_sy(self, s):
        return s in self._sy_to_id and s != '_' and s != '~'

    def symbol_to_sequence(self, this_lfeat_symbol, lfeat_type, cleaner_names):
        sequence = []
        if lfeat_type == 'sy':
            this_lfeat_symbol = this_lfeat_symbol.strip().split(' ')
            this_lfeat_symbol_format = ''
            index = 0
            while index < len(this_lfeat_symbol):
                this_lfeat_symbol_format = this_lfeat_symbol_format + '{' + this_lfeat_symbol[
                    index] + '}' + ' '
                index = index + 1
            sequence = self.text_to_sequence(this_lfeat_symbol_format,
                                             cleaner_names)
        elif lfeat_type == 'tone':
            sequence = self.tone_to_sequence(this_lfeat_symbol)
        elif lfeat_type == 'syllable_flag':
            sequence = self.syllable_flag_to_sequence(this_lfeat_symbol)
        elif lfeat_type == 'word_segment':
            sequence = self.word_segment_to_sequence(this_lfeat_symbol)
        elif lfeat_type == 'emo_category':
            sequence = self.emo_category_to_sequence(this_lfeat_symbol)
        elif lfeat_type == 'speaker':
            sequence = self.speaker_to_sequence(this_lfeat_symbol)
        else:
            raise Exception('Unknown lfeat type: %s' % lfeat_type)

        return sequence

    def text_to_sequence(self, text, cleaner_names):
        '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.

        The text can optionally have ARPAbet sequences enclosed in curly braces embedded
        in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."

        Args:
          text: string to convert to a sequence
          cleaner_names: names of the cleaner functions to run the text through

        Returns:
          List of integers corresponding to the symbols in the text
        '''
        sequence = []

        # Check for curly braces and treat their contents as ARPAbet:
        while len(text):
            m = self._curly_re.match(text)
            if not m:
                sequence += self._sy_to_sequence(
                    self._clean_text(text, cleaner_names))
                break
            sequence += self._sy_to_sequence(
                self._clean_text(m.group(1), cleaner_names))
            sequence += self._arpabet_to_sequence(m.group(2))
            text = m.group(3)

        # Append EOS token
        sequence.append(self._sy_to_id['~'])
        return sequence

    def tone_to_sequence(self, tone):
        tones = tone.strip().split(' ')
        sequence = []
        for this_tone in tones:
            sequence.append(self._tone_to_id[this_tone])
        sequence.append(self._tone_to_id['~'])
        return sequence

    def syllable_flag_to_sequence(self, syllable_flag):
        syllable_flags = syllable_flag.strip().split(' ')
        sequence = []
        for this_syllable_flag in syllable_flags:
            sequence.append(self._syllable_flag_to_id[this_syllable_flag])
        sequence.append(self._syllable_flag_to_id['~'])
        return sequence

    def word_segment_to_sequence(self, word_segment):
        word_segments = word_segment.strip().split(' ')
        sequence = []
        for this_word_segment in word_segments:
            sequence.append(self._word_segment_to_id[this_word_segment])
        sequence.append(self._word_segment_to_id['~'])
        return sequence

    def emo_category_to_sequence(self, emo_type):
        emo_categories = emo_type.strip().split(' ')
        sequence = []
        for this_category in emo_categories:
            sequence.append(self._emo_category_to_id[this_category])
        sequence.append(self._emo_category_to_id['~'])
        return sequence

    def speaker_to_sequence(self, speaker):
        speakers = speaker.strip().split(' ')
        sequence = []
        for this_speaker in speakers:
            sequence.append(self._speaker_to_id[this_speaker])
        sequence.append(self._speaker_to_id['~'])
        return sequence

    def sequence_to_symbol(self, sequence):
        result = ''
        pre_lfeat_dim = 0
        for lfeat_type in self._lfeat_type_list:
            current_one_hot_sequence = sequence[:, pre_lfeat_dim:pre_lfeat_dim
                                                + self._inputs_dim[lfeat_type]]
            current_sequence = current_one_hot_sequence.argmax(1)
            length = current_sequence.shape[0]

            index = 0
            while index < length:
                this_sequence = current_sequence[index]
                s = ''
                if lfeat_type == 'sy':
                    s = self._id_to_sy[this_sequence]
                    if len(s) > 1 and s[0] == '@':
                        s = s[1:]
                elif lfeat_type == 'tone':
                    s = self._id_to_tone[this_sequence]
                elif lfeat_type == 'syllable_flag':
                    s = self._id_to_syllable_flag[this_sequence]
                elif lfeat_type == 'word_segment':
                    s = self._id_to_word_segment[this_sequence]
                elif lfeat_type == 'emo_category':
                    s = self._id_to_emo_category[this_sequence]
                elif lfeat_type == 'speaker':
                    s = self._id_to_speaker[this_sequence]
                else:
                    raise Exception('Unknown lfeat type: %s' % lfeat_type)

                if index == 0:
                    result = result + lfeat_type + ': '

                result = result + '{' + s + '}'

                if index == length - 1:
                    result = result + '; '

                index = index + 1
            pre_lfeat_dim = pre_lfeat_dim + self._inputs_dim[lfeat_type]
        return result
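A short sketch of how the removed SymbolsDict was driven, reusing the inventories and inputs_dim from the previous sketch; the ARPAbet string is only an illustration and assumes those symbols exist in sy_dict.txt.

# Hedged sketch of SymbolsDict usage; assumes sy, tone, ..., inputs_dim from above.
lfeat_type_list = ['sy', 'tone', 'syllable_flag', 'word_segment',
                   'emo_category', 'speaker']
sd = SymbolsDict(sy, tone, syllable_flag, word_segment, emo_category, speaker,
                 inputs_dim, lfeat_type_list)
# Curly-brace spans are looked up as '@'-prefixed ARPAbet symbols; an EOS '~' id is appended.
ids = sd.text_to_sequence('{HH AW1 S}', ['basic_cleaners'])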
@@ -1,286 +1,111 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
import pickle as pkl

import json
import numpy as np
import torch
from sklearn.preprocessing import MultiLabelBinarizer

from modelscope.utils.audio.tts_exceptions import \
    TtsModelConfigurationException
from modelscope.utils.constant import ModelFile, Tasks
from .models import Generator, create_am_model
from .text.symbols import load_symbols
from .text.symbols_dict import SymbolsDict

import tensorflow as tf  # isort:skip
from .models.datasets.units import KanTtsLinguisticUnit
from .models.models.hifigan import Generator
from .models.models.sambert import KanTtsSAMBERT
from .models.utils import (AttrDict, build_env, init_weights, load_checkpoint,
                           plot_spectrogram, save_checkpoint, scan_checkpoint)

MAX_WAV_VALUE = 32768.0


def multi_label_symbol_to_sequence(my_classes, my_symbol):
    one_hot = MultiLabelBinarizer(classes=my_classes)
    tokens = my_symbol.strip().split(' ')
    sequences = []
    for token in tokens:
        sequences.append(tuple(token.split('&')))
    return one_hot.fit_transform(sequences)


def load_checkpoint(filepath, device):
    assert os.path.isfile(filepath)
    checkpoint_dict = torch.load(filepath, map_location=device)
    return checkpoint_dict


class AttrDict(dict):

    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


class Voice:

    def __init__(self, voice_name, voice_path, am_hparams, voc_config):
    def __init__(self, voice_name, voice_path, am_config, voc_config):
        self.__voice_name = voice_name
        self.__voice_path = voice_path
        self.__am_hparams = tf.contrib.training.HParams(**am_hparams)
        self.__am_config = AttrDict(**am_config)
        self.__voc_config = AttrDict(**voc_config)
        self.__model_loaded = False
        if 'am' not in self.__am_config:
            raise TtsModelConfigurationException(
                'modelscope error: am configuration invalid')
        if 'linguistic_unit' not in self.__am_config:
            raise TtsModelConfigurationException(
                'modelscope error: am configuration invalid')
        self.__am_lingustic_unit_config = self.__am_config['linguistic_unit']

    def __load_am(self):
        local_am_ckpt_path = os.path.join(self.__voice_path,
                                          ModelFile.TF_CHECKPOINT_FOLDER)
        self.__am_ckpt_path = os.path.join(local_am_ckpt_path, 'ckpt')
        self.__dict_path = os.path.join(self.__voice_path, 'dicts')
        local_am_ckpt_path = os.path.join(self.__voice_path, 'am')
        self.__am_ckpt_path = os.path.join(local_am_ckpt_path,
                                           ModelFile.TORCH_MODEL_BIN_FILE)
        has_mask = True
        if self.__am_hparams.get('has_mask') is not None:
            has_mask = self.__am_hparams.has_mask
        model_name = 'robutrans'
        self.__lfeat_type_list = self.__am_hparams.lfeat_type_list.strip(
        ).split(',')
        sy, tone, syllable_flag, word_segment, emo_category, speaker = load_symbols(
            self.__dict_path, has_mask)
        self.__sy = sy
        self.__tone = tone
        self.__syllable_flag = syllable_flag
        self.__word_segment = word_segment
        self.__emo_category = emo_category
        self.__speaker = speaker
        self.__inputs_dim = dict()
        for lfeat_type in self.__lfeat_type_list:
            if lfeat_type == 'sy':
                self.__inputs_dim[lfeat_type] = len(sy)
            elif lfeat_type == 'tone':
                self.__inputs_dim[lfeat_type] = len(tone)
            elif lfeat_type == 'syllable_flag':
                self.__inputs_dim[lfeat_type] = len(syllable_flag)
            elif lfeat_type == 'word_segment':
                self.__inputs_dim[lfeat_type] = len(word_segment)
            elif lfeat_type == 'emo_category':
                self.__inputs_dim[lfeat_type] = len(emo_category)
            elif lfeat_type == 'speaker':
                self.__inputs_dim[lfeat_type] = len(speaker)

        self.__symbols_dict = SymbolsDict(sy, tone, syllable_flag,
                                          word_segment, emo_category, speaker,
                                          self.__inputs_dim,
                                          self.__lfeat_type_list)
        dim_inputs = sum(self.__inputs_dim.values(
        )) - self.__inputs_dim['speaker'] - self.__inputs_dim['emo_category']
        self.__graph = tf.Graph()
        with self.__graph.as_default():
            inputs = tf.placeholder(tf.float32, [1, None, dim_inputs],
                                    'inputs')
            inputs_emotion = tf.placeholder(
                tf.float32, [1, None, self.__inputs_dim['emo_category']],
                'inputs_emotion')
            inputs_speaker = tf.placeholder(
                tf.float32, [1, None, self.__inputs_dim['speaker']],
                'inputs_speaker')
            input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')
            pitch_contours_scale = tf.placeholder(tf.float32, [1, None],
                                                  'pitch_contours_scale')
            energy_contours_scale = tf.placeholder(tf.float32, [1, None],
                                                   'energy_contours_scale')
            duration_scale = tf.placeholder(tf.float32, [1, None],
                                            'duration_scale')
            with tf.variable_scope('model') as _:
                self.__model = create_am_model(model_name, self.__am_hparams)
                self.__model.initialize(
                    inputs,
                    inputs_emotion,
                    inputs_speaker,
                    input_lengths,
                    duration_scales=duration_scale,
                    pitch_scales=pitch_contours_scale,
                    energy_scales=energy_contours_scale)
                self.__mel_spec = self.__model.mel_outputs[0]
                self.__duration_outputs = self.__model.duration_outputs[0]
                self.__duration_outputs_ = self.__model.duration_outputs_[0]
                self.__pitch_contour_outputs = self.__model.pitch_contour_outputs[
                    0]
                self.__energy_contour_outputs = self.__model.energy_contour_outputs[
                    0]
                self.__embedded_inputs_emotion = self.__model.embedded_inputs_emotion[
                    0]
                self.__embedding_fsmn_outputs = self.__model.embedding_fsmn_outputs[
                    0]
                self.__encoder_outputs = self.__model.encoder_outputs[0]
                self.__pitch_embeddings = self.__model.pitch_embeddings[0]
                self.__energy_embeddings = self.__model.energy_embeddings[0]
                self.__LR_outputs = self.__model.LR_outputs[0]
                self.__postnet_fsmn_outputs = self.__model.postnet_fsmn_outputs[
                    0]
                self.__attention_h = self.__model.attention_h
                self.__attention_x = self.__model.attention_x

            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            self.__session = tf.Session(config=config)
            self.__session.run(tf.global_variables_initializer())

            saver = tf.train.Saver()
            saver.restore(self.__session, self.__am_ckpt_path)
        if 'has_mask' in self.__am_lingustic_unit_config:
            has_mask = self.__am_lingustic_unit_config.has_mask
        self.__ling_unit = KanTtsLinguisticUnit(
            self.__am_lingustic_unit_config, self.__voice_path, has_mask)
        self.__am_net = KanTtsSAMBERT(self.__am_config,
                                      self.__ling_unit.get_unit_size()).to(
                                          self.__device)
        state_dict_g = {}
        try:
            state_dict_g = load_checkpoint(self.__am_ckpt_path, self.__device)
        except RuntimeError:
            with open(self.__am_ckpt_path, 'rb') as f:
                pth_var_dict = pkl.load(f)
                state_dict_g['fsnet'] = {
                    k: torch.FloatTensor(v)
                    for k, v in pth_var_dict['fsnet'].items()
                }
        self.__am_net.load_state_dict(state_dict_g['fsnet'], strict=False)
        self.__am_net.eval()

    def __load_vocoder(self):
        self.__voc_ckpt_path = os.path.join(self.__voice_path,
        local_voc_ckpy_path = os.path.join(self.__voice_path, 'vocoder')
        self.__voc_ckpt_path = os.path.join(local_voc_ckpy_path,
                                            ModelFile.TORCH_MODEL_BIN_FILE)
        if torch.cuda.is_available():
            torch.manual_seed(self.__voc_config.seed)
            self.__device = torch.device('cuda')
        else:
            self.__device = torch.device('cpu')
        self.__generator = Generator(self.__voc_config).to(self.__device)
        state_dict_g = load_checkpoint(self.__voc_ckpt_path, self.__device)
        self.__generator.load_state_dict(state_dict_g['generator'])
        self.__generator.eval()
        self.__generator.remove_weight_norm()

    def __am_forward(self,
                     text,
                     pitch_control_str='',
                     duration_control_str='',
                     energy_control_str=''):
        duration_cfg_lst = []
        if len(duration_control_str) != 0:
            for item in duration_control_str.strip().split('|'):
                percent, scale = item.lstrip('(').rstrip(')').split(',')
                duration_cfg_lst.append((float(percent), float(scale)))
        pitch_contours_cfg_lst = []
        if len(pitch_control_str) != 0:
            for item in pitch_control_str.strip().split('|'):
                percent, scale = item.lstrip('(').rstrip(')').split(',')
                pitch_contours_cfg_lst.append((float(percent), float(scale)))
        energy_contours_cfg_lst = []
        if len(energy_control_str) != 0:
            for item in energy_control_str.strip().split('|'):
                percent, scale = item.lstrip('(').rstrip(')').split(',')
                energy_contours_cfg_lst.append((float(percent), float(scale)))
        cleaner_names = [
            x.strip() for x in self.__am_hparams.cleaners.split(',')
        ]

        lfeat_symbol = text.strip().split(' ')
        lfeat_symbol_separate = [''] * int(len(self.__lfeat_type_list))
        for this_lfeat_symbol in lfeat_symbol:
            this_lfeat_symbol = this_lfeat_symbol.strip('{').strip('}').split(
                '$')
            if len(this_lfeat_symbol) != len(self.__lfeat_type_list):
                raise Exception(
                    'Length of this_lfeat_symbol in training data'
                    + ' is not equal to the length of lfeat_type_list, '
                    + str(len(this_lfeat_symbol)) + ' VS. '
                    + str(len(self.__lfeat_type_list)))
            index = 0
            while index < len(lfeat_symbol_separate):
                lfeat_symbol_separate[index] = lfeat_symbol_separate[
                    index] + this_lfeat_symbol[index] + ' '
                index = index + 1

        index = 0
        lfeat_type = self.__lfeat_type_list[index]
        sequence = self.__symbols_dict.symbol_to_sequence(
            lfeat_symbol_separate[index].strip(), lfeat_type, cleaner_names)
        sequence_array = np.asarray(
            sequence[:-1],
            dtype=np.int32)  # sequence length minus 1 to ignore EOS ~
        inputs = np.eye(
            self.__inputs_dim[lfeat_type], dtype=np.float32)[sequence_array]
        index = index + 1
        while index < len(self.__lfeat_type_list) - 2:
            lfeat_type = self.__lfeat_type_list[index]
            sequence = self.__symbols_dict.symbol_to_sequence(
                lfeat_symbol_separate[index].strip(), lfeat_type,
                cleaner_names)
            sequence_array = np.asarray(
                sequence[:-1],
                dtype=np.int32)  # sequence length minus 1 to ignore EOS ~
            inputs_temp = np.eye(
                self.__inputs_dim[lfeat_type],
                dtype=np.float32)[sequence_array]
            inputs = np.concatenate((inputs, inputs_temp), axis=1)
            index = index + 1
        seq = inputs

        lfeat_type = 'emo_category'
        inputs_emotion = multi_label_symbol_to_sequence(
            self.__emo_category, lfeat_symbol_separate[index].strip())
        # inputs_emotion = inputs_emotion * 1.5
        index = index + 1

        lfeat_type = 'speaker'
        inputs_speaker = multi_label_symbol_to_sequence(
            self.__speaker, lfeat_symbol_separate[index].strip())

        duration_scale = np.ones((len(seq), ), dtype=np.float32)
        start_idx = 0
        for (percent, scale) in duration_cfg_lst:
            duration_scale[start_idx:start_idx
                           + int(percent * len(seq))] = scale
            start_idx += int(percent * len(seq))

        pitch_contours_scale = np.ones((len(seq), ), dtype=np.float32)
        start_idx = 0
        for (percent, scale) in pitch_contours_cfg_lst:
            pitch_contours_scale[start_idx:start_idx
                                 + int(percent * len(seq))] = scale
            start_idx += int(percent * len(seq))

        energy_contours_scale = np.ones((len(seq), ), dtype=np.float32)
        start_idx = 0
        for (percent, scale) in energy_contours_cfg_lst:
            energy_contours_scale[start_idx:start_idx
                                  + int(percent * len(seq))] = scale
            start_idx += int(percent * len(seq))

        feed_dict = {
            self.__model.inputs: [np.asarray(seq, dtype=np.float32)],
            self.__model.inputs_emotion:
            [np.asarray(inputs_emotion, dtype=np.float32)],
            self.__model.inputs_speaker:
            [np.asarray(inputs_speaker, dtype=np.float32)],
            self.__model.input_lengths:
            np.asarray([len(seq)], dtype=np.int32),
            self.__model.duration_scales: [duration_scale],
            self.__model.pitch_scales: [pitch_contours_scale],
            self.__model.energy_scales: [energy_contours_scale]
        }

        result = self.__session.run([
            self.__mel_spec, self.__duration_outputs, self.__duration_outputs_,
            self.__pitch_contour_outputs, self.__embedded_inputs_emotion,
            self.__embedding_fsmn_outputs, self.__encoder_outputs,
            self.__pitch_embeddings, self.__LR_outputs,
            self.__postnet_fsmn_outputs, self.__energy_contour_outputs,
            self.__energy_embeddings, self.__attention_x, self.__attention_h
        ], feed_dict=feed_dict)  # yapf:disable
        return result[0]

    def __am_forward(self, symbol_seq):
        with torch.no_grad():
            inputs_feat_lst = self.__ling_unit.encode_symbol_sequence(
                symbol_seq)
            inputs_sy = torch.from_numpy(inputs_feat_lst[0]).long().to(
                self.__device)
            inputs_tone = torch.from_numpy(inputs_feat_lst[1]).long().to(
                self.__device)
            inputs_syllable = torch.from_numpy(inputs_feat_lst[2]).long().to(
                self.__device)
            inputs_ws = torch.from_numpy(inputs_feat_lst[3]).long().to(
                self.__device)
            inputs_ling = torch.stack(
                [inputs_sy, inputs_tone, inputs_syllable, inputs_ws],
                dim=-1).unsqueeze(0)
            inputs_emo = torch.from_numpy(inputs_feat_lst[4]).long().to(
                self.__device).unsqueeze(0)
            inputs_spk = torch.from_numpy(inputs_feat_lst[5]).long().to(
                self.__device).unsqueeze(0)
            inputs_len = torch.zeros(1).to(self.__device).long(
            ) + inputs_emo.size(1) - 1  # minus 1 for "~"
            res = self.__am_net(inputs_ling[:, :-1, :], inputs_emo[:, :-1],
                                inputs_spk[:, :-1], inputs_len)
            postnet_outputs = res['postnet_outputs']
            LR_length_rounded = res['LR_length_rounded']
            valid_length = int(LR_length_rounded[0].item())
            postnet_outputs = postnet_outputs[
                0, :valid_length, :].cpu().numpy()
            return postnet_outputs

    def __vocoder_forward(self, melspec):
        dim0 = list(melspec.shape)[-1]
        if dim0 != self.__voc_config.num_mels:
            raise TtsVocoderMelspecShapeMismatchException(
                'input melspec mismatch require {} but {}'.format(
                    self.__voc_config.num_mels, dim0))
                'modelscope error: input melspec mismatch require {} but {}'.
                format(self.__voc_config.num_mels, dim0))
        with torch.no_grad():
            x = melspec.T
            x = torch.FloatTensor(x).to(self.__device)
@@ -292,9 +117,15 @@ class Voice:
            audio = audio.cpu().numpy().astype('int16')
        return audio

    def forward(self, text):
    def forward(self, symbol_seq):
        if not self.__model_loaded:
            torch.manual_seed(self.__am_config.seed)
            if torch.cuda.is_available():
                torch.manual_seed(self.__am_config.seed)
                self.__device = torch.device('cuda')
            else:
                self.__device = torch.device('cpu')
            self.__load_am()
            self.__load_vocoder()
            self.__model_loaded = True
        return self.__vocoder_forward(self.__am_forward(text))
        return self.__vocoder_forward(self.__am_forward(symbol_seq))
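End to end, the rewritten PyTorch path is driven as below; the voice directory and the symbol string handed over by the frontend are hypothetical placeholders.

# Hedged sketch: Voice lazily loads the SAMBERT AM and HiFi-GAN vocoder on first use
# and returns int16 PCM samples for the given linguistic symbol sequence.
voice = Voice('zhitian_emo', '/path/to/voices/zhitian_emo', am_config, voc_config)
symbol_seq = '...'  # symbol string produced by the text frontend for one sentence
pcm = voice.forward(symbol_seq)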
@@ -1,3 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from typing import Any, Dict, List

import numpy as np
@@ -42,3 +44,6 @@ class TextToSpeechSambertHifiganPipeline(Pipeline):

    def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]:
        return inputs

    def _sanitize_parameters(self, **pipeline_parameters):
        return {}, pipeline_parameters, {}
@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
"""
Define TTS exceptions
"""
@@ -10,7 +11,7 @@ class TtsException(Exception):
    pass


class TtsModelConfigurationExcetion(TtsException):
class TtsModelConfigurationException(TtsException):
    """
    TTS model configuration exceptions.
    """
@@ -1,6 +1,5 @@
easyasr>=0.0.2
espnet>=202204
#tts
h5py
inflect
keras
@@ -15,11 +14,7 @@ nltk
numpy<=1.18
# protobuf version beyond 3.20.0 is not compatible with TensorFlow 1.x, therefore is discouraged.
protobuf>3,<3.21.0
ptflops
py_sound_connect
pytorch_wavelets
PyWavelets>=1.0.0
scikit-learn
SoundFile>0.10
sox
torchaudio
@@ -9,6 +9,7 @@ import unittest
import torch
from scipy.io.wavfile import write

from modelscope.models import Model
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
@@ -33,7 +34,9 @@ class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase,
        text = '今天北京天气怎么样?'
        voice = 'zhitian_emo'

        sambert_hifigan_tts = pipeline(task=self.task, model=self.model_id)
        model = Model.from_pretrained(
            model_name_or_path=self.model_id, revision='pytorch_am')
        sambert_hifigan_tts = pipeline(task=self.task, model=model)
        self.assertTrue(sambert_hifigan_tts is not None)
        output = sambert_hifigan_tts(input=text, voice=voice)
        self.assertIsNotNone(output[OutputKeys.OUTPUT_PCM])