From 047904ef73d42eccb33328089642ae6ffe20318d Mon Sep 17 00:00:00 2001 From: myf272609 Date: Mon, 26 Sep 2022 11:55:06 +0800 Subject: [PATCH 01/23] [to #42322933] fix init issues for multi-style cartoon models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 修复多风格模型pipeline初始化问题 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10249429 --- modelscope/pipelines/cv/image_cartoon_pipeline.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/modelscope/pipelines/cv/image_cartoon_pipeline.py b/modelscope/pipelines/cv/image_cartoon_pipeline.py index f34be618..72fda989 100644 --- a/modelscope/pipelines/cv/image_cartoon_pipeline.py +++ b/modelscope/pipelines/cv/image_cartoon_pipeline.py @@ -39,10 +39,13 @@ class ImageCartoonPipeline(Pipeline): super().__init__(model=model, **kwargs) with device_placement(self.framework, self.device_name): self.facer = FaceAna(self.model) - self.sess_anime_head = self.load_sess( - os.path.join(self.model, 'cartoon_h.pb'), 'model_anime_head') - self.sess_anime_bg = self.load_sess( - os.path.join(self.model, 'cartoon_bg.pb'), 'model_anime_bg') + with tf.Graph().as_default(): + self.sess_anime_head = self.load_sess( + os.path.join(self.model, 'cartoon_h.pb'), + 'model_anime_head') + self.sess_anime_bg = self.load_sess( + os.path.join(self.model, 'cartoon_bg.pb'), + 'model_anime_bg') self.box_width = 288 global_mask = cv2.imread(os.path.join(self.model, 'alpha.jpg')) From 5e4894870bf56585f294f24bc485d97ab1420e4e Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Mon, 26 Sep 2022 12:23:28 +0800 Subject: [PATCH 02/23] [to #42322933]add t5 model / text2text generation task Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10191736 * add T5 for generation --- modelscope/metainfo.py | 3 + modelscope/models/nlp/T5/__init__.py | 21 + modelscope/models/nlp/T5/configuration_t5.py | 174 ++ modelscope/models/nlp/T5/modeling_t5.py | 2003 +++++++++++++++++ .../models/nlp/T5/t5_for_text_generation.py | 56 + modelscope/models/nlp/__init__.py | 3 +- modelscope/outputs.py | 7 + modelscope/pipelines/nlp/__init__.py | 4 +- .../nlp/text2text_generation_pipeline.py | 87 + modelscope/preprocessors/__init__.py | 3 +- modelscope/preprocessors/nlp/__init__.py | 2 + modelscope/preprocessors/nlp/nlp_base.py | 35 + modelscope/utils/constant.py | 1 + tests/pipelines/test_text2text_generation.py | 61 + 14 files changed, 2457 insertions(+), 3 deletions(-) create mode 100644 modelscope/models/nlp/T5/__init__.py create mode 100644 modelscope/models/nlp/T5/configuration_t5.py create mode 100644 modelscope/models/nlp/T5/modeling_t5.py create mode 100644 modelscope/models/nlp/T5/t5_for_text_generation.py create mode 100644 modelscope/pipelines/nlp/text2text_generation_pipeline.py create mode 100644 tests/pipelines/test_text2text_generation.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 80a522b2..29a35fbe 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -65,6 +65,7 @@ class Models(object): plug = 'plug' bert_for_ds = 'bert-for-document-segmentation' ponet = 'ponet' + T5 = 'T5' # audio models sambert_hifigan = 'sambert-hifigan' @@ -179,6 +180,7 @@ class Pipelines(object): part_of_speech = 'part-of-speech' named_entity_recognition = 'named-entity-recognition' text_generation = 'text-generation' + text2text_generation = 'text2text-generation' sentiment_analysis = 'sentiment-analysis' sentiment_classification = 'sentiment-classification' text_classification = 
'text-classification' @@ -280,6 +282,7 @@ class Preprocessors(object): cross_encoder_tokenizer = 'cross-encoder-tokenizer' bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer' text_gen_tokenizer = 'text-gen-tokenizer' + text2text_gen_preprocessor = 'text2text-gen-preprocessor' token_cls_tokenizer = 'token-cls-tokenizer' ner_tokenizer = 'ner-tokenizer' nli_tokenizer = 'nli-tokenizer' diff --git a/modelscope/models/nlp/T5/__init__.py b/modelscope/models/nlp/T5/__init__.py new file mode 100644 index 00000000..7c1cea36 --- /dev/null +++ b/modelscope/models/nlp/T5/__init__.py @@ -0,0 +1,21 @@ +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .t5_for_text_generation import T5ForConditionalGeneration + +else: + _import_structure = { + 't5_for_text_generation': ['T5ForConditionalGeneration'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/T5/configuration_t5.py b/modelscope/models/nlp/T5/configuration_t5.py new file mode 100644 index 00000000..117a6bc1 --- /dev/null +++ b/modelscope/models/nlp/T5/configuration_t5.py @@ -0,0 +1,174 @@ +# Copyright 2020, The T5 Authors and HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" T5 model configuration""" +from typing import Mapping + +from transformers.configuration_utils import PretrainedConfig +from transformers.onnx import OnnxSeq2SeqConfigWithPast + +from modelscope.utils.logger import get_logger + +logger = get_logger(__name__) + + +class T5Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`T5Model`] or a [`TFT5Model`]. It is used to + instantiate a T5 model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the T5 + [t5-small](https://huggingface.co/t5-small) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Arguments: + vocab_size (`int`, *optional*, defaults to 32128): + Vocabulary size of the T5 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`T5Model`] or [`TFT5Model`]. + d_model (`int`, *optional*, defaults to 512): + Size of the encoder layers and the pooler layer. + d_kv (`int`, *optional*, defaults to 64): + Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model // + num_heads`. + d_ff (`int`, *optional*, defaults to 2048): + Size of the intermediate feed forward layer in each `T5Block`. + num_layers (`int`, *optional*, defaults to 6): + Number of hidden layers in the Transformer encoder. + num_decoder_layers (`int`, *optional*): + Number of hidden layers in the Transformer decoder. 
Will use the same value as `num_layers` if not set. + num_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + relative_attention_num_buckets (`int`, *optional*, defaults to 32): + The number of buckets to use for each attention layer. + relative_attention_max_distance (`int`, *optional*, defaults to 128): + The maximum distance of the longer sequences for the bucket separation. + dropout_rate (`float`, *optional*, defaults to 0.1): + The ratio for all dropout layers. + layer_norm_eps (`float`, *optional*, defaults to 1e-6): + The epsilon used by the layer normalization layers. + initializer_factor (`float`, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + feed_forward_proj (`string`, *optional*, defaults to `"relu"`): + Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`. T5v1.1 uses the + `"gated-gelu"` feed forward projection. Original T5 uses `"relu"`. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + """ + model_type = 't5' + keys_to_ignore_at_inference = ['past_key_values'] + attribute_map = { + 'hidden_size': 'd_model', + 'num_attention_heads': 'num_heads', + 'num_hidden_layers': 'num_layers' + } + + def __init__(self, + vocab_size=32128, + d_model=512, + d_kv=64, + d_ff=2048, + num_layers=6, + num_decoder_layers=None, + num_heads=8, + relative_attention_num_buckets=32, + relative_attention_max_distance=128, + dropout_rate=0.1, + layer_norm_epsilon=1e-6, + initializer_factor=1.0, + feed_forward_proj='relu', + is_encoder_decoder=True, + use_cache=True, + pad_token_id=0, + eos_token_id=1, + **kwargs): + self.vocab_size = vocab_size + self.d_model = d_model + self.d_kv = d_kv + self.d_ff = d_ff + self.num_layers = num_layers + self.num_decoder_layers = (num_decoder_layers if num_decoder_layers + is not None else self.num_layers + ) # default = symmetry + self.num_heads = num_heads + self.relative_attention_num_buckets = relative_attention_num_buckets + self.relative_attention_max_distance = relative_attention_max_distance + self.dropout_rate = dropout_rate + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_factor = initializer_factor + self.feed_forward_proj = feed_forward_proj + self.use_cache = use_cache + + act_info = self.feed_forward_proj.split('-') + self.dense_act_fn = act_info[-1] + self.is_gated_act = act_info[0] == 'gated' + + if len(act_info) > 1 and act_info[0] != 'gated' or len(act_info) > 2: + raise ValueError( + f'`feed_forward_proj`: {feed_forward_proj} is not a valid activation function of the dense layer.' + 'Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. 
' + "'gated-gelu' or 'relu'") + + # for backwards compatibility + if feed_forward_proj == 'gated-gelu': + self.dense_act_fn = 'gelu_new' + + super().__init__( + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + **kwargs, + ) + + +class T5OnnxConfig(OnnxSeq2SeqConfigWithPast): + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + common_inputs = { + 'input_ids': { + 0: 'batch', + 1: 'encoder_sequence' + }, + 'attention_mask': { + 0: 'batch', + 1: 'encoder_sequence' + }, + } + if self.use_past: + common_inputs['attention_mask'][ + 1] = 'past_encoder_sequence + sequence' + common_inputs['decoder_input_ids'] = {0: 'batch'} + common_inputs['decoder_attention_mask'] = { + 0: 'batch', + 1: 'past_decoder_sequence + sequence' + } + else: + common_inputs['decoder_input_ids'] = { + 0: 'batch', + 1: 'decoder_sequence' + } + common_inputs['decoder_attention_mask'] = { + 0: 'batch', + 1: 'decoder_sequence' + } + + if self.use_past: + self.fill_with_past_key_values_(common_inputs, direction='inputs') + + return common_inputs + + @property + def default_onnx_opset(self) -> int: + return 13 diff --git a/modelscope/models/nlp/T5/modeling_t5.py b/modelscope/models/nlp/T5/modeling_t5.py new file mode 100644 index 00000000..da50741e --- /dev/null +++ b/modelscope/models/nlp/T5/modeling_t5.py @@ -0,0 +1,2003 @@ +# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
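Editor's aside on the configuration file above: `T5Config.__init__` splits `feed_forward_proj` into a gating flag and an activation name, with a backwards-compatibility special case for `'gated-gelu'`. A minimal sketch of the resulting attributes follows; the direct submodule import path is an assumption based on the file layout added by this patch, since `T5/__init__.py` only lazily exposes `T5ForConditionalGeneration`.

```python
# Illustrative only -- not part of the patch.
from modelscope.models.nlp.T5.configuration_t5 import T5Config  # assumed import path

cfg = T5Config(feed_forward_proj='gated-gelu')
# act_info == ['gated', 'gelu'] -> gated activation; the backwards-compatibility
# branch then remaps the dense activation to 'gelu_new'.
assert cfg.is_gated_act and cfg.dense_act_fn == 'gelu_new'

cfg = T5Config(feed_forward_proj='relu')
assert not cfg.is_gated_act and cfg.dense_act_fn == 'relu'

# Any other multi-part value, e.g. 'swish-gated', raises the ValueError shown above.
```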
+""" PyTorch T5 model.""" + +import copy +import math +import os +import warnings +from typing import Optional, Tuple, Union + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss +from torch.utils.checkpoint import checkpoint +from transformers.activations import ACT2FN +from transformers.modeling_outputs import ( + BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, Seq2SeqModelOutput) +from transformers.modeling_utils import (PreTrainedModel, + find_pruneable_heads_and_indices, + prune_linear_layer) +from transformers.utils import (DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, + add_start_docstrings_to_model_forward, + is_torch_fx_proxy, replace_return_docstrings) +from transformers.utils.model_parallel_utils import (assert_device_map, + get_device_map) + +from modelscope.utils.logger import get_logger +from .configuration_t5 import T5Config + +logger = get_logger(__name__) + +_CONFIG_FOR_DOC = 'T5Config' +_TOKENIZER_FOR_DOC = 'T5Tokenizer' +_CHECKPOINT_FOR_DOC = 't5-small' + +#################################################### +# This dict contains ids and associated url +# for the pretrained weights provided with the models +#################################################### +T5_PRETRAINED_MODEL_ARCHIVE_LIST = [ + 't5-small', + 't5-base', + 't5-large', + 't5-3b', + 't5-11b', + # See all T5 models at https://huggingface.co/models?filter=t5 +] + + +#################################################### +# This is a conversion method from TF 1.0 to PyTorch +# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28 +#################################################### +def load_tf_weights_in_t5(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + 'Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see ' + 'https://www.tensorflow.org/install/ for installation instructions.' 
+ ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f'Converting TensorFlow checkpoint from {tf_path}') + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + tf_weights = {} + for name, shape in init_vars: + logger.info(f'Loading TF weight {name} with shape {shape}') + array = tf.train.load_variable(tf_path, name) + names.append(name) + tf_weights[name] = array + + for txt_name in names: + name = txt_name.split('/') + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any(n in [ + 'adam_v', 'adam_m', 'AdamWeightDecayOptimizer', + 'AdamWeightDecayOptimizer_1', 'global_step' + ] for n in name): + logger.info(f"Skipping {'/'.join(name)}") + tf_weights.pop(txt_name, None) + continue + if '_slot_' in name[-1]: + logger.info(f"Skipping {'/'.join(name)}") + tf_weights.pop(txt_name, None) + continue + pointer = model + array = tf_weights[txt_name] + + for m_name in name: + if re.fullmatch(r'[A-Za-z]+_\d+', m_name): + scope_names = re.split(r'_(\d+)', m_name) + else: + scope_names = [m_name] + if scope_names[0] in ['kernel', 'scale', 'embedding']: + pointer = getattr(pointer, 'weight') + elif scope_names[0] == 'self_attention': + pointer = getattr(pointer, 'layer') + pointer = pointer[0] + elif scope_names[0] == 'enc_dec_attention': + pointer = getattr(pointer, 'layer') + pointer = pointer[1] + elif scope_names[0] == 'dense_relu_dense': + pointer = getattr(pointer, 'layer') + pointer = pointer[2] + elif scope_names[0] == 'rms_norm': + if hasattr(pointer, 'layer_norm'): + pointer = getattr(pointer, 'layer_norm') + elif hasattr(pointer, 'final_layer_norm'): + pointer = getattr(pointer, 'final_layer_norm') + elif scope_names[0] == 'scale': + pointer = getattr(pointer, 'weight') + elif scope_names[0] == 'output_bias' or scope_names[0] == 'beta': + pointer = getattr(pointer, 'bias') + elif scope_names[0] == 'squad': + pointer = getattr(pointer, 'classifier') + elif scope_names[0] == 'decoder' and name[1] == 'logits': + continue + elif scope_names[0] == 'logits': + pointer = getattr(pointer, 'lm_head') + elif scope_names[0] == 'wi' and len( + scope_names) > 1 and scope_names[1].isdigit(): + pointer = getattr(pointer, f'wi_{scope_names[1]}') + continue + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info(f"Skipping {'/'.join(name)}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if scope_names[0] not in ['kernel', 'scale', 'embedding']: + pointer = getattr(pointer, 'weight') + if scope_names[0] != 'embedding': + logger.info( + f'Transposing numpy weight of shape {array.shape} for {name}') + array = np.transpose(array) + try: + assert ( + pointer.shape == array.shape + ), f'Pointer shape {pointer.shape} and array shape {array.shape} mismatched' + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info(f'Initialize PyTorch weight {name}') + pointer.data = torch.from_numpy(array.astype(np.float32)) + tf_weights.pop(txt_name, None) + + logger.info( + f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}." 
+ ) + return model + + +#################################################### +# PyTorch Models are constructed by sub-classing +# - torch.nn.Module for the layers and +# - PreTrainedModel for the models (it-self a sub-class of nn.Module) +#################################################### +PARALLELIZE_DOCSTRING = r""" + This is an experimental feature and is a subject to change at a moment's notice. + + Uses a device map to distribute attention modules of the model across several devices. If no device map is given, + it will evenly distribute blocks across all devices. + + Args: + device_map (`Dict[int, list]`, optional, defaults to None): + A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always + automatically mapped to the first device (for esoteric reasons). That means that the first device should + have fewer attention modules mapped to it than other devices. For reference, the t5 models have the + following number of attention modules: + + - t5-small: 6 + - t5-base: 12 + - t5-large: 24 + - t5-3b: 24 + - t5-11b: 24 + + Example: + + ```python + # Here is an example of a device map on a machine with 4 GPUs + # using t5-3b, which has a total of 24 attention modules: + model = T5ForConditionalGeneration.from_pretrained("t5-3b") + device_map = { + 0: [0, 1, 2], + 1: [3, 4, 5, 6, 7, 8, 9], + 2: [10, 11, 12, 13, 14, 15, 16], + 3: [17, 18, 19, 20, 21, 22, 23], + } + model.parallelize(device_map) + ``` +""" +DEPARALLELIZE_DOCSTRING = r""" + Moves the model to cpu from a model parallel state. + + Example: + + ```python + # On a 4 GPU machine with t5-3b: + model = T5ForConditionalGeneration.from_pretrained("t5-3b") + device_map = { + 0: [0, 1, 2], + 1: [3, 4, 5, 6, 7, 8, 9], + 2: [10, 11, 12, 13, 14, 15, 16], + 3: [17, 18, 19, 20, 21, 22, 23], + } + model.parallelize(device_map) # Splits the model across several devices + model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache() + ``` +""" + + +class T5LayerNorm(nn.Module): + + def __init__(self, hidden_size, eps=1e-6): + """ + Construct a layernorm module in the T5 style. No bias and no subtraction of mean. + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + + # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean + # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated + # w/o mean and there is no bias. 
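# (Editor's note, illustrative only) Concretely, for a feature vector x = [3.0, 4.0]
# the root mean square is sqrt((9 + 16) / 2) ≈ 3.536, so the lines below produce
# ≈ [0.849, 1.131] before the learned per-channel weight is applied.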
Additionally we want to make sure that the accumulation for + # half-precision inputs is done in fp32 + + variance = hidden_states.to(torch.float32).pow(2).mean( + -1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + + self.variance_epsilon) + + # convert into half-precision if necessary + if self.weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(self.weight.dtype) + + return self.weight * hidden_states + + +try: + from apex.normalization import FusedRMSNorm + + T5LayerNorm = FusedRMSNorm # noqa + + logger.info( + 'Discovered apex.normalization.FusedRMSNorm - will use it instead of T5LayerNorm' + ) +except ImportError: + # using the normal T5LayerNorm + pass +except Exception: + logger.warning( + 'discovered apex but it failed to load, falling back to T5LayerNorm') + pass + + +class T5DenseReluDense(nn.Module): + + def __init__(self, config: T5Config): + super().__init__() + self.wi = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward(self, hidden_states): + hidden_states = self.wi(hidden_states) + hidden_states = nn.functional.relu(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states + + +class T5DenseGatedGeluDense(nn.Module): + + def __init__(self, config: T5Config): + super().__init__() + self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) + self.dropout = nn.Dropout(config.dropout_rate) + self.gelu_act = ACT2FN['gelu_new'] + + def forward(self, hidden_states): + hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) + hidden_linear = self.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states + + +class T5LayerFF(nn.Module): + + def __init__(self, config: T5Config): + super().__init__() + if config.feed_forward_proj == 'relu': + self.DenseReluDense = T5DenseReluDense(config) + elif config.feed_forward_proj == 'gated-gelu': + self.DenseReluDense = T5DenseGatedGeluDense(config) + else: + raise ValueError( + f'{self.config.feed_forward_proj} is not supported. 
Choose between `relu` and `gated-gelu`' + ) + + self.layer_norm = T5LayerNorm( + config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward(self, hidden_states): + forwarded_states = self.layer_norm(hidden_states) + forwarded_states = self.DenseReluDense(forwarded_states) + hidden_states = hidden_states + self.dropout(forwarded_states) + return hidden_states + + +class T5Attention(nn.Module): + + def __init__(self, config: T5Config, has_relative_attention_bias=False): + super().__init__() + self.is_decoder = config.is_decoder + self.has_relative_attention_bias = has_relative_attention_bias + self.relative_attention_num_buckets = config.relative_attention_num_buckets + self.relative_attention_max_distance = config.relative_attention_max_distance + self.d_model = config.d_model + self.key_value_proj_dim = config.d_kv + self.n_heads = config.num_heads + self.dropout = config.dropout_rate + self.inner_dim = self.n_heads * self.key_value_proj_dim + + # Mesh TensorFlow initialization to avoid scaling before softmax + self.q = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.k = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.v = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.o = nn.Linear(self.inner_dim, self.d_model, bias=False) + + if self.has_relative_attention_bias: + self.relative_attention_bias = nn.Embedding( + self.relative_attention_num_buckets, self.n_heads) + self.pruned_heads = set() + self.gradient_checkpointing = False + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads) + # Prune linear layers + self.q = prune_linear_layer(self.q, index) + self.k = prune_linear_layer(self.k, index) + self.v = prune_linear_layer(self.v, index) + self.o = prune_linear_layer(self.o, index, dim=1) + # Update hyper params + self.n_heads = self.n_heads - len(heads) + self.inner_dim = self.key_value_proj_dim * self.n_heads + self.pruned_heads = self.pruned_heads.union(heads) + + @staticmethod + def _relative_position_bucket(relative_position, + bidirectional=True, + num_buckets=32, + max_distance=128): + """ + Adapted from Mesh Tensorflow: + https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 + + Translate relative position to a bucket number for relative attention. The relative position is defined as + memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to + position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for + small absolute relative_position and larger buckets for larger absolute relative_positions. All relative + positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. 
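For example, with the defaults (num_buckets=32, max_distance=128) in the bidirectional case the buckets are split into 16 per direction: distances with absolute value below 8 each get their own exact bucket, while larger distances up to max_distance share logarithmically sized buckets.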
+ This should allow for more graceful generalization to longer sequences than the model has been trained on + + Args: + relative_position: an int32 Tensor + bidirectional: a boolean - whether the attention is bidirectional + num_buckets: an integer + max_distance: an integer + + Returns: + a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) + """ + relative_buckets = 0 + if bidirectional: + num_buckets //= 2 + relative_buckets += (relative_position > 0).to( + torch.long) * num_buckets + relative_position = torch.abs(relative_position) + else: + relative_position = -torch.min(relative_position, + torch.zeros_like(relative_position)) + # now relative_position is in the range [0, inf) + + # half of the buckets are for exact increments in positions + max_exact = num_buckets // 2 + is_small = relative_position < max_exact + + # The other half of the buckets are for logarithmically bigger bins in + # positions up to max_distance + relateive_pos_log = torch.log(relative_position.float() / max_exact) + max_dis_log = math.log(max_distance / max_exact) + origin_relative_position = relateive_pos_log / max_dis_log * ( + num_buckets - max_exact) + relative_postion_if_large = max_exact + origin_relative_position.to( + torch.long) + relative_postion_if_large = torch.min( + relative_postion_if_large, + torch.full_like(relative_postion_if_large, num_buckets - 1)) + + relative_buckets += torch.where(is_small, relative_position, + relative_postion_if_large) + return relative_buckets + + def compute_bias(self, query_length, key_length): + """Compute binned relative position bias""" + context_position = torch.arange( + query_length, + dtype=torch.long, + device=self.relative_attention_bias.weight.device)[:, None] + memory_position = torch.arange( + key_length, + dtype=torch.long, + device=self.relative_attention_bias.weight.device)[None, :] + relative_position = memory_position - context_position # shape (query_length, key_length) + relative_position_bucket = self._relative_position_bucket( + relative_position, # shape (query_length, key_length) + bidirectional=(not self.is_decoder), + num_buckets=self.relative_attention_num_buckets, + max_distance=self.relative_attention_max_distance, + ) + values = self.relative_attention_bias( + relative_position_bucket + ) # shape (query_length, key_length, num_heads) + values = values.permute([2, 0, 1]).unsqueeze( + 0) # shape (1, num_heads, query_length, key_length) + return values + + def forward( + self, + hidden_states, + mask=None, + key_value_states=None, + position_bias=None, + past_key_value=None, + layer_head_mask=None, + query_length=None, + use_cache=False, + output_attentions=False, + ): + """ + Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). + """ + # Input is (batch_size, seq_length, dim) + # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) + # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) + batch_size, seq_length = hidden_states.shape[:2] + + real_seq_length = seq_length + + if past_key_value is not None: + assert ( + len(past_key_value) == 2 + ), f'past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states' + real_seq_length += past_key_value[0].shape[ + 2] if query_length is None else query_length + + key_length = real_seq_length if key_value_states is None else key_value_states.shape[ + 1] + + def shape(states): + """projection""" + return states.view(batch_size, -1, self.n_heads, + self.key_value_proj_dim).transpose(1, 2) + + def unshape(states): + """reshape""" + return states.transpose(1, 2).contiguous().view( + batch_size, -1, self.inner_dim) + + def project(hidden_states, proj_layer, key_value_states, + past_key_value): + """projects hidden states correctly to key/query states""" + if key_value_states is None: + # self-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(hidden_states)) + elif past_key_value is None: + # cross-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(key_value_states)) + + if past_key_value is not None: + if key_value_states is None: + # self-attn + # (batch_size, n_heads, key_length, dim_per_head) + hidden_states = torch.cat([past_key_value, hidden_states], + dim=2) + else: + # cross-attn + hidden_states = past_key_value + return hidden_states + + # get query states + query_states = shape(self.q( + hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) + + # get key/value states + key_states = project( + hidden_states, self.k, key_value_states, + past_key_value[0] if past_key_value is not None else None) + value_states = project( + hidden_states, self.v, key_value_states, + past_key_value[1] if past_key_value is not None else None) + + # compute scores + scores = torch.matmul( + query_states, key_states.transpose(3, 2) + ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + + if position_bias is None: + if not self.has_relative_attention_bias: + position_bias = torch.zeros( + (1, self.n_heads, real_seq_length, key_length), + device=scores.device, + dtype=scores.dtype) + if self.gradient_checkpointing and self.training: + position_bias.requires_grad = True + else: + position_bias = self.compute_bias(real_seq_length, key_length) + + # if key and values are already calculated + # we want only the last query position bias + if past_key_value is not None: + position_bias = position_bias[:, :, -hidden_states.size(1):, :] + + if mask is not None: + position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) + + scores += position_bias + attn_weights = nn.functional.softmax( + scores.float(), dim=-1).type_as( + scores) # (batch_size, n_heads, seq_length, key_length) + attn_weights = nn.functional.dropout( + attn_weights, p=self.dropout, training=self.training + ) # (batch_size, n_heads, seq_length, key_length) + + # Mask heads if we want to + if layer_head_mask is not None: + attn_weights = attn_weights * layer_head_mask + + attn_output = unshape(torch.matmul( + attn_weights, value_states)) # (batch_size, seq_length, dim) + attn_output = self.o(attn_output) + + present_key_value_state = (key_states, + value_states) if (self.is_decoder + and use_cache) else None + outputs = (attn_output, ) + (present_key_value_state, ) + ( + position_bias, ) + + if output_attentions: + outputs = outputs + (attn_weights, ) + return outputs + + +class T5LayerSelfAttention(nn.Module): + + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.SelfAttention = T5Attention( + config, has_relative_attention_bias=has_relative_attention_bias) + 
self.layer_norm = T5LayerNorm( + config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + layer_head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + ): + normed_hidden_states = self.layer_norm(hidden_states) + attention_output = self.SelfAttention( + normed_hidden_states, + mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + self.dropout(attention_output[0]) + outputs = (hidden_states, + ) + attention_output[1:] # add attentions if we output them + return outputs + + +class T5LayerCrossAttention(nn.Module): + + def __init__(self, config): + super().__init__() + self.EncDecAttention = T5Attention( + config, has_relative_attention_bias=False) + self.layer_norm = T5LayerNorm( + config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + hidden_states, + key_value_states, + attention_mask=None, + position_bias=None, + layer_head_mask=None, + past_key_value=None, + use_cache=False, + query_length=None, + output_attentions=False, + ): + normed_hidden_states = self.layer_norm(hidden_states) + attention_output = self.EncDecAttention( + normed_hidden_states, + mask=attention_mask, + key_value_states=key_value_states, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=past_key_value, + use_cache=use_cache, + query_length=query_length, + output_attentions=output_attentions, + ) + layer_output = hidden_states + self.dropout(attention_output[0]) + outputs = (layer_output, + ) + attention_output[1:] # add attentions if we output them + return outputs + + +class T5Block(nn.Module): + + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.is_decoder = config.is_decoder + self.layer = nn.ModuleList() + self.layer.append( + T5LayerSelfAttention( + config, + has_relative_attention_bias=has_relative_attention_bias)) + if self.is_decoder: + self.layer.append(T5LayerCrossAttention(config)) + + self.layer.append(T5LayerFF(config)) + + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + layer_head_mask=None, + cross_attn_layer_head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, + ): + + if past_key_value is not None: + if not self.is_decoder: + logger.warning( + '`past_key_values` is passed to the encoder. Please make sure this is intended.' + ) + expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 + + if len(past_key_value) != expected_num_past_key_values: + raise ValueError( + f'There should be {expected_num_past_key_values} past states. ' + f"{'2 (past / key) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" + f'Got {len(past_key_value)} past key / value states') + + self_attn_past_key_value = past_key_value[:2] + cross_attn_past_key_value = past_key_value[2:] + else: + self_attn_past_key_value, cross_attn_past_key_value = None, None + + self_attention_outputs = self.layer[0]( + hidden_states, + attention_mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=self_attn_past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states, present_key_value_state = self_attention_outputs[:2] + attention_outputs = self_attention_outputs[ + 2:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf( + hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp( + hidden_states, min=-clamp_value, max=clamp_value) + + do_cross_attention = self.is_decoder and encoder_hidden_states is not None + if do_cross_attention: + # the actual query length is unknown for cross attention + # if using past key value states. Need to inject it here + if present_key_value_state is not None: + query_length = present_key_value_state[0].shape[2] + else: + query_length = None + + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + query_length=query_length, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = cross_attention_outputs[0] + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf( + hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp( + hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states + if present_key_value_state is not None: + present_key_value_state = present_key_value_state + cross_attention_outputs[ + 1] + + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[2:] + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16 and torch.isinf( + hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp( + hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states, ) + + if use_cache: + outputs = outputs + (present_key_value_state, ) + attention_outputs + else: + outputs = outputs + attention_outputs + + # hidden-states, present_key_value_states, (self-attention position + # bias), (self-attention weights), (cross-attention position bias), + # (cross-attention weights) + return outputs + + +class T5PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface + for downloading and loading pretrained models. 
+ """ + + config_class = T5Config + load_tf_weights = load_tf_weights_in_t5 + base_model_prefix = 'transformer' + is_parallelizable = True + supports_gradient_checkpointing = True + + @property + def dummy_inputs(self): + input_ids = torch.tensor(DUMMY_INPUTS) + input_mask = torch.tensor(DUMMY_MASK) + dummy_inputs = { + 'decoder_input_ids': input_ids, + 'input_ids': input_ids, + 'decoder_attention_mask': input_mask, + } + return dummy_inputs + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_factor # Used for testing weights initialization + if isinstance(module, T5LayerNorm): + module.weight.data.fill_(factor * 1.0) + elif isinstance(module, + (T5Model, T5ForConditionalGeneration, T5EncoderModel)): + # Mesh TensorFlow embeddings initialization See + # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 + module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0) + elif isinstance(module, T5DenseReluDense): + # Mesh TensorFlow FF initialization See + # https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56 + # and + # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89 + module.wi.weight.data.normal_( + mean=0.0, std=factor * ((self.config.d_model)**-0.5)) + if hasattr(module.wi, 'bias') and module.wi.bias is not None: + module.wi.bias.data.zero_() + module.wo.weight.data.normal_( + mean=0.0, std=factor * ((self.config.d_ff)**-0.5)) + if hasattr(module.wo, 'bias') and module.wo.bias is not None: + module.wo.bias.data.zero_() + elif isinstance(module, T5DenseGatedGeluDense): + module.wi_0.weight.data.normal_( + mean=0.0, std=factor * ((self.config.d_model)**-0.5)) + if hasattr(module.wi_0, 'bias') and module.wi_0.bias is not None: + module.wi_0.bias.data.zero_() + module.wi_1.weight.data.normal_( + mean=0.0, std=factor * ((self.config.d_model)**-0.5)) + if hasattr(module.wi_1, 'bias') and module.wi_1.bias is not None: + module.wi_1.bias.data.zero_() + module.wo.weight.data.normal_( + mean=0.0, std=factor * ((self.config.d_ff)**-0.5)) + if hasattr(module.wo, 'bias') and module.wo.bias is not None: + module.wo.bias.data.zero_() + elif isinstance(module, T5Attention): + # Mesh TensorFlow attention initialization to avoid scaling before + # softmax See + # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136 + d_model = self.config.d_model + key_value_proj_dim = self.config.d_kv + n_heads = self.config.num_heads + module.q.weight.data.normal_( + mean=0.0, std=factor * ((d_model * key_value_proj_dim)**-0.5)) + module.k.weight.data.normal_( + mean=0.0, std=factor * (d_model**-0.5)) + module.v.weight.data.normal_( + mean=0.0, std=factor * (d_model**-0.5)) + module.o.weight.data.normal_( + mean=0.0, std=factor * ((n_heads * key_value_proj_dim)**-0.5)) + if module.has_relative_attention_bias: + module.relative_attention_bias.weight.data.normal_( + mean=0.0, std=factor * ((d_model)**-0.5)) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (T5Attention, T5Stack)): + module.gradient_checkpointing = value + + def _shift_right(self, input_ids): + decoder_start_token_id = self.config.decoder_start_token_id + pad_token_id = self.config.pad_token_id + + assert ( + decoder_start_token_id is not None + ), 'self.model.config.decoder_start_token_id has to be defined.' 
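# (Editor's note, illustrative only) Given labels [[a, b, c]] and decoder start id s,
# the shift below yields [[s, a, b]]; any -100 ignore-index values in the shifted
# ids are then replaced by pad_token_id.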
+ + # shift inputs to the right + if is_torch_fx_proxy(input_ids): + # Item assignment is not supported natively for proxies. + shifted_input_ids = torch.full(input_ids.shape[:-1] + (1, ), + decoder_start_token_id) + shifted_input_ids = torch.cat( + [shifted_input_ids, input_ids[..., :-1]], dim=-1) + else: + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = decoder_start_token_id + + assert pad_token_id is not None, 'self.model.config.pad_token_id has to be defined.' + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + assert torch.all(shifted_input_ids >= 0).item( + ), 'Verify that `shifted_input_ids` has only positive values' + + return shifted_input_ids + + +class T5Stack(T5PreTrainedModel): + + def __init__(self, config, embed_tokens=None): + super().__init__(config) + + self.embed_tokens = embed_tokens + self.is_decoder = config.is_decoder + + self.block = nn.ModuleList([ + T5Block(config, has_relative_attention_bias=bool(i == 0)) + for i in range(config.num_layers) + ]) + self.final_layer_norm = T5LayerNorm( + config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + # Initialize weights and apply final processing + self.post_init() + # Model parallel + self.model_parallel = False + self.device_map = None + self.gradient_checkpointing = False + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + # Check validity of device_map + self.device_map = ( + get_device_map(len(self.block), range(torch.cuda.device_count())) + if device_map is None else device_map) + assert_device_map(self.device_map, len(self.block)) + self.model_parallel = True + self.first_device = 'cpu' if 'cpu' in self.device_map.keys( + ) else 'cuda:' + str(min(self.device_map.keys())) + self.last_device = 'cuda:' + str(max(self.device_map.keys())) + # Load onto devices + for k, v in self.device_map.items(): + for layer in v: + cuda_device = 'cuda:' + str(k) + self.block[layer] = self.block[layer].to(cuda_device) + + # Set embed_tokens to first layer + self.embed_tokens = self.embed_tokens.to(self.first_device) + # Set final layer norm to last device + self.final_layer_norm = self.final_layer_norm.to(self.last_device) + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def deparallelize(self): + self.model_parallel = False + self.device_map = None + self.first_device = 'cpu' + self.last_device = 'cpu' + for i in range(len(self.block)): + self.block[i] = self.block[i].to('cpu') + self.embed_tokens = self.embed_tokens.to('cpu') + self.final_layer_norm = self.final_layer_norm.to('cpu') + torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, new_embeddings): + self.embed_tokens = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + inputs_embeds=None, + head_mask=None, + cross_attn_head_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + # Model parallel + if self.model_parallel: + torch.cuda.set_device(self.first_device) + self.embed_tokens = self.embed_tokens.to(self.first_device) + use_cache = use_cache if use_cache is not None else self.config.use_cache + output_attentions = output_attentions if output_attentions is not None 
else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + err_msg_prefix = 'decoder_' if self.is_decoder else '' + raise ValueError( + f'You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time' + ) + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + err_msg_prefix = 'decoder_' if self.is_decoder else '' + raise ValueError( + f'You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds' + ) + + if inputs_embeds is None: + assert self.embed_tokens is not None, 'You have to initialize the model with valid token embeddings' + inputs_embeds = self.embed_tokens(input_ids) + + batch_size, seq_length = input_shape + + # required mask seq length can be calculated via length of past + mask_seq_length = past_key_values[0][0].shape[ + 2] + seq_length if past_key_values is not None else seq_length + + if use_cache is True: + assert self.is_decoder, f'`use_cache` can only be set to `True` if {self} is used as a decoder' + + if attention_mask is None: + attention_mask = torch.ones(batch_size, mask_seq_length).to( + inputs_embeds.device) + if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None: + encoder_seq_length = encoder_hidden_states.shape[1] + encoder_attention_mask = torch.ones( + batch_size, + encoder_seq_length, + device=inputs_embeds.device, + dtype=torch.long) + + # initialize past_key_values with `None` if past does not exist + if past_key_values is None: + past_key_values = [None] * len(self.block) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
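# (Editor's note) get_extended_attention_mask is inherited from the transformers
# base model: it expands a [batch_size, seq_length] 0/1 mask into a broadcastable
# [batch_size, 1, 1, seq_length] float mask with 0.0 for visible positions and a
# large negative value for masked ones (and for decoder stacks it also folds in the
# causal mask), so it can simply be added to the attention scores.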
+ extended_attention_mask = self.get_extended_attention_mask( + attention_mask, input_shape, inputs_embeds.device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( + ) + encoder_hidden_shape = (encoder_batch_size, + encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=inputs_embeds.device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, + self.config.num_layers) + present_key_value_states = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions + and self.is_decoder) else None + position_bias = None + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) + + for i, (layer_module, + past_key_value) in enumerate(zip(self.block, past_key_values)): + layer_head_mask = head_mask[i] + cross_attn_layer_head_mask = cross_attn_head_mask[i] + # Model parallel + if self.model_parallel: + torch.cuda.set_device(hidden_states.device) + # Ensure that attention_mask is always on the same device as hidden_states + if attention_mask is not None: + attention_mask = attention_mask.to(hidden_states.device) + if position_bias is not None: + position_bias = position_bias.to(hidden_states.device) + if encoder_hidden_states is not None: + encoder_hidden_states = encoder_hidden_states.to( + hidden_states.device) + if encoder_extended_attention_mask is not None: + encoder_extended_attention_mask = encoder_extended_attention_mask.to( + hidden_states.device) + if encoder_decoder_position_bias is not None: + encoder_decoder_position_bias = encoder_decoder_position_bias.to( + hidden_states.device) + if layer_head_mask is not None: + layer_head_mask = layer_head_mask.to(hidden_states.device) + if cross_attn_layer_head_mask is not None: + cross_attn_layer_head_mask = cross_attn_layer_head_mask.to( + hidden_states.device) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning( + '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' 
+ ) + use_cache = False + + def create_custom_forward(module): + + def custom_forward(*inputs): + return tuple( + module(*inputs, use_cache, output_attentions)) + + return custom_forward + + layer_outputs = checkpoint( + create_custom_forward(layer_module), + hidden_states, + extended_attention_mask, + position_bias, + encoder_hidden_states, + encoder_extended_attention_mask, + encoder_decoder_position_bias, + layer_head_mask, + cross_attn_layer_head_mask, + None, # past_key_value is always None with gradient checkpointing + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask=extended_attention_mask, + position_bias=position_bias, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + encoder_decoder_position_bias=encoder_decoder_position_bias, + layer_head_mask=layer_head_mask, + cross_attn_layer_head_mask=cross_attn_layer_head_mask, + past_key_value=past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + + # layer_outputs is a tuple with: hidden-states, key-value-states, + # (self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) + if use_cache is False: + layer_outputs = layer_outputs[:1] + ( + None, ) + layer_outputs[1:] + + hidden_states, present_key_value_state = layer_outputs[:2] + + # We share the position biases between the layers - the first layer + # store them layer_outputs = hidden-states, key-value-states + # (self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) + position_bias = layer_outputs[2] + if self.is_decoder and encoder_hidden_states is not None: + encoder_decoder_position_bias = layer_outputs[ + 4 if output_attentions else 3] + # append next layer key value states + if use_cache: + present_key_value_states = present_key_value_states + ( + present_key_value_state, ) + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[3], ) + if self.is_decoder: + all_cross_attentions = all_cross_attentions + ( + layer_outputs[5], ) + + # Model Parallel: If it's the last layer for that device, put things on the next device + if self.model_parallel: + for k, v in self.device_map.items(): + if i == v[-1] and 'cuda:' + str(k) != self.last_device: + hidden_states = hidden_states.to('cuda:' + str(k + 1)) + + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + if not return_dict: + return tuple(v for v in [ + hidden_states, + present_key_value_states, + all_hidden_states, + all_attentions, + all_cross_attentions, + ] if v is not None) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=present_key_value_states, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) + + +T5_START_DOCSTRING = r""" + + The T5 model was proposed in [Exploring the Limits of Transfer Learning with + a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by + Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, + Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder + transformer pre-trained in a text-to-text denoising generative setting. + + This model inherits from [`PreTrainedModel`]. 
Check the superclass + documentation for the generic methods the library implements for all its + model (such as downloading or saving, resizing the input embeddings, pruning + heads etc.) + + This model is also a PyTorch + [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch + documentation for all matter related to general usage and behavior. + + Parameters: + config ([`T5Config`]): Model configuration class with all the parameters + of the model. + Initializing with a config file does not load the weights associated + with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model + weights. +""" + +T5_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. T5 is a model + with relative position embeddings so you should be able to pad the + inputs on both the right and the left. + + Indices can be obtained using [`T5Tokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] + for detail. + + [What are input IDs?](../glossary#input-ids) + + To know more on how to prepare `input_ids` for pretraining take a + look a [T5 Training](./t5#training). + attention_mask (`torch.FloatTensor` of shape `(batch_size, + sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask + values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, + target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`T5Tokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] + for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + T5 uses the `pad_token_id` as the starting token for + `decoder_input_ids` generation. If `past_key_values` is used, + optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + To know more on how to prepare `decoder_input_ids` for pretraining + take a look at [T5 Training](./t5#training). + decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, + target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in + `decoder_input_ids`. Causal mask will also be used by default. + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, + num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules in the + encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or + `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules in the + decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or + `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in + the decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*, + `optional`: *attentions*) `last_hidden_state` of shape `(batch_size, + sequence_length, hidden_size)` is a sequence of hidden states at the + output of the last layer of the encoder. Used in the cross-attention + of the decoder. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length + `config.n_layers` with each tuple having 4 tensors of shape + `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention + blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only + the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead + of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to + directly pass an embedded representation. This is useful if you want + more control over how to convert `input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, + target_sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `decoder_input_ids` you can choose to + directly pass an embedded representation. If `past_key_values` is + used, optionally only the last `decoder_inputs_embeds` have to be + input (see `past_key_values`). This is useful if you want more + control over how to convert `decoder_input_ids` indices into + associated vectors than the model's internal embedding lookup + matrix. + + If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, + `decoder_inputs_embeds` takes the value of `inputs_embeds`. + + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned + and can be used to speed up decoding (see `past_key_values`). + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention + layers. See `attentions` under returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See + `hidden_states` under returned tensors for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain + tuple. +""" + +T5_ENCODER_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. T5 is a model + with relative position embeddings so you should be able to pad the + inputs on both the right and the left. + + Indices can be obtained using [`T5Tokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] + for detail. + + To know more on how to prepare `input_ids` for pretraining take a + look a [T5 Training](./t5#training). + attention_mask (`torch.FloatTensor` of shape `(batch_size, + sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask + values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, + num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask + values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to + directly pass an embedded representation. This is useful if you want + more control over how to convert `input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention + layers. See `attentions` under returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See + `hidden_states` under returned tensors for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain + tuple. +""" + +# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask +__HEAD_MASK_WARNING_MSG = """ +The input argument `head_mask` was split into two arguments `head_mask` and +`decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`, +but this feature is deprecated and will be removed in future versions. If you do +not want to use any `decoder_head_mask` now, please set `decoder_head_mask = +torch.ones(num_layers, num_heads)`. +""" + + +@add_start_docstrings( + 'The bare T5 Model transformer outputting raw hidden-states without any specific head on top.', + T5_START_DOCSTRING, +) +class T5Model(T5PreTrainedModel): + _keys_to_ignore_on_load_missing = [ + r'encoder\.embed_tokens\.weight', + r'decoder\.embed_tokens\.weight', + ] + _keys_to_ignore_on_load_unexpected = [ + r'decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight', + ] + + def __init__(self, config: T5Config): + super().__init__(config) + self.shared = nn.Embedding(config.vocab_size, config.d_model) + + encoder_config = copy.deepcopy(config) + encoder_config.is_decoder = False + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = T5Stack(encoder_config, self.shared) + + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + decoder_config.is_encoder_decoder = False + decoder_config.num_layers = config.num_decoder_layers + self.decoder = T5Stack(decoder_config, self.shared) + + # Initialize weights and apply final processing + self.post_init() + + # Model parallel + self.model_parallel = False + self.device_map = None + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + self.device_map = ( + get_device_map( + len(self.encoder.block), range(torch.cuda.device_count())) + if device_map is None else device_map) + assert_device_map(self.device_map, len(self.encoder.block)) + self.encoder.parallelize(self.device_map) + self.decoder.parallelize(self.device_map) + self.model_parallel = True + + @add_start_docstrings(DEPARALLELIZE_DOCSTRING) + def deparallelize(self): + self.encoder.deparallelize() + self.decoder.deparallelize() + self.encoder = self.encoder.to('cpu') + self.decoder = self.decoder.to('cpu') + self.model_parallel = False + self.device_map = None + 
torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.set_input_embeddings(new_embeddings) + self.decoder.set_input_embeddings(new_embeddings) + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of + heads to prune in this layer} See base class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) + @replace_return_docstrings( + output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + decoder_head_mask: Optional[torch.FloatTensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + decoder_inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]: + r""" + Returns: + + Example: + + ```python >>> from transformers import T5Tokenizer, T5Model + + >>> tokenizer = T5Tokenizer.from_pretrained("t5-small") + >>> model = T5Model.from_pretrained("t5-small") + + >>> input_ids = tokenizer( + ... 
"Studies have been shown that owning a dog is good for you", return_tensors="pt" + >>> ).input_ids # Batch size 1 + >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 + + >>> # forward pass + >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask + if head_mask is not None and decoder_head_mask is None: + if self.config.num_layers == self.config.num_decoder_layers: + warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) + decoder_head_mask = head_mask + + # Encode if needed (training, first prediction pass) + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] + if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] + if len(encoder_outputs) > 2 else None, + ) + + hidden_states = encoder_outputs[0] + if self.model_parallel: + torch.cuda.set_device(self.decoder.first_device) + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.decoder.first_device) + hidden_states = hidden_states.to(self.decoder.first_device) + if decoder_input_ids is not None: + decoder_input_ids = decoder_input_ids.to( + self.decoder.first_device) + if attention_mask is not None: + attention_mask = attention_mask.to(self.decoder.first_device) + if decoder_attention_mask is not None: + decoder_attention_mask = decoder_attention_mask.to( + self.decoder.first_device) + + # Decode + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + inputs_embeds=decoder_inputs_embeds, + past_key_values=past_key_values, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings("""T5 Model with a `language modeling` head on top.""", + T5_START_DOCSTRING) +class T5ForConditionalGeneration(T5PreTrainedModel): + _keys_to_ignore_on_load_missing = [ + r'encoder\.embed_tokens\.weight', + r'decoder\.embed_tokens\.weight', + r'lm_head\.weight', + ] + _keys_to_ignore_on_load_unexpected = [ + 
r'decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight', + ] + + def __init__(self, config: T5Config): + super().__init__(config) + self.model_dim = config.d_model + + self.shared = nn.Embedding(config.vocab_size, config.d_model) + + encoder_config = copy.deepcopy(config) + encoder_config.is_decoder = False + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = T5Stack(encoder_config, self.shared) + + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + decoder_config.is_encoder_decoder = False + decoder_config.num_layers = config.num_decoder_layers + self.decoder = T5Stack(decoder_config, self.shared) + + self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + # Model parallel + self.model_parallel = False + self.device_map = None + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + self.device_map = ( + get_device_map( + len(self.encoder.block), range(torch.cuda.device_count())) + if device_map is None else device_map) + assert_device_map(self.device_map, len(self.encoder.block)) + self.encoder.parallelize(self.device_map) + self.decoder.parallelize(self.device_map) + self.lm_head = self.lm_head.to(self.decoder.first_device) + self.model_parallel = True + + @add_start_docstrings(DEPARALLELIZE_DOCSTRING) + def deparallelize(self): + self.encoder.deparallelize() + self.decoder.deparallelize() + self.encoder = self.encoder.to('cpu') + self.decoder = self.decoder.to('cpu') + self.lm_head = self.lm_head.to('cpu') + self.model_parallel = False + self.device_map = None + torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.set_input_embeddings(new_embeddings) + self.decoder.set_input_embeddings(new_embeddings) + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def get_output_embeddings(self): + return self.lm_head + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) + @replace_return_docstrings( + output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + decoder_head_mask: Optional[torch.FloatTensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. + Indices should be in `[-100, 0, ..., config.vocab_size - 1]`. 
All
+            labels set to `-100` are ignored (masked), the loss is only computed
+            for labels in `[0, ..., config.vocab_size]`
+
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+        >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        >>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+        >>> # training
+        >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
+        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
+        >>> outputs = model(input_ids=input_ids, labels=labels)
+        >>> loss = outputs.loss
+        >>> logits = outputs.logits
+
+        >>> # inference
+        >>> input_ids = tokenizer(
+        ...     "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
+        >>> ).input_ids  # Batch size 1
+        >>> outputs = model.generate(input_ids)
+        >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+        >>> # studies have shown that owning a dog is good for you.
+        ```"""
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
+        if head_mask is not None and decoder_head_mask is None:
+            if self.config.num_layers == self.config.num_decoder_layers:
+                warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
+                decoder_head_mask = head_mask
+
+        # Encode if needed (training, first prediction pass)
+        if encoder_outputs is None:
+            # Convert encoder inputs in embeddings if needed
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                head_mask=head_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1]
+                if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2]
+                if len(encoder_outputs) > 2 else None,
+            )
+
+        hidden_states = encoder_outputs[0]
+
+        if self.model_parallel:
+            torch.cuda.set_device(self.decoder.first_device)
+
+        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
+            # get decoder inputs from shifting lm labels to the right
+            decoder_input_ids = self._shift_right(labels)
+
+        # Set device for model parallelism
+        if self.model_parallel:
+            torch.cuda.set_device(self.decoder.first_device)
+            hidden_states = hidden_states.to(self.decoder.first_device)
+            if decoder_input_ids is not None:
+                decoder_input_ids = decoder_input_ids.to(
+                    self.decoder.first_device)
+            if attention_mask is not None:
+                attention_mask = attention_mask.to(self.decoder.first_device)
+            if decoder_attention_mask is not None:
+                decoder_attention_mask = decoder_attention_mask.to(
+                    self.decoder.first_device)
+
+        # Decode
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            inputs_embeds=decoder_inputs_embeds,
+            past_key_values=past_key_values,
+            encoder_hidden_states=hidden_states,
+            encoder_attention_mask=attention_mask,
+            head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = decoder_outputs[0]
+
+        # Set device for model
parallelism + if self.model_parallel: + torch.cuda.set_device(self.encoder.first_device) + self.lm_head = self.lm_head.to(self.encoder.first_device) + sequence_output = sequence_output.to(self.lm_head.weight.device) + + if self.config.tie_word_embeddings: + # Rescale output before projecting on vocab See + # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 + sequence_output = sequence_output * (self.model_dim**-0.5) + + lm_logits = self.lm_head(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss(ignore_index=-100) + loss = loss_fct( + lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) + # TODO(thom): Add z_loss + # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 + + if not return_dict: + output = (lm_logits, ) + decoder_outputs[1:] + encoder_outputs + return ((loss, ) + output) if loss is not None else output + + return Seq2SeqLMOutput( + loss=loss, + logits=lm_logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + def prepare_inputs_for_generation(self, + input_ids, + past=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs): + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + 'decoder_input_ids': input_ids, + 'past_key_values': past, + 'encoder_outputs': encoder_outputs, + 'attention_mask': attention_mask, + 'head_mask': head_mask, + 'decoder_head_mask': decoder_head_mask, + 'cross_attn_head_mask': cross_attn_head_mask, + 'use_cache': use_cache, + } + + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return self._shift_right(labels) + + def _reorder_cache(self, past, beam_idx): + # if decoder past is not included in output + # speedy decoding is disabled and no need to reorder + if past is None: + logger.warning( + 'You might want to consider setting `use_cache=True` to speed up decoding' + ) + return past + + reordered_decoder_past = () + for layer_past_states in past: + # get the correct batch idx from layer past batch dim + # batch dim of `past` is at 2nd position + reordered_layer_past_states = () + for layer_past_state in layer_past_states: + # need to set correct `past` for each of the four key / value states + reordered_layer_past_states = reordered_layer_past_states + ( + layer_past_state.index_select( + 0, beam_idx.to(layer_past_state.device)), ) + + assert reordered_layer_past_states[0].shape == layer_past_states[ + 0].shape + assert len(reordered_layer_past_states) == len(layer_past_states) + + reordered_decoder_past = reordered_decoder_past + ( + reordered_layer_past_states, ) + return reordered_decoder_past + + +@add_start_docstrings( + "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.", + T5_START_DOCSTRING, +) +class T5EncoderModel(T5PreTrainedModel): + authorized_missing_keys = [ + r'encoder\.embed_tokens\.weight', + ] + + def __init__(self, config: T5Config): + super().__init__(config) + self.shared = 
nn.Embedding(config.vocab_size, config.d_model)
+
+        encoder_config = copy.deepcopy(config)
+        encoder_config.use_cache = False
+        encoder_config.is_encoder_decoder = False
+        self.encoder = T5Stack(encoder_config, self.shared)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+
+    @add_start_docstrings(PARALLELIZE_DOCSTRING)
+    def parallelize(self, device_map=None):
+        self.device_map = (
+            get_device_map(
+                len(self.encoder.block), range(torch.cuda.device_count()))
+            if device_map is None else device_map)
+        assert_device_map(self.device_map, len(self.encoder.block))
+        self.encoder.parallelize(self.device_map)
+        self.model_parallel = True
+
+    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
+    def deparallelize(self):
+        self.encoder.deparallelize()
+        self.encoder = self.encoder.to('cpu')
+        self.model_parallel = False
+        self.device_map = None
+        torch.cuda.empty_cache()
+
+    def get_input_embeddings(self):
+        return self.shared
+
+    def set_input_embeddings(self, new_embeddings):
+        self.shared = new_embeddings
+        self.encoder.set_input_embeddings(new_embeddings)
+
+    def get_encoder(self):
+        return self.encoder
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of
+        heads to prune in this layer} See base class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @add_start_docstrings_to_model_forward(T5_ENCODER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(
+        output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]:
+        r"""
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import T5Tokenizer, T5EncoderModel
+
+        >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        >>> model = T5EncoderModel.from_pretrained("t5-small")
+        >>> input_ids = tokenizer(
+        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
+        >>> ).input_ids  # Batch size 1
+        >>> outputs = model(input_ids=input_ids)
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        encoder_outputs = self.encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        return encoder_outputs
diff --git a/modelscope/models/nlp/T5/t5_for_text_generation.py b/modelscope/models/nlp/T5/t5_for_text_generation.py
new file mode 100644
index 00000000..27f077d8
--- /dev/null
+++ b/modelscope/models/nlp/T5/t5_for_text_generation.py
@@ -0,0 +1,56 @@
+from typing import Optional, Tuple
+
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Tensor, TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+from .modeling_t5 import T5Config
+from .modeling_t5 import T5ForConditionalGeneration as T5ForGeneration
+
+
+@MODELS.register_module(
+    group_key=Tasks.text2text_generation,
+    module_name=Models.T5,
+)
+class T5ForConditionalGeneration(TorchModel):
+
+    def __init__(self, model_dir=None, *args, **kwargs):
+        """Initialize the text2text generation model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+            *args, **kwargs: extra arguments forwarded to the `TorchModel`
+                base class constructor.
+        """
+        super().__init__(model_dir, *args, **kwargs)
+        self.model = T5ForGeneration.from_pretrained(model_dir)
+        self.generate = self.model.generate
+        self.config = self.model.config
+
+    def forward(self,
+                input_ids: Optional[torch.LongTensor] = None,
+                attention_mask: Optional[torch.FloatTensor] = None,
+                decoder_input_ids: Optional[torch.LongTensor] = None,
+                decoder_attention_mask: Optional[torch.BoolTensor] = None,
+                head_mask: Optional[torch.FloatTensor] = None,
+                decoder_head_mask: Optional[torch.FloatTensor] = None,
+                cross_attn_head_mask: Optional[torch.Tensor] = None,
+                encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+                past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+                inputs_embeds: Optional[torch.FloatTensor] = None,
+                decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+                labels: Optional[torch.LongTensor] = None,
+                use_cache: Optional[bool] = None,
+                output_attentions: Optional[bool] = None,
+                output_hidden_states: Optional[bool] = None,
+                return_dict: Optional[bool] = None,
+                **kwargs):
+        # Delegate to the bound method of the wrapped model; do not pass
+        # `self` again, otherwise it would be consumed as `input_ids`.
+        return self.model.forward(
+            input_ids, attention_mask, decoder_input_ids,
+            decoder_attention_mask, head_mask, decoder_head_mask,
+            cross_attn_head_mask, encoder_outputs, past_key_values,
+            inputs_embeds, decoder_inputs_embeds, labels, use_cache,
+            output_attentions, output_hidden_states, return_dict, **kwargs)
diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py
index 443cb214..152a32dc 100644
--- a/modelscope/models/nlp/__init__.py
+++ b/modelscope/models/nlp/__init__.py
@@ -32,7 +32,7 @@ if TYPE_CHECKING:
     from .token_classification import SbertForTokenClassification
     from .sentence_embedding import SentenceEmbedding
     from .passage_ranking import PassageRanking
-
+    from .T5 import T5ForConditionalGeneration
 else:
     _import_structure = {
         'backbones': ['SbertModel'],
@@ -68,6
+68,7 @@ else: 'table_question_answering': ['TableQuestionAnswering'], 'sentence_embedding': ['SentenceEmbedding'], 'passage_ranking': ['PassageRanking'], + 'T5': ['T5ForConditionalGeneration'], } import sys diff --git a/modelscope/outputs.py b/modelscope/outputs.py index b3eb9ad8..a80cbf33 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -390,12 +390,19 @@ TASK_OUTPUTS = { Tasks.text_error_correction: [OutputKeys.OUTPUT], Tasks.sentence_embedding: [OutputKeys.TEXT_EMBEDDING, OutputKeys.SCORES], Tasks.passage_ranking: [OutputKeys.SCORES], + # text generation result for single sample # { # "text": "this is the text generated by a model." # } Tasks.text_generation: [OutputKeys.TEXT], + # text generation result for single sample + # { + # "text": "北京" + # } + Tasks.text2text_generation: [OutputKeys.TEXT], + # fill mask result for single sample # { # "text": "this is the text which masks filled by model." diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index b5c53f82..a8edc21a 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -12,7 +12,7 @@ if TYPE_CHECKING: from .document_segmentation_pipeline import DocumentSegmentationPipeline from .faq_question_answering_pipeline import FaqQuestionAnsweringPipeline from .fill_mask_pipeline import FillMaskPipeline - from .fill_mask_ponet_pipeline import FillMaskPoNetPreprocessor + from .fill_mask_ponet_pipeline import FillMaskPonetPipeline from .information_extraction_pipeline import InformationExtractionPipeline from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline from .pair_sentence_classification_pipeline import PairSentenceClassificationPipeline @@ -22,6 +22,7 @@ if TYPE_CHECKING: from .text_classification_pipeline import TextClassificationPipeline from .text_error_correction_pipeline import TextErrorCorrectionPipeline from .text_generation_pipeline import TextGenerationPipeline + from .text2text_generation_pipeline import Text2TextGenerationPipeline from .token_classification_pipeline import TokenClassificationPipeline from .translation_pipeline import TranslationPipeline from .word_segmentation_pipeline import WordSegmentationPipeline @@ -54,6 +55,7 @@ else: 'text_classification_pipeline': ['TextClassificationPipeline'], 'text_error_correction_pipeline': ['TextErrorCorrectionPipeline'], 'text_generation_pipeline': ['TextGenerationPipeline'], + 'text2text_generation_pipeline': ['Text2TextGenerationPipeline'], 'token_classification_pipeline': ['TokenClassificationPipeline'], 'translation_pipeline': ['TranslationPipeline'], 'word_segmentation_pipeline': ['WordSegmentationPipeline'], diff --git a/modelscope/pipelines/nlp/text2text_generation_pipeline.py b/modelscope/pipelines/nlp/text2text_generation_pipeline.py new file mode 100644 index 00000000..9ccd00f4 --- /dev/null +++ b/modelscope/pipelines/nlp/text2text_generation_pipeline.py @@ -0,0 +1,87 @@ +from typing import Any, Dict, Optional, Union + +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models.base import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import Text2TextGenerationPreprocessor +from modelscope.utils.constant import Tasks + +__all__ = ['Text2TextGenerationPipeline'] + + +@PIPELINES.register_module( + Tasks.text2text_generation, module_name=Pipelines.text2text_generation) +class 
Text2TextGenerationPipeline(Pipeline):
+
+    def __init__(
+            self,
+            model: Union[Model, str],
+            preprocessor: Optional[Text2TextGenerationPreprocessor] = None,
+            first_sequence='sentence',
+            **kwargs):
+        """Use `model` and `preprocessor` to create a text2text generation pipeline for prediction.
+
+        Args:
+            model (str or Model): Supply either a local model dir that supports the text2text generation task,
+                a model id from the model hub, or a torch model instance.
+            preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits
+                the model if supplied.
+            first_sequence: The key to read the first sentence in.
+            sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value.
+
+            NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence'
+            param will have no effect.
+
+        Example:
+            >>> from modelscope.pipelines import pipeline
+            >>> pipeline_ins = pipeline(task='text2text-generation',
+            >>>    model='damo/t5-cn-base-test')
+            >>> sentence1 = '中国的首都位于。'
+            >>> print(pipeline_ins(sentence1))
+            >>> # Or use the dict input:
+            >>> print(pipeline_ins({'sentence': sentence1}))
+
+            To view other examples please check tests/pipelines/test_text2text_generation.py.
+        """
+        model = model if isinstance(model,
+                                    Model) else Model.from_pretrained(model)
+        if preprocessor is None:
+            preprocessor = Text2TextGenerationPreprocessor(
+                model.model_dir,
+                sequence_length=kwargs.pop('sequence_length', 128))
+        self.tokenizer = preprocessor.tokenizer
+        model.eval()
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+
+        forward_params['min_length'] = forward_params.get(
+            'min_length', self.model.config.min_length)
+        forward_params['max_length'] = forward_params.get(
+            'max_length', self.model.config.max_length)
+
+        with torch.no_grad():
+            output_ids = self.model.generate(**inputs, **forward_params)
+        return {'output_ids': output_ids}
+
+    def postprocess(self, inputs: Dict[str, Tensor],
+                    **postprocess_params) -> Dict[str, str]:
+        """Process the prediction results.
+
+        Args:
+            inputs (Dict[str, Tensor]): the model outputs containing 'output_ids'.
+
+        Returns:
+            Dict[str, str]: the prediction results
+        """
+        output = self.tokenizer.decode(
+            inputs['output_ids'][0],
+            skip_special_tokens=True,
+        )
+        return {OutputKeys.TEXT: output}
diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py
index ba03a35e..e37b3324 100644
--- a/modelscope/preprocessors/__init__.py
+++ b/modelscope/preprocessors/__init__.py
@@ -24,7 +24,7 @@ if TYPE_CHECKING:
         TextErrorCorrectionPreprocessor, FaqQuestionAnsweringPreprocessor,
         SequenceLabelingPreprocessor, RelationExtractionPreprocessor,
         DocumentSegmentationPreprocessor, FillMaskPoNetPreprocessor,
-        PassageRankingPreprocessor,
+        PassageRankingPreprocessor, Text2TextGenerationPreprocessor,
         WordSegmentationBlankSetToLabelPreprocessor)
     from .space import (DialogIntentPredictionPreprocessor,
                         DialogModelingPreprocessor,
@@ -57,6 +57,7 @@ else:
             'TextErrorCorrectionPreprocessor', 'FaqQuestionAnsweringPreprocessor',
             'SequenceLabelingPreprocessor', 'RelationExtractionPreprocessor',
+            'Text2TextGenerationPreprocessor',
             'WordSegmentationBlankSetToLabelPreprocessor',
             'DocumentSegmentationPreprocessor', 'FillMaskPoNetPreprocessor'
         ],
diff --git
a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index eee5e80f..f305df27 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -9,6 +9,7 @@ if TYPE_CHECKING: Tokenize, SequenceClassificationPreprocessor, TextGenerationPreprocessor, TokenClassificationPreprocessor, SingleSentenceClassificationPreprocessor, + Text2TextGenerationPreprocessor, PairSentenceClassificationPreprocessor, FillMaskPreprocessor, ZeroShotClassificationPreprocessor, NERPreprocessor, FaqQuestionAnsweringPreprocessor, SequenceLabelingPreprocessor, @@ -27,6 +28,7 @@ else: 'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor', 'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor', 'RelationExtractionPreprocessor', + 'Text2TextGenerationPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', 'DocumentSegmentationPreprocessor', 'FillMaskPoNetPreprocessor' ], diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index 0a2495af..d294f517 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -26,6 +26,7 @@ __all__ = [ 'Tokenize', 'SequenceClassificationPreprocessor', 'TextGenerationPreprocessor', 'TokenClassificationPreprocessor', 'PairSentenceClassificationPreprocessor', + 'Text2TextGenerationPreprocessor', 'SingleSentenceClassificationPreprocessor', 'FillMaskPreprocessor', 'ZeroShotClassificationPreprocessor', 'NERPreprocessor', 'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor', @@ -442,6 +443,40 @@ class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase): return features +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.text2text_gen_preprocessor) +class Text2TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in text generation. 
+ """ + + def __init__(self, + model_dir: str, + tokenizer=None, + mode=ModeKeys.INFERENCE, + **kwargs): + self.tokenizer = self.build_tokenizer( + model_dir) if tokenizer is None else tokenizer + kwargs['truncation'] = kwargs.get('truncation', 'do_not_truncate') + kwargs['padding'] = kwargs.get('padding', False) + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + False) + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + super().__init__(model_dir, pair=False, mode=mode, **kwargs) + + def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: + text_a, _, _ = self.parse_text_and_label(data) + + inputs = self.tokenizer( + text_a, + return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, + **self.tokenize_kwargs) + + # This is produced by tokenizers but is an invalid generate kwargs + if 'token_type_ids' in inputs: + del inputs['token_type_ids'] + return inputs + + @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.text_gen_tokenizer) class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index d6b0da40..4c5d2f41 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -97,6 +97,7 @@ class NLPTasks(object): token_classification = 'token-classification' conversational = 'conversational' text_generation = 'text-generation' + text2text_generation = 'text2text-generation' task_oriented_conversation = 'task-oriented-conversation' dialog_intent_prediction = 'dialog-intent-prediction' dialog_state_tracking = 'dialog-state-tracking' diff --git a/tests/pipelines/test_text2text_generation.py b/tests/pipelines/test_text2text_generation.py new file mode 100644 index 00000000..04cecf93 --- /dev/null +++ b/tests/pipelines/test_text2text_generation.py @@ -0,0 +1,61 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
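Taken together, the preprocessor above and the `Text2TextGenerationPipeline` added earlier implement a tokenize → generate → decode loop. A rough equivalent using only the underlying Hugging Face classes, for orientation (the `t5-small` checkpoint and the prompt are stand-ins, not the model shipped with this task):

```python
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small').eval()

# Preprocess: PyTorch tensors for generate(); drop token_type_ids if present,
# since generate() does not accept it.
inputs = tokenizer('translate English to German: The house is wonderful.',
                   return_tensors='pt')
inputs.pop('token_type_ids', None)

# Forward: no gradients at inference; length bounds go in as generate kwargs.
with torch.no_grad():
    output_ids = model.generate(**inputs, max_length=128)

# Postprocess: decode the first sequence, skipping special tokens.
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```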
+import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.nlp import T5ForConditionalGeneration +from modelscope.pipelines import pipeline +from modelscope.pipelines.nlp import Text2TextGenerationPipeline +from modelscope.preprocessors import Text2TextGenerationPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.model_id = 'damo/t5-cn-base-test' + self.input = '中国的首都位于。' + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_T5(self): + cache_path = snapshot_download(self.model_id) + model = T5ForConditionalGeneration(cache_path) + preprocessor = Text2TextGenerationPreprocessor(cache_path) + pipeline1 = Text2TextGenerationPipeline(model, preprocessor) + pipeline2 = pipeline( + Tasks.text2text_generation, model=model, preprocessor=preprocessor) + print( + f'pipeline1: {pipeline1(self.input)}\npipeline2: {pipeline2(self.input)}' + ) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_pipeline_with_model_instance(self): + model = Model.from_pretrained(self.model_id) + preprocessor = Text2TextGenerationPreprocessor(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.text2text_generation, + model=model, + preprocessor=preprocessor) + print(pipeline_ins(self.input)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_pipeline_with_model_id(self): + pipeline_ins = pipeline( + task=Tasks.text2text_generation, model=self.model_id) + print(pipeline_ins(self.input)) + + @unittest.skip( + 'only for test cases, there is no default official model yet') + def test_run_pipeline_without_model_id(self): + pipeline_ins = pipeline(task=Tasks.text2text_generation) + print(pipeline_ins(self.input)) + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() From 4dbdc45963a769d43afe2d75c1ebc7964c359c9d Mon Sep 17 00:00:00 2001 From: "hanyuan.chy" Date: Mon, 26 Sep 2022 13:23:32 +0800 Subject: [PATCH 03/23] test(data): add test data Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10246518 --- data/test/videos/Walking.54138969.mp4 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/test/videos/Walking.54138969.mp4 b/data/test/videos/Walking.54138969.mp4 index 1716695f..d4355290 100644 --- a/data/test/videos/Walking.54138969.mp4 +++ b/data/test/videos/Walking.54138969.mp4 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7b8f50a0537bfe7e082c5ad91b2b7ece61a0adbeb7489988e553909276bf920c -size 44217644 +oid sha256:7663f9a32ea57086bf66c4b9e9ebe0fd418986c67716c7be02ca917e72ddc0ba +size 8155895 From b876839d51b81a14e6caaba87d6fb0c9f646a0c8 Mon Sep 17 00:00:00 2001 From: "shuying.shu" Date: Mon, 26 Sep 2022 14:03:35 +0800 Subject: [PATCH 04/23] [to #42322933]adjust output form adjust output form for movie scene segmentation demo Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10244194 --- .../models/cv/movie_scene_segmentation/model.py | 4 ++-- .../cv/movie_scene_segmentation/utils/save_op.py | 13 ++++++------- modelscope/outputs.py | 11 +++++------ 
.../cv/movie_scene_segmentation_pipeline.py | 4 ++-- 4 files changed, 15 insertions(+), 17 deletions(-) diff --git a/modelscope/models/cv/movie_scene_segmentation/model.py b/modelscope/models/cv/movie_scene_segmentation/model.py index 676b5ac1..1232d427 100644 --- a/modelscope/models/cv/movie_scene_segmentation/model.py +++ b/modelscope/models/cv/movie_scene_segmentation/model.py @@ -162,11 +162,11 @@ class MovieSceneSegmentationModel(TorchModel): thres = self.cfg.pipeline.save_threshold anno_dict = get_pred_boundary(pred_dict, thres) - scene_dict, scene_list = pred2scene(self.shot2keyf, anno_dict) + scene_dict_lst, scene_list = pred2scene(self.shot2keyf, anno_dict) if self.cfg.pipeline.save_split_scene: re_dir = scene2video(inputs['input_video_pth'], scene_list, thres) print(f'Split scene video saved to {re_dir}') - return len(scene_list), scene_dict + return len(scene_list), scene_dict_lst def preprocess(self, inputs): logger.info('Begin shot detect......') diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py index cf26d21a..6361c056 100644 --- a/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py +++ b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py @@ -21,16 +21,15 @@ def get_pred_boundary(pred_dict, threshold=0.5): def pred2scene(shot2keyf, anno_dict): scene_list, pair_list = get_demo_scene_list(shot2keyf, anno_dict) - scene_dict = {} + scene_dict_lst = [] assert len(scene_list) == len(pair_list) for scene_ind, scene_item in enumerate(scene_list): - scene_dict.update( - {scene_ind: { - 'shot': pair_list[scene_ind], - 'frame': scene_item - }}) + scene_dict_lst.append({ + 'shot': pair_list[scene_ind], + 'frame': scene_item + }) - return scene_dict, scene_list + return scene_dict_lst, scene_list def scene2video(source_movie_fn, scene_list, thres): diff --git a/modelscope/outputs.py b/modelscope/outputs.py index a80cbf33..052d4f33 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -38,7 +38,7 @@ class OutputKeys(object): HISTORY = 'history' TIMESTAMPS = 'timestamps' SPLIT_VIDEO_NUM = 'split_video_num' - SPLIT_META_DICT = 'split_meta_dict' + SPLIT_META_LIST = 'split_meta_list' TASK_OUTPUTS = { @@ -293,18 +293,17 @@ TASK_OUTPUTS = { # movide scene segmentation result for a single video # { # "split_video_num":3, - # "split_meta_dict": - # { - # scene_id: + # "split_meta_list": + # [ # { # "shot": [0,1,2], # "frame": [start_frame, end_frame] # } - # } + # ] # # } Tasks.movie_scene_segmentation: - [OutputKeys.SPLIT_VIDEO_NUM, OutputKeys.SPLIT_META_DICT], + [OutputKeys.SPLIT_VIDEO_NUM, OutputKeys.SPLIT_META_LIST], # ============ nlp tasks =================== diff --git a/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py b/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py index b5acf17a..6704e4c0 100644 --- a/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py +++ b/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py @@ -60,9 +60,9 @@ class MovieSceneSegmentationPipeline(Pipeline): def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: data = {'input_video_pth': self.input_video_pth, 'feat': inputs} - video_num, meta_dict = self.model.postprocess(data) + video_num, meta_lst = self.model.postprocess(data) result = { OutputKeys.SPLIT_VIDEO_NUM: video_num, - OutputKeys.SPLIT_META_DICT: meta_dict + OutputKeys.SPLIT_META_LIST: meta_lst } return result From bd4127bc27120f460f90f5f75832d8d3830e5b06 Mon Sep 17 00:00:00 2001 
From: "tianchu.gtc" Date: Mon, 26 Sep 2022 15:49:35 +0800 Subject: [PATCH 05/23] =?UTF-8?q?[to=20#42322933]segformer=20=E6=8E=A5?= =?UTF-8?q?=E5=85=A5demo=E6=8E=A5=E5=8F=A3=E6=9B=B4=E6=94=B9=20=20=20=20?= =?UTF-8?q?=20=20=20=20=20Link:=20https://code.alibaba-inc.com/Ali-MaaS/Ma?= =?UTF-8?q?aS-lib/codereview/10253628?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../easycv_pipelines/segmentation_pipeline.py | 24 ++++++++++++++ .../test_segmentation_pipeline.py | 32 ++++++++++--------- 2 files changed, 41 insertions(+), 15 deletions(-) diff --git a/modelscope/pipelines/cv/easycv_pipelines/segmentation_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/segmentation_pipeline.py index 2182e3b3..bd09fc9b 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/segmentation_pipeline.py +++ b/modelscope/pipelines/cv/easycv_pipelines/segmentation_pipeline.py @@ -1,5 +1,10 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any + +import numpy as np + from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys from modelscope.pipelines.builder import PIPELINES from modelscope.utils.constant import Tasks from .base import EasyCVPipeline @@ -21,3 +26,22 @@ class EasyCVSegmentationPipeline(EasyCVPipeline): model_file_pattern=model_file_pattern, *args, **kwargs) + + def __call__(self, inputs) -> Any: + outputs = self.predict_op(inputs) + + semantic_result = outputs[0]['seg_pred'] + + ids = np.unique(semantic_result)[::-1] + legal_indices = ids != len(self.predict_op.CLASSES) # for VOID label + ids = ids[legal_indices] + segms = (semantic_result[None] == ids[:, None, None]) + masks = [it.astype(np.int) for it in segms] + labels_txt = np.array(self.predict_op.CLASSES)[ids].tolist() + + results = { + OutputKeys.MASKS: masks, + OutputKeys.LABELS: labels_txt, + OutputKeys.SCORES: [0.999 for _ in range(len(labels_txt))] + } + return results diff --git a/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py b/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py index 80ab36a6..5f6dac4b 100644 --- a/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py +++ b/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py @@ -2,30 +2,34 @@ import unittest from distutils.version import LooseVersion +import cv2 import easycv import numpy as np from PIL import Image +from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import semantic_seg_masks_to_image +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class EasyCVSegmentationPipelineTest(unittest.TestCase): - +class EasyCVSegmentationPipelineTest(unittest.TestCase, + DemoCompatibilityCheck): img_path = 'data/test/images/image_segmentation.jpg' - def _internal_test_(self, model_id): - img = np.asarray(Image.open(self.img_path)) + def setUp(self) -> None: + self.task = Tasks.image_segmentation + self.model_id = 'damo/cv_segformer-b0_image_semantic-segmentation_coco-stuff164k' + def _internal_test_(self, model_id): semantic_seg = pipeline(task=Tasks.image_segmentation, model=model_id) outputs = semantic_seg(self.img_path) - self.assertEqual(len(outputs), 1) - - results = outputs[0] - self.assertListEqual( - list(img.shape)[:2], list(results['seg_pred'].shape)) + draw_img = semantic_seg_masks_to_image(outputs[OutputKeys.MASKS]) + cv2.imwrite('result.jpg', draw_img) + 
print('test ' + model_id + ' DONE') def _internal_test_batch_(self, model_id, num_samples=2, batch_size=2): # TODO: support in the future @@ -49,37 +53,35 @@ class EasyCVSegmentationPipelineTest(unittest.TestCase): def test_segformer_b0(self): model_id = 'damo/cv_segformer-b0_image_semantic-segmentation_coco-stuff164k' self._internal_test_(model_id) - self._internal_test_batch_(model_id) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_segformer_b1(self): model_id = 'damo/cv_segformer-b1_image_semantic-segmentation_coco-stuff164k' self._internal_test_(model_id) - self._internal_test_batch_(model_id) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_segformer_b2(self): model_id = 'damo/cv_segformer-b2_image_semantic-segmentation_coco-stuff164k' self._internal_test_(model_id) - self._internal_test_batch_(model_id) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_segformer_b3(self): model_id = 'damo/cv_segformer-b3_image_semantic-segmentation_coco-stuff164k' self._internal_test_(model_id) - self._internal_test_batch_(model_id) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_segformer_b4(self): model_id = 'damo/cv_segformer-b4_image_semantic-segmentation_coco-stuff164k' self._internal_test_(model_id) - self._internal_test_batch_(model_id) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_segformer_b5(self): model_id = 'damo/cv_segformer-b5_image_semantic-segmentation_coco-stuff164k' self._internal_test_(model_id) - self._internal_test_batch_(model_id) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() if __name__ == '__main__': From f844f73b03ed5c47ef6e32ec9359c8984af8a02a Mon Sep 17 00:00:00 2001 From: "leyuan.hjy" Date: Mon, 26 Sep 2022 15:52:03 +0800 Subject: [PATCH 06/23] =?UTF-8?q?[to=20#42322933]=E4=BF=AE=E5=A4=8Dnano?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E5=88=9D=E5=A7=8B=E5=8C=96/=E5=A2=9E?= =?UTF-8?q?=E5=8A=A0=E6=96=87=E4=BB=B6copyright=E4=BF=A1=E6=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修复nano模型初始化/增加文件copyright信息 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10247456 --- .../cv/realtime_object_detection/realtime_detector.py | 7 ++++++- .../yolox/exp/default/yolox_nano.py | 3 ++- .../pipelines/cv/realtime_object_detection_pipeline.py | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/modelscope/models/cv/realtime_object_detection/realtime_detector.py b/modelscope/models/cv/realtime_object_detection/realtime_detector.py index b147f769..2b4b3f8c 100644 --- a/modelscope/models/cv/realtime_object_detection/realtime_detector.py +++ b/modelscope/models/cv/realtime_object_detection/realtime_detector.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
import argparse import logging as logger import os @@ -48,6 +49,7 @@ class RealtimeDetector(TorchModel): self.nmsthre = self.exp.nmsthre self.test_size = self.exp.test_size self.preproc = ValTransform(legacy=False) + self.label_mapping = self.config['labels'] def inference(self, img): with torch.no_grad(): @@ -81,5 +83,8 @@ class RealtimeDetector(TorchModel): bboxes = outputs[0][:, 0:4].cpu().numpy() / self.ratio scores = outputs[0][:, 5].cpu().numpy() labels = outputs[0][:, 6].cpu().int().numpy() + pred_label_names = [] + for lab in labels: + pred_label_names.append(self.label_mapping[lab]) - return bboxes, scores, labels + return bboxes, scores, pred_label_names diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_nano.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_nano.py index 330eef16..7bada485 100644 --- a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_nano.py +++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_nano.py @@ -42,5 +42,6 @@ class YoloXNanoExp(YoloXExp): act=self.act, depthwise=True) self.model = YOLOX(backbone, head) - + self.model.apply(init_yolo) + self.model.head.initialize_biases(1e-2) return self.model diff --git a/modelscope/pipelines/cv/realtime_object_detection_pipeline.py b/modelscope/pipelines/cv/realtime_object_detection_pipeline.py index 629720d1..9f558f88 100644 --- a/modelscope/pipelines/cv/realtime_object_detection_pipeline.py +++ b/modelscope/pipelines/cv/realtime_object_detection_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os.path as osp from typing import Any, Dict, List, Union From 65cce5b9976db9873ceb3fa1687903546f679e0d Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Mon, 26 Sep 2022 16:12:17 +0800 Subject: [PATCH 07/23] [to #44902165] bump version to 0.4.5 --- modelscope/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/version.py b/modelscope/version.py index 9a8e054a..68eb9b68 100644 --- a/modelscope/version.py +++ b/modelscope/version.py @@ -1 +1 @@ -__version__ = '0.4.4' +__version__ = '0.4.5' From c498d88d48a8c8cdd85c963322795914dabc9f42 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Mon, 26 Sep 2022 17:38:13 +0800 Subject: [PATCH 08/23] [to #42322933] add license declaration 1. 
add license declaration Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10216802 --- .../metrics/sequence_classification_metric.py | 2 ++ modelscope/metrics/text_generation_metric.py | 2 ++ .../metrics/token_classification_metric.py | 2 ++ .../models/multi_modal/mplug/clip/__init__.py | 2 ++ .../models/multi_modal/mplug/predictor.py | 16 +++++++++++++ .../models/multi_modal/mplug_for_all_tasks.py | 2 ++ modelscope/models/nlp/backbones/structbert.py | 1 + .../nlp/bart_for_text_error_correction.py | 1 + .../nlp/bert_for_sequence_classification.py | 1 + .../models/nlp/csanmt_for_translation.py | 3 +++ .../nlp/gpt3/gpt3_for_text_generation.py | 1 + modelscope/models/nlp/gpt3/modeling_gpt3.py | 1 + .../nlp/heads/infromation_extraction_head.py | 5 +--- .../nlp/heads/sequence_classification_head.py | 1 + .../nlp/heads/token_classification_head.py | 1 + .../models/nlp/heads/torch_pretrain_head.py | 1 + modelscope/models/nlp/masked_language.py | 3 +-- .../nlp/nncrf_for_named_entity_recognition.py | 6 +++-- .../models/nlp/palm_v2/modeling_palm.py | 16 +++++++++++++ .../nlp/palm_v2/palm_for_text_generation.py | 1 + modelscope/models/nlp/passage_ranking.py | 2 ++ modelscope/models/nlp/sentence_embedding.py | 4 ++-- .../models/nlp/sequence_classification.py | 2 ++ .../nlp/task_models/information_extraction.py | 5 +--- .../task_models/sequence_classification.py | 1 + .../models/nlp/task_models/task_model.py | 1 + .../nlp/task_models/token_classification.py | 1 + modelscope/models/nlp/token_classification.py | 2 ++ .../nlp/dialog_state_tracking_pipeline.py | 2 ++ .../nlp/distributed_plug_pipeline.py | 2 ++ .../nlp/faq_question_answering_pipeline.py | 2 ++ .../pipelines/nlp/fill_mask_pipeline.py | 2 ++ .../nlp/information_extraction_pipeline.py | 5 ++-- .../nlp/named_entity_recognition_pipeline.py | 2 ++ .../pair_sentence_classification_pipeline.py | 2 ++ .../pipelines/nlp/passage_ranking_pipeline.py | 2 ++ .../nlp/sentence_embedding_pipeline.py | 2 ++ .../sequence_classification_pipeline_base.py | 2 ++ ...single_sentence_classification_pipeline.py | 2 ++ .../nlp/text_error_correction_pipeline.py | 2 ++ .../pipelines/nlp/text_generation_pipeline.py | 2 ++ .../nlp/token_classification_pipeline.py | 2 ++ .../pipelines/nlp/translation_pipeline.py | 2 ++ .../nlp/word_segmentation_pipeline.py | 2 ++ .../nlp/zero_shot_classification_pipeline.py | 2 ++ modelscope/preprocessors/__init__.py | 3 ++- modelscope/preprocessors/nlp/__init__.py | 1 + modelscope/preprocessors/nlp/nlp_base.py | 24 ++++++++++++------- .../nlp/csanmt_translation_trainer.py | 2 ++ .../trainers/nlp/passage_ranking_trainer.py | 2 ++ .../nlp/sequence_classification_trainer.py | 2 ++ .../nlp/space/dialog_intent_trainer.py | 2 ++ .../nlp/space/dialog_modeling_trainer.py | 2 ++ .../nlp/space/metrics/metrics_tracker.py | 4 +--- modelscope/trainers/nlp_trainer.py | 2 ++ 55 files changed, 139 insertions(+), 28 deletions(-) diff --git a/modelscope/metrics/sequence_classification_metric.py b/modelscope/metrics/sequence_classification_metric.py index d795d8a2..51a829ef 100644 --- a/modelscope/metrics/sequence_classification_metric.py +++ b/modelscope/metrics/sequence_classification_metric.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ from typing import Dict import numpy as np diff --git a/modelscope/metrics/text_generation_metric.py b/modelscope/metrics/text_generation_metric.py index 6bdcbc58..f154281d 100644 --- a/modelscope/metrics/text_generation_metric.py +++ b/modelscope/metrics/text_generation_metric.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Dict from modelscope.metainfo import Metrics diff --git a/modelscope/metrics/token_classification_metric.py b/modelscope/metrics/token_classification_metric.py index 53d13b6a..05b72170 100644 --- a/modelscope/metrics/token_classification_metric.py +++ b/modelscope/metrics/token_classification_metric.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import importlib from typing import Dict, List, Optional, Union diff --git a/modelscope/models/multi_modal/mplug/clip/__init__.py b/modelscope/models/multi_modal/mplug/clip/__init__.py index 05826f46..e6007a04 100644 --- a/modelscope/models/multi_modal/mplug/clip/__init__.py +++ b/modelscope/models/multi_modal/mplug/clip/__init__.py @@ -1 +1,3 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from .clip import load_from_config diff --git a/modelscope/models/multi_modal/mplug/predictor.py b/modelscope/models/multi_modal/mplug/predictor.py index c976baa1..6375d1d7 100755 --- a/modelscope/models/multi_modal/mplug/predictor.py +++ b/modelscope/models/multi_modal/mplug/predictor.py @@ -1,3 +1,19 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from __future__ import print_function import torch diff --git a/modelscope/models/multi_modal/mplug_for_all_tasks.py b/modelscope/models/multi_modal/mplug_for_all_tasks.py index d61fea10..64a7dd7b 100644 --- a/modelscope/models/multi_modal/mplug_for_all_tasks.py +++ b/modelscope/models/multi_modal/mplug_for_all_tasks.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os.path as osp from typing import Dict, List diff --git a/modelscope/models/nlp/backbones/structbert.py b/modelscope/models/nlp/backbones/structbert.py index f47900c3..74735520 100644 --- a/modelscope/models/nlp/backbones/structbert.py +++ b/modelscope/models/nlp/backbones/structbert.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from modelscope.metainfo import Models from modelscope.models.base import TorchModel from modelscope.models.builder import BACKBONES diff --git a/modelscope/models/nlp/bart_for_text_error_correction.py b/modelscope/models/nlp/bart_for_text_error_correction.py index 2339f221..27abedb5 100644 --- a/modelscope/models/nlp/bart_for_text_error_correction.py +++ b/modelscope/models/nlp/bart_for_text_error_correction.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
import os.path as osp from typing import Any, Dict diff --git a/modelscope/models/nlp/bert_for_sequence_classification.py b/modelscope/models/nlp/bert_for_sequence_classification.py index 75105f36..2b1a3b3b 100644 --- a/modelscope/models/nlp/bert_for_sequence_classification.py +++ b/modelscope/models/nlp/bert_for_sequence_classification.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os from typing import Any, Dict diff --git a/modelscope/models/nlp/csanmt_for_translation.py b/modelscope/models/nlp/csanmt_for_translation.py index 83b58060..4bac8e6d 100644 --- a/modelscope/models/nlp/csanmt_for_translation.py +++ b/modelscope/models/nlp/csanmt_for_translation.py @@ -1,3 +1,6 @@ +# Part of the implementation is borrowed and modified from THUMT, +# publicly available at https://github.com/THUNLP-MT/THUMT +# Copyright 2017-2022 The Alibaba MT Team Authors. All rights reserved. import math from collections import namedtuple from typing import Dict diff --git a/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py b/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py index fe1402e8..d686ea30 100644 --- a/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py +++ b/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from typing import Dict from modelscope.metainfo import Models diff --git a/modelscope/models/nlp/gpt3/modeling_gpt3.py b/modelscope/models/nlp/gpt3/modeling_gpt3.py index 69e9ba7c..498d15de 100644 --- a/modelscope/models/nlp/gpt3/modeling_gpt3.py +++ b/modelscope/models/nlp/gpt3/modeling_gpt3.py @@ -1,3 +1,4 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/modelscope/models/nlp/heads/infromation_extraction_head.py b/modelscope/models/nlp/heads/infromation_extraction_head.py index cf957834..6c3388f0 100644 --- a/modelscope/models/nlp/heads/infromation_extraction_head.py +++ b/modelscope/models/nlp/heads/infromation_extraction_head.py @@ -1,13 +1,10 @@ -from typing import Dict - +# Copyright (c) Alibaba, Inc. and its affiliates. import torch -import torch.nn.functional as F from torch import nn from modelscope.metainfo import Heads from modelscope.models.base import TorchHead from modelscope.models.builder import HEADS -from modelscope.outputs import OutputKeys from modelscope.utils.constant import Tasks diff --git a/modelscope/models/nlp/heads/sequence_classification_head.py b/modelscope/models/nlp/heads/sequence_classification_head.py index e608f035..fb03b7ff 100644 --- a/modelscope/models/nlp/heads/sequence_classification_head.py +++ b/modelscope/models/nlp/heads/sequence_classification_head.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from typing import Dict import torch diff --git a/modelscope/models/nlp/heads/token_classification_head.py b/modelscope/models/nlp/heads/token_classification_head.py index 481524ae..ace3deac 100644 --- a/modelscope/models/nlp/heads/token_classification_head.py +++ b/modelscope/models/nlp/heads/token_classification_head.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import Dict import torch diff --git a/modelscope/models/nlp/heads/torch_pretrain_head.py b/modelscope/models/nlp/heads/torch_pretrain_head.py index 6ff6c96f..fb54637b 100644 --- a/modelscope/models/nlp/heads/torch_pretrain_head.py +++ b/modelscope/models/nlp/heads/torch_pretrain_head.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from typing import Dict import torch diff --git a/modelscope/models/nlp/masked_language.py b/modelscope/models/nlp/masked_language.py index 4f466c23..514a04cd 100644 --- a/modelscope/models/nlp/masked_language.py +++ b/modelscope/models/nlp/masked_language.py @@ -1,6 +1,5 @@ -from typing import Any, Dict, Optional, Union +# Copyright (c) Alibaba, Inc. and its affiliates. -import numpy as np from transformers import BertForMaskedLM as BertForMaskedLMTransformer from modelscope.metainfo import Models diff --git a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py b/modelscope/models/nlp/nncrf_for_named_entity_recognition.py index 37216510..62198ed2 100644 --- a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py +++ b/modelscope/models/nlp/nncrf_for_named_entity_recognition.py @@ -1,3 +1,7 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. All rights reserved. +# The CRF implementation borrows mostly from AllenNLP CRF module (https://github.com/allenai/allennlp) +# and pytorch-crf (https://github.com/kmkurn/pytorch-crf) with some modifications. + import os from typing import Any, Dict, List, Optional @@ -208,8 +212,6 @@ class CRF(nn.Module): Learning*. Morgan Kaufmann. pp. 282–289. .. _Viterbi algorithm: https://en.wikipedia.org/wiki/Viterbi_algorithm - The implementation borrows mostly from AllenNLP CRF module (https://github.com/allenai/allennlp) - and pytorch-crf (https://github.com/kmkurn/pytorch-crf) with some modifications. """ def __init__(self, num_tags: int, batch_first: bool = False) -> None: diff --git a/modelscope/models/nlp/palm_v2/modeling_palm.py b/modelscope/models/nlp/palm_v2/modeling_palm.py index 99b00454..f395ebd4 100644 --- a/modelscope/models/nlp/palm_v2/modeling_palm.py +++ b/modelscope/models/nlp/palm_v2/modeling_palm.py @@ -1,3 +1,19 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import codecs import copy import math diff --git a/modelscope/models/nlp/palm_v2/palm_for_text_generation.py b/modelscope/models/nlp/palm_v2/palm_for_text_generation.py index ae92427e..2c37afd6 100644 --- a/modelscope/models/nlp/palm_v2/palm_for_text_generation.py +++ b/modelscope/models/nlp/palm_v2/palm_for_text_generation.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import Dict, List from modelscope.metainfo import Models diff --git a/modelscope/models/nlp/passage_ranking.py b/modelscope/models/nlp/passage_ranking.py index 68bca231..2a06ce45 100644 --- a/modelscope/models/nlp/passage_ranking.py +++ b/modelscope/models/nlp/passage_ranking.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict import numpy as np diff --git a/modelscope/models/nlp/sentence_embedding.py b/modelscope/models/nlp/sentence_embedding.py index 955c0e53..340c133f 100644 --- a/modelscope/models/nlp/sentence_embedding.py +++ b/modelscope/models/nlp/sentence_embedding.py @@ -1,7 +1,7 @@ -import os +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict -import json import numpy as np from modelscope.metainfo import Models diff --git a/modelscope/models/nlp/sequence_classification.py b/modelscope/models/nlp/sequence_classification.py index e8802dbd..a8930e68 100644 --- a/modelscope/models/nlp/sequence_classification.py +++ b/modelscope/models/nlp/sequence_classification.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from abc import abstractmethod from torch import nn diff --git a/modelscope/models/nlp/task_models/information_extraction.py b/modelscope/models/nlp/task_models/information_extraction.py index 20a44787..4792d07c 100644 --- a/modelscope/models/nlp/task_models/information_extraction.py +++ b/modelscope/models/nlp/task_models/information_extraction.py @@ -1,7 +1,7 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict import numpy as np -import torch from modelscope.metainfo import TaskModels from modelscope.models.builder import MODELS @@ -9,9 +9,6 @@ from modelscope.models.nlp.task_models.task_model import \ SingleBackboneTaskModelBase from modelscope.outputs import OutputKeys from modelscope.utils.constant import Tasks -from modelscope.utils.hub import parse_label_mapping -from modelscope.utils.tensor_utils import (torch_nested_detach, - torch_nested_numpify) __all__ = ['InformationExtractionModel'] diff --git a/modelscope/models/nlp/task_models/sequence_classification.py b/modelscope/models/nlp/task_models/sequence_classification.py index 80bfd476..43a96327 100644 --- a/modelscope/models/nlp/task_models/sequence_classification.py +++ b/modelscope/models/nlp/task_models/sequence_classification.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os from typing import Any, Dict diff --git a/modelscope/models/nlp/task_models/task_model.py b/modelscope/models/nlp/task_models/task_model.py index 104b4c32..e93dd5f6 100644 --- a/modelscope/models/nlp/task_models/task_model.py +++ b/modelscope/models/nlp/task_models/task_model.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os.path import re from abc import ABC diff --git a/modelscope/models/nlp/task_models/token_classification.py b/modelscope/models/nlp/task_models/token_classification.py index 29679838..5c22098f 100644 --- a/modelscope/models/nlp/task_models/token_classification.py +++ b/modelscope/models/nlp/task_models/token_classification.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import Any, Dict import numpy as np diff --git a/modelscope/models/nlp/token_classification.py b/modelscope/models/nlp/token_classification.py index 0be921d0..c3723a61 100644 --- a/modelscope/models/nlp/token_classification.py +++ b/modelscope/models/nlp/token_classification.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from abc import abstractmethod from typing import Dict diff --git a/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py b/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py index 0d2c96d7..79d32ace 100644 --- a/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py +++ b/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict, Union from modelscope.metainfo import Pipelines diff --git a/modelscope/pipelines/nlp/distributed_plug_pipeline.py b/modelscope/pipelines/nlp/distributed_plug_pipeline.py index 202e6213..e5c05e86 100644 --- a/modelscope/pipelines/nlp/distributed_plug_pipeline.py +++ b/modelscope/pipelines/nlp/distributed_plug_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict import torch diff --git a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py index 65831a17..1d46d8fd 100644 --- a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict, Union import torch diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py index db6b61c6..12f4b80f 100644 --- a/modelscope/pipelines/nlp/fill_mask_pipeline.py +++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os from typing import Any, Dict, Optional, Union diff --git a/modelscope/pipelines/nlp/information_extraction_pipeline.py b/modelscope/pipelines/nlp/information_extraction_pipeline.py index 4cb138d6..07223d07 100644 --- a/modelscope/pipelines/nlp/information_extraction_pipeline.py +++ b/modelscope/pipelines/nlp/information_extraction_pipeline.py @@ -1,11 +1,12 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict, Optional, Union import torch from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import (Preprocessor, RelationExtractionPreprocessor) diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py index 8fbdde86..467d7aba 100644 --- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py +++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ from typing import Any, Dict, Optional, Union import torch diff --git a/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py index 5248db8c..bdb75c73 100644 --- a/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py +++ b/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Union from modelscope.models.base import Model diff --git a/modelscope/pipelines/nlp/passage_ranking_pipeline.py b/modelscope/pipelines/nlp/passage_ranking_pipeline.py index c03e7b93..1d818ac0 100644 --- a/modelscope/pipelines/nlp/passage_ranking_pipeline.py +++ b/modelscope/pipelines/nlp/passage_ranking_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict, Optional, Union import torch diff --git a/modelscope/pipelines/nlp/sentence_embedding_pipeline.py b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py index 3ef6d06b..16dedb2e 100644 --- a/modelscope/pipelines/nlp/sentence_embedding_pipeline.py +++ b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict, Optional, Union import torch diff --git a/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py b/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py index 28bbc732..3d8e8fea 100644 --- a/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py +++ b/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict, Union import numpy as np diff --git a/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py index 844c6839..0a2f6d25 100644 --- a/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py +++ b/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Union from ...metainfo import Pipelines diff --git a/modelscope/pipelines/nlp/text_error_correction_pipeline.py b/modelscope/pipelines/nlp/text_error_correction_pipeline.py index b63d8d36..8e9bf85d 100644 --- a/modelscope/pipelines/nlp/text_error_correction_pipeline.py +++ b/modelscope/pipelines/nlp/text_error_correction_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict, Optional, Union import torch diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py index 3d27ffa9..ea35763f 100644 --- a/modelscope/pipelines/nlp/text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text_generation_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict, Optional, Union import torch diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py index 804f8146..aabf48d8 100644 --- a/modelscope/pipelines/nlp/token_classification_pipeline.py +++ b/modelscope/pipelines/nlp/token_classification_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ from typing import Any, Dict, Optional, Union import torch diff --git a/modelscope/pipelines/nlp/translation_pipeline.py b/modelscope/pipelines/nlp/translation_pipeline.py index e4893577..eb7f7f74 100644 --- a/modelscope/pipelines/nlp/translation_pipeline.py +++ b/modelscope/pipelines/nlp/translation_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py index 7e8b22bc..9d4bb67f 100644 --- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py +++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict, Optional, Union import torch diff --git a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py index 38c0ee77..fc7051c7 100644 --- a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py +++ b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict, Union import torch diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index e37b3324..b4be1845 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -24,7 +24,8 @@ if TYPE_CHECKING: TextErrorCorrectionPreprocessor, FaqQuestionAnsweringPreprocessor, SequenceLabelingPreprocessor, RelationExtractionPreprocessor, DocumentSegmentationPreprocessor, FillMaskPoNetPreprocessor, - PassageRankingPreprocessor, Text2TextGenerationPreprocessor, + PassageRankingPreprocessor, SentenceEmbeddingPreprocessor, + Text2TextGenerationPreprocessor, WordSegmentationBlankSetToLabelPreprocessor) from .space import (DialogIntentPredictionPreprocessor, DialogModelingPreprocessor, diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index f305df27..8e75ae98 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -15,6 +15,7 @@ if TYPE_CHECKING: FaqQuestionAnsweringPreprocessor, SequenceLabelingPreprocessor, RelationExtractionPreprocessor, DocumentSegmentationPreprocessor, FillMaskPoNetPreprocessor, PassageRankingPreprocessor, + SentenceEmbeddingPreprocessor, WordSegmentationBlankSetToLabelPreprocessor) else: diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index d294f517..d6325eed 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -23,16 +23,24 @@ from modelscope.utils.type_assert import type_assert logger = get_logger() __all__ = [ - 'Tokenize', 'SequenceClassificationPreprocessor', - 'TextGenerationPreprocessor', 'TokenClassificationPreprocessor', + 'Tokenize', + 'SequenceClassificationPreprocessor', + 'TextGenerationPreprocessor', + 'TokenClassificationPreprocessor', 'PairSentenceClassificationPreprocessor', 'Text2TextGenerationPreprocessor', - 'SingleSentenceClassificationPreprocessor', 'FillMaskPreprocessor', - 'ZeroShotClassificationPreprocessor', 'NERPreprocessor', - 'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor', - 'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor', - 'RelationExtractionPreprocessor', 'DocumentSegmentationPreprocessor', - 'FillMaskPoNetPreprocessor' + 
'SingleSentenceClassificationPreprocessor', + 'FillMaskPreprocessor', + 'ZeroShotClassificationPreprocessor', + 'NERPreprocessor', + 'SentenceEmbeddingPreprocessor', + 'PassageRankingPreprocessor', + 'FaqQuestionAnsweringPreprocessor', + 'SequenceLabelingPreprocessor', + 'RelationExtractionPreprocessor', + 'DocumentSegmentationPreprocessor', + 'FillMaskPoNetPreprocessor', + 'WordSegmentationBlankSetToLabelPreprocessor', ] diff --git a/modelscope/trainers/nlp/csanmt_translation_trainer.py b/modelscope/trainers/nlp/csanmt_translation_trainer.py index 62ae91a8..c93599c7 100644 --- a/modelscope/trainers/nlp/csanmt_translation_trainer.py +++ b/modelscope/trainers/nlp/csanmt_translation_trainer.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os.path as osp from typing import Dict, Optional diff --git a/modelscope/trainers/nlp/passage_ranking_trainer.py b/modelscope/trainers/nlp/passage_ranking_trainer.py index e54c2904..711fd0c4 100644 --- a/modelscope/trainers/nlp/passage_ranking_trainer.py +++ b/modelscope/trainers/nlp/passage_ranking_trainer.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import time from dataclasses import dataclass from typing import Any, Callable, Dict, List, Optional, Tuple, Union diff --git a/modelscope/trainers/nlp/sequence_classification_trainer.py b/modelscope/trainers/nlp/sequence_classification_trainer.py index 64fd59b4..ec46e037 100644 --- a/modelscope/trainers/nlp/sequence_classification_trainer.py +++ b/modelscope/trainers/nlp/sequence_classification_trainer.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import time from typing import Dict, Optional, Tuple, Union diff --git a/modelscope/trainers/nlp/space/dialog_intent_trainer.py b/modelscope/trainers/nlp/space/dialog_intent_trainer.py index c559ee5b..2e59cd80 100644 --- a/modelscope/trainers/nlp/space/dialog_intent_trainer.py +++ b/modelscope/trainers/nlp/space/dialog_intent_trainer.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os import time from typing import Callable, Dict, Optional, Tuple, Union diff --git a/modelscope/trainers/nlp/space/dialog_modeling_trainer.py b/modelscope/trainers/nlp/space/dialog_modeling_trainer.py index 6bdd8a3a..726404d4 100644 --- a/modelscope/trainers/nlp/space/dialog_modeling_trainer.py +++ b/modelscope/trainers/nlp/space/dialog_modeling_trainer.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os import time from typing import Callable, Dict, Optional, Tuple, Union diff --git a/modelscope/trainers/nlp/space/metrics/metrics_tracker.py b/modelscope/trainers/nlp/space/metrics/metrics_tracker.py index 865600d3..340077a6 100644 --- a/modelscope/trainers/nlp/space/metrics/metrics_tracker.py +++ b/modelscope/trainers/nlp/space/metrics/metrics_tracker.py @@ -1,6 +1,4 @@ -""" -MetricsTracker class -""" +# Copyright (c) Alibaba, Inc. and its affiliates. import math from collections import defaultdict diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py index 4a14be31..b54aa666 100644 --- a/modelscope/trainers/nlp_trainer.py +++ b/modelscope/trainers/nlp_trainer.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ import os from typing import Callable, Optional, Tuple, Union From c8be0e8b7837ef4d31c8a8c33d9238b0516a5d15 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Tue, 27 Sep 2022 09:45:19 +0800 Subject: [PATCH 09/23] [to #44902165] remove device placement for image cartoon to avoid full gpu memory usage Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10260495 --- modelscope/pipelines/cv/image_cartoon_pipeline.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/modelscope/pipelines/cv/image_cartoon_pipeline.py b/modelscope/pipelines/cv/image_cartoon_pipeline.py index 72fda989..787aa06d 100644 --- a/modelscope/pipelines/cv/image_cartoon_pipeline.py +++ b/modelscope/pipelines/cv/image_cartoon_pipeline.py @@ -37,15 +37,12 @@ class ImageCartoonPipeline(Pipeline): model: model id on modelscope hub. """ super().__init__(model=model, **kwargs) - with device_placement(self.framework, self.device_name): - self.facer = FaceAna(self.model) - with tf.Graph().as_default(): - self.sess_anime_head = self.load_sess( - os.path.join(self.model, 'cartoon_h.pb'), - 'model_anime_head') - self.sess_anime_bg = self.load_sess( - os.path.join(self.model, 'cartoon_bg.pb'), - 'model_anime_bg') + self.facer = FaceAna(self.model) + with tf.Graph().as_default(): + self.sess_anime_head = self.load_sess( + os.path.join(self.model, 'cartoon_h.pb'), 'model_anime_head') + self.sess_anime_bg = self.load_sess( + os.path.join(self.model, 'cartoon_bg.pb'), 'model_anime_bg') self.box_width = 288 global_mask = cv2.imread(os.path.join(self.model, 'alpha.jpg')) From 26df8f198820c3c079e38c8fdb94c2fd4d836581 Mon Sep 17 00:00:00 2001 From: "wendi.hwd" Date: Tue, 27 Sep 2022 15:01:05 +0800 Subject: [PATCH 10/23] [to #42322933]add semantic-segmentation task output is numpy mask for demo-service Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10265856 --- modelscope/models/cv/salient_detection/salient_model.py | 3 ++- modelscope/outputs.py | 6 ++++++ .../pipelines/cv/image_salient_detection_pipeline.py | 8 ++------ modelscope/utils/constant.py | 1 + tests/pipelines/test_salient_detection.py | 5 ++--- 5 files changed, 13 insertions(+), 10 deletions(-) diff --git a/modelscope/models/cv/salient_detection/salient_model.py b/modelscope/models/cv/salient_detection/salient_model.py index 6e617f58..73c3c3fb 100644 --- a/modelscope/models/cv/salient_detection/salient_model.py +++ b/modelscope/models/cv/salient_detection/salient_model.py @@ -14,7 +14,8 @@ from modelscope.utils.constant import ModelFile, Tasks from .models import U2NET -@MODELS.register_module(Tasks.image_segmentation, module_name=Models.detection) +@MODELS.register_module( + Tasks.semantic_segmentation, module_name=Models.detection) class SalientDetection(TorchModel): def __init__(self, model_dir: str, *args, **kwargs): diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 052d4f33..b19f7e43 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -151,6 +151,12 @@ TASK_OUTPUTS = { Tasks.image_segmentation: [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.MASKS], + # semantic segmentation result for single sample + # { + # "masks": [np.array # 2D array containing only 0, 255] + # } + Tasks.semantic_segmentation: [OutputKeys.MASKS], + # image matting result for single sample # { # "output_img": np.array with shape(h, w, 4) diff --git a/modelscope/pipelines/cv/image_salient_detection_pipeline.py b/modelscope/pipelines/cv/image_salient_detection_pipeline.py index 433275ba..3b145cf0 100644 --- 
a/modelscope/pipelines/cv/image_salient_detection_pipeline.py +++ b/modelscope/pipelines/cv/image_salient_detection_pipeline.py @@ -9,7 +9,7 @@ from modelscope.utils.constant import Tasks @PIPELINES.register_module( - Tasks.image_segmentation, module_name=Pipelines.salient_detection) + Tasks.semantic_segmentation, module_name=Pipelines.salient_detection) class ImageSalientDetectionPipeline(Pipeline): def __init__(self, model: str, **kwargs): @@ -39,9 +39,5 @@ class ImageSalientDetectionPipeline(Pipeline): def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: data = self.model.postprocess(inputs) - outputs = { - OutputKeys.SCORES: None, - OutputKeys.LABELS: None, - OutputKeys.MASKS: data - } + outputs = {OutputKeys.MASKS: data} return outputs diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 4c5d2f41..de3d933f 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -38,6 +38,7 @@ class CVTasks(object): image_object_detection = 'image-object-detection' image_segmentation = 'image-segmentation' + semantic_segmentation = 'semantic-segmentation' portrait_matting = 'portrait-matting' text_driven_segmentation = 'text-driven-segmentation' shop_segmentation = 'shop-segmentation' diff --git a/tests/pipelines/test_salient_detection.py b/tests/pipelines/test_salient_detection.py index e87e9388..bcb904e6 100644 --- a/tests/pipelines/test_salient_detection.py +++ b/tests/pipelines/test_salient_detection.py @@ -11,17 +11,16 @@ from modelscope.utils.test_utils import test_level class SalientDetectionTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: - self.task = Tasks.image_segmentation + self.task = Tasks.semantic_segmentation self.model_id = 'damo/cv_u2net_salient-detection' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_salient_detection(self): input_location = 'data/test/images/image_salient_detection.jpg' model_id = 'damo/cv_u2net_salient-detection' salient_detect = pipeline(Tasks.semantic_segmentation, model=model_id) result = salient_detect(input_location) import cv2 - # result[OutputKeys.MASKS] is salient map result,other keys are not used cv2.imwrite(input_location + '_salient.jpg', result[OutputKeys.MASKS]) @unittest.skip('demo compatibility test is only enabled on a needed-basis') From e90ff9e4795129eb8d64a2c4b67b3833217c7e1b Mon Sep 17 00:00:00 2001 From: "jiaqi.sjq" Date: Tue, 27 Sep 2022 22:09:30 +0800 Subject: [PATCH 11/23] [to #42322933] tts sambert am changes from tensorflow to PyTorch and add licenses * [to #41669377] docs and tools refinement and release 1. add build_doc linter script 2. add sphinx-docs support 3. add development doc and api doc 4. 
change version to 0.1.0 for the first internal release version Link: https://code.aone.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/8775307 --- .../models/audio/tts/models/__init__.py | 9 - .../models/audio/tts/models/am_models.py | 460 ------- modelscope/models/audio/tts/models/compat.py | 82 -- .../tts/{text => models/datasets}/__init__.py | 0 .../tts/models/datasets/kantts_data4fs.py | 238 ++++ .../audio/tts/models/datasets/samplers.py | 131 ++ .../tts/models/datasets/units/__init__.py | 3 + .../tts/models/datasets/units/cleaners.py | 88 ++ .../tts/models/datasets/units/ling_unit.py | 395 ++++++ .../datasets/units}/numbers.py | 3 + modelscope/models/audio/tts/models/fsmn.py | 273 ---- .../models/audio/tts/models/fsmn_encoder.py | 178 --- modelscope/models/audio/tts/models/helpers.py | 159 --- .../audio/tts/models/models/__init__.py | 0 .../tts/models/models/hifigan/__init__.py | 3 + .../tts/models/models/hifigan/hifigan.py | 238 ++++ .../tts/models/models/sambert/__init__.py | 3 + .../tts/models/models/sambert/adaptors.py | 131 ++ .../audio/tts/models/models/sambert/base.py | 369 ++++++ .../audio/tts/models/models/sambert/fsmn.py | 126 ++ .../models/models/sambert/kantts_sambert.py | 718 ++++++++++ .../tts/models/models/sambert/positions.py | 101 ++ .../models/audio/tts/models/position.py | 174 --- modelscope/models/audio/tts/models/reducer.py | 155 --- .../models/audio/tts/models/rnn_wrappers.py | 237 ---- .../models/audio/tts/models/robutrans.py | 760 ----------- .../tts/models/self_attention_decoder.py | 817 ------------ .../tts/models/self_attention_encoder.py | 182 --- .../models/audio/tts/models/transformer.py | 1157 ----------------- modelscope/models/audio/tts/models/utils.py | 59 - .../models/audio/tts/models/utils/__init__.py | 3 + .../models/audio/tts/models/utils/utils.py | 136 ++ .../models/audio/tts/models/vocoder_models.py | 516 -------- modelscope/models/audio/tts/sambert_hifi.py | 34 +- modelscope/models/audio/tts/text/cleaners.py | 89 -- modelscope/models/audio/tts/text/cmudict.py | 64 - modelscope/models/audio/tts/text/symbols.py | 105 -- .../models/audio/tts/text/symbols_dict.py | 200 --- modelscope/models/audio/tts/voice.py | 333 ++--- .../audio/text_to_speech_pipeline.py | 5 + modelscope/utils/audio/tts_exceptions.py | 3 +- requirements/audio.txt | 5 - tests/pipelines/test_text_to_speech.py | 5 +- 43 files changed, 2799 insertions(+), 5948 deletions(-) mode change 100755 => 100644 modelscope/models/audio/tts/models/__init__.py delete mode 100755 modelscope/models/audio/tts/models/am_models.py delete mode 100755 modelscope/models/audio/tts/models/compat.py rename modelscope/models/audio/tts/{text => models/datasets}/__init__.py (100%) mode change 100755 => 100644 create mode 100644 modelscope/models/audio/tts/models/datasets/kantts_data4fs.py create mode 100644 modelscope/models/audio/tts/models/datasets/samplers.py create mode 100644 modelscope/models/audio/tts/models/datasets/units/__init__.py create mode 100644 modelscope/models/audio/tts/models/datasets/units/cleaners.py create mode 100644 modelscope/models/audio/tts/models/datasets/units/ling_unit.py rename modelscope/models/audio/tts/{text => models/datasets/units}/numbers.py (94%) mode change 100755 => 100644 delete mode 100755 modelscope/models/audio/tts/models/fsmn.py delete mode 100755 modelscope/models/audio/tts/models/fsmn_encoder.py delete mode 100755 modelscope/models/audio/tts/models/helpers.py create mode 100644 modelscope/models/audio/tts/models/models/__init__.py create mode 100644 
modelscope/models/audio/tts/models/models/hifigan/__init__.py create mode 100755 modelscope/models/audio/tts/models/models/hifigan/hifigan.py create mode 100644 modelscope/models/audio/tts/models/models/sambert/__init__.py create mode 100644 modelscope/models/audio/tts/models/models/sambert/adaptors.py create mode 100644 modelscope/models/audio/tts/models/models/sambert/base.py create mode 100644 modelscope/models/audio/tts/models/models/sambert/fsmn.py create mode 100644 modelscope/models/audio/tts/models/models/sambert/kantts_sambert.py create mode 100644 modelscope/models/audio/tts/models/models/sambert/positions.py delete mode 100755 modelscope/models/audio/tts/models/position.py delete mode 100755 modelscope/models/audio/tts/models/reducer.py delete mode 100755 modelscope/models/audio/tts/models/rnn_wrappers.py delete mode 100755 modelscope/models/audio/tts/models/robutrans.py delete mode 100755 modelscope/models/audio/tts/models/self_attention_decoder.py delete mode 100755 modelscope/models/audio/tts/models/self_attention_encoder.py delete mode 100755 modelscope/models/audio/tts/models/transformer.py delete mode 100755 modelscope/models/audio/tts/models/utils.py create mode 100644 modelscope/models/audio/tts/models/utils/__init__.py create mode 100755 modelscope/models/audio/tts/models/utils/utils.py delete mode 100755 modelscope/models/audio/tts/models/vocoder_models.py delete mode 100755 modelscope/models/audio/tts/text/cleaners.py delete mode 100755 modelscope/models/audio/tts/text/cmudict.py delete mode 100644 modelscope/models/audio/tts/text/symbols.py delete mode 100644 modelscope/models/audio/tts/text/symbols_dict.py diff --git a/modelscope/models/audio/tts/models/__init__.py b/modelscope/models/audio/tts/models/__init__.py old mode 100755 new mode 100644 index c260d4fe..e69de29b --- a/modelscope/models/audio/tts/models/__init__.py +++ b/modelscope/models/audio/tts/models/__init__.py @@ -1,9 +0,0 @@ -from .robutrans import RobuTrans -from .vocoder_models import Generator - - -def create_am_model(name, hparams): - if name == 'robutrans': - return RobuTrans(hparams) - else: - raise Exception('Unknown model: ' + name) diff --git a/modelscope/models/audio/tts/models/am_models.py b/modelscope/models/audio/tts/models/am_models.py deleted file mode 100755 index cd43ff12..00000000 --- a/modelscope/models/audio/tts/models/am_models.py +++ /dev/null @@ -1,460 +0,0 @@ -import tensorflow as tf - - -def encoder_prenet(inputs, - n_conv_layers, - filters, - kernel_size, - dense_units, - is_training, - mask=None, - scope='encoder_prenet'): - x = inputs - with tf.variable_scope(scope): - for i in range(n_conv_layers): - x = conv1d( - x, - filters, - kernel_size, - is_training, - activation=tf.nn.relu, - dropout=True, - mask=mask, - scope='conv1d_{}'.format(i)) - x = tf.layers.dense( - x, units=dense_units, activation=None, name='dense') - return x - - -def decoder_prenet(inputs, - prenet_units, - dense_units, - is_training, - scope='decoder_prenet'): - x = inputs - with tf.variable_scope(scope): - for i, units in enumerate(prenet_units): - x = tf.layers.dense( - x, - units=units, - activation=tf.nn.relu, - name='dense_{}'.format(i)) - x = tf.layers.dropout( - x, rate=0.5, training=is_training, name='dropout_{}'.format(i)) - x = tf.layers.dense( - x, units=dense_units, activation=None, name='dense') - return x - - -def encoder(inputs, - input_lengths, - n_conv_layers, - filters, - kernel_size, - lstm_units, - is_training, - embedded_inputs_speaker, - mask=None, - scope='encoder'): - with 
tf.variable_scope(scope): - x = conv_and_lstm( - inputs, - input_lengths, - n_conv_layers, - filters, - kernel_size, - lstm_units, - is_training, - embedded_inputs_speaker, - mask=mask) - return x - - -def prenet(inputs, prenet_units, is_training, scope='prenet'): - x = inputs - with tf.variable_scope(scope): - for i, units in enumerate(prenet_units): - x = tf.layers.dense( - x, - units=units, - activation=tf.nn.relu, - name='dense_{}'.format(i)) - x = tf.layers.dropout( - x, rate=0.5, training=is_training, name='dropout_{}'.format(i)) - return x - - -def postnet_residual_ulstm(inputs, - n_conv_layers, - filters, - kernel_size, - lstm_units, - output_units, - is_training, - scope='postnet_residual_ulstm'): - with tf.variable_scope(scope): - x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size, - lstm_units, is_training) - x = conv1d( - x, - output_units, - kernel_size, - is_training, - activation=None, - dropout=False, - scope='conv1d_{}'.format(n_conv_layers - 1)) - return x - - -def postnet_residual_lstm(inputs, - n_conv_layers, - filters, - kernel_size, - lstm_units, - output_units, - is_training, - scope='postnet_residual_lstm'): - with tf.variable_scope(scope): - x = conv_and_lstm(inputs, None, n_conv_layers, filters, kernel_size, - lstm_units, is_training) - x = conv1d( - x, - output_units, - kernel_size, - is_training, - activation=None, - dropout=False, - scope='conv1d_{}'.format(n_conv_layers - 1)) - return x - - -def postnet_linear_ulstm(inputs, - n_conv_layers, - filters, - kernel_size, - lstm_units, - output_units, - is_training, - scope='postnet_linear'): - with tf.variable_scope(scope): - x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size, - lstm_units, is_training) - x = tf.layers.dense(x, units=output_units) - return x - - -def postnet_linear_lstm(inputs, - n_conv_layers, - filters, - kernel_size, - lstm_units, - output_units, - output_lengths, - is_training, - embedded_inputs_speaker2, - mask=None, - scope='postnet_linear'): - with tf.variable_scope(scope): - x = conv_and_lstm_dec( - inputs, - output_lengths, - n_conv_layers, - filters, - kernel_size, - lstm_units, - is_training, - embedded_inputs_speaker2, - mask=mask) - x = tf.layers.dense(x, units=output_units) - return x - - -def postnet_linear(inputs, - n_conv_layers, - filters, - kernel_size, - lstm_units, - output_units, - output_lengths, - is_training, - embedded_inputs_speaker2, - mask=None, - scope='postnet_linear'): - with tf.variable_scope(scope): - x = conv_dec( - inputs, - output_lengths, - n_conv_layers, - filters, - kernel_size, - lstm_units, - is_training, - embedded_inputs_speaker2, - mask=mask) - return x - - -def conv_and_lstm(inputs, - sequence_lengths, - n_conv_layers, - filters, - kernel_size, - lstm_units, - is_training, - embedded_inputs_speaker, - mask=None, - scope='conv_and_lstm'): - from tensorflow.contrib.rnn import LSTMBlockCell - x = inputs - with tf.variable_scope(scope): - for i in range(n_conv_layers): - x = conv1d( - x, - filters, - kernel_size, - is_training, - activation=tf.nn.relu, - dropout=True, - mask=mask, - scope='conv1d_{}'.format(i)) - - x = tf.concat([x, embedded_inputs_speaker], axis=2) - - outputs, states = tf.nn.bidirectional_dynamic_rnn( - LSTMBlockCell(lstm_units), - LSTMBlockCell(lstm_units), - x, - sequence_length=sequence_lengths, - dtype=tf.float32) - x = tf.concat(outputs, axis=-1) - - return x - - -def conv_and_lstm_dec(inputs, - sequence_lengths, - n_conv_layers, - filters, - kernel_size, - lstm_units, - is_training, - 
embedded_inputs_speaker2, - mask=None, - scope='conv_and_lstm'): - x = inputs - from tensorflow.contrib.rnn import LSTMBlockCell - with tf.variable_scope(scope): - for i in range(n_conv_layers): - x = conv1d( - x, - filters, - kernel_size, - is_training, - activation=tf.nn.relu, - dropout=True, - mask=mask, - scope='conv1d_{}'.format(i)) - - x = tf.concat([x, embedded_inputs_speaker2], axis=2) - - outputs, states = tf.nn.bidirectional_dynamic_rnn( - LSTMBlockCell(lstm_units), - LSTMBlockCell(lstm_units), - x, - sequence_length=sequence_lengths, - dtype=tf.float32) - x = tf.concat(outputs, axis=-1) - return x - - -def conv_dec(inputs, - sequence_lengths, - n_conv_layers, - filters, - kernel_size, - lstm_units, - is_training, - embedded_inputs_speaker2, - mask=None, - scope='conv_and_lstm'): - x = inputs - with tf.variable_scope(scope): - for i in range(n_conv_layers): - x = conv1d( - x, - filters, - kernel_size, - is_training, - activation=tf.nn.relu, - dropout=True, - mask=mask, - scope='conv1d_{}'.format(i)) - x = tf.concat([x, embedded_inputs_speaker2], axis=2) - return x - - -def conv_and_ulstm(inputs, - sequence_lengths, - n_conv_layers, - filters, - kernel_size, - lstm_units, - is_training, - scope='conv_and_ulstm'): - x = inputs - with tf.variable_scope(scope): - for i in range(n_conv_layers): - x = conv1d( - x, - filters, - kernel_size, - is_training, - activation=tf.nn.relu, - dropout=True, - scope='conv1d_{}'.format(i)) - - outputs, states = tf.nn.dynamic_rnn( - LSTMBlockCell(lstm_units), - x, - sequence_length=sequence_lengths, - dtype=tf.float32) - - return outputs - - -def conv1d(inputs, - filters, - kernel_size, - is_training, - activation=None, - dropout=False, - mask=None, - scope='conv1d'): - with tf.variable_scope(scope): - if mask is not None: - inputs = inputs * tf.expand_dims(mask, -1) - x = tf.layers.conv1d( - inputs, filters=filters, kernel_size=kernel_size, padding='same') - if mask is not None: - x = x * tf.expand_dims(mask, -1) - - x = tf.layers.batch_normalization(x, training=is_training) - if activation is not None: - x = activation(x) - if dropout: - x = tf.layers.dropout(x, rate=0.5, training=is_training) - return x - - -def conv1d_dp(inputs, - filters, - kernel_size, - is_training, - activation=None, - dropout=False, - dropoutrate=0.5, - mask=None, - scope='conv1d'): - with tf.variable_scope(scope): - if mask is not None: - inputs = inputs * tf.expand_dims(mask, -1) - x = tf.layers.conv1d( - inputs, filters=filters, kernel_size=kernel_size, padding='same') - if mask is not None: - x = x * tf.expand_dims(mask, -1) - - x = tf.contrib.layers.layer_norm(x) - if activation is not None: - x = activation(x) - if dropout: - x = tf.layers.dropout(x, rate=dropoutrate, training=is_training) - return x - - -def duration_predictor(inputs, - n_conv_layers, - filters, - kernel_size, - lstm_units, - input_lengths, - is_training, - embedded_inputs_speaker, - mask=None, - scope='duration_predictor'): - with tf.variable_scope(scope): - x = inputs - for i in range(n_conv_layers): - x = conv1d_dp( - x, - filters, - kernel_size, - is_training, - activation=tf.nn.relu, - dropout=True, - dropoutrate=0.1, - mask=mask, - scope='conv1d_{}'.format(i)) - - x = tf.concat([x, embedded_inputs_speaker], axis=2) - - outputs, states = tf.nn.bidirectional_dynamic_rnn( - LSTMBlockCell(lstm_units), - LSTMBlockCell(lstm_units), - x, - sequence_length=input_lengths, - dtype=tf.float32) - x = tf.concat(outputs, axis=-1) - - x = tf.layers.dense(x, units=1) - x = tf.nn.relu(x) - return x - - -def 
duration_predictor2(inputs, - n_conv_layers, - filters, - kernel_size, - input_lengths, - is_training, - mask=None, - scope='duration_predictor'): - with tf.variable_scope(scope): - x = inputs - for i in range(n_conv_layers): - x = conv1d_dp( - x, - filters, - kernel_size, - is_training, - activation=tf.nn.relu, - dropout=True, - dropoutrate=0.1, - mask=mask, - scope='conv1d_{}'.format(i)) - - x = tf.layers.dense(x, units=1) - x = tf.nn.relu(x) - return x - - -def conv_prenet(inputs, - n_conv_layers, - filters, - kernel_size, - is_training, - mask=None, - scope='conv_prenet'): - x = inputs - with tf.variable_scope(scope): - for i in range(n_conv_layers): - x = conv1d( - x, - filters, - kernel_size, - is_training, - activation=tf.nn.relu, - dropout=True, - mask=mask, - scope='conv1d_{}'.format(i)) - - return x diff --git a/modelscope/models/audio/tts/models/compat.py b/modelscope/models/audio/tts/models/compat.py deleted file mode 100755 index bb810841..00000000 --- a/modelscope/models/audio/tts/models/compat.py +++ /dev/null @@ -1,82 +0,0 @@ -"""Functions for compatibility with different TensorFlow versions.""" - -import tensorflow as tf - - -def is_tf2(): - """Returns ``True`` if running TensorFlow 2.0.""" - return tf.__version__.startswith('2') - - -def tf_supports(symbol): - """Returns ``True`` if TensorFlow defines :obj:`symbol`.""" - return _string_to_tf_symbol(symbol) is not None - - -def tf_any(*symbols): - """Returns the first supported symbol.""" - for symbol in symbols: - module = _string_to_tf_symbol(symbol) - if module is not None: - return module - return None - - -def tf_compat(v2=None, v1=None): # pylint: disable=invalid-name - """Returns the compatible symbol based on the current TensorFlow version. - - Args: - v2: The candidate v2 symbol name. - v1: The candidate v1 symbol name. - - Returns: - A TensorFlow symbol. - - Raises: - ValueError: if no symbol can be found. 
- """ - candidates = [] - if v2 is not None: - candidates.append(v2) - if v1 is not None: - candidates.append(v1) - candidates.append('compat.v1.%s' % v1) - symbol = tf_any(*candidates) - if symbol is None: - raise ValueError('Failure to resolve the TensorFlow symbol') - return symbol - - -def name_from_variable_scope(name=''): - """Creates a name prefixed by the current variable scope.""" - var_scope = tf_compat(v1='get_variable_scope')().name - compat_name = '' - if name: - compat_name = '%s/' % name - if var_scope: - compat_name = '%s/%s' % (var_scope, compat_name) - return compat_name - - -def reuse(): - """Returns ``True`` if the current variable scope is marked for reuse.""" - return tf_compat(v1='get_variable_scope')().reuse - - -def _string_to_tf_symbol(symbol): - modules = symbol.split('.') - namespace = tf - for module in modules: - namespace = getattr(namespace, module, None) - if namespace is None: - return None - return namespace - - -# pylint: disable=invalid-name -gfile_copy = tf_compat(v2='io.gfile.copy', v1='gfile.Copy') -gfile_exists = tf_compat(v2='io.gfile.exists', v1='gfile.Exists') -gfile_open = tf_compat(v2='io.gfile.GFile', v1='gfile.GFile') -is_tensor = tf_compat(v2='is_tensor', v1='contrib.framework.is_tensor') -logging = tf_compat(v1='logging') -nest = tf_compat(v2='nest', v1='contrib.framework.nest') diff --git a/modelscope/models/audio/tts/text/__init__.py b/modelscope/models/audio/tts/models/datasets/__init__.py old mode 100755 new mode 100644 similarity index 100% rename from modelscope/models/audio/tts/text/__init__.py rename to modelscope/models/audio/tts/models/datasets/__init__.py diff --git a/modelscope/models/audio/tts/models/datasets/kantts_data4fs.py b/modelscope/models/audio/tts/models/datasets/kantts_data4fs.py new file mode 100644 index 00000000..cc47d0c4 --- /dev/null +++ b/modelscope/models/audio/tts/models/datasets/kantts_data4fs.py @@ -0,0 +1,238 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import os + +import json +import numpy as np +import torch +from torch.utils.data import Dataset +from tqdm import tqdm + +from modelscope.utils.logger import get_logger +from .units import KanTtsLinguisticUnit + +logger = get_logger() + + +class KanTtsText2MelDataset(Dataset): + + def __init__(self, metadata_filename, config_filename, cache=False): + super(KanTtsText2MelDataset, self).__init__() + + self.cache = cache + + with open(config_filename) as f: + self._config = json.loads(f.read()) + + # Load metadata: + self._datadir = os.path.dirname(metadata_filename) + with open(metadata_filename, encoding='utf-8') as f: + self._metadata = [line.strip().split('|') for line in f] + self._length_lst = [int(x[2]) for x in self._metadata] + hours = sum( + self._length_lst) * self._config['audio']['frame_shift_ms'] / ( + 3600 * 1000) + + logger.info('Loaded metadata for %d examples (%.2f hours)' % + (len(self._metadata), hours)) + logger.info('Minimum length: %d, Maximum length: %d' % + (min(self._length_lst), max(self._length_lst))) + + self.ling_unit = KanTtsLinguisticUnit(config_filename) + self.pad_executor = KanTtsText2MelPad() + + self.r = self._config['am']['outputs_per_step'] + self.num_mels = self._config['am']['num_mels'] + + if 'adv' in self._config: + self.feat_window = self._config['adv']['random_window'] + else: + self.feat_window = None + logger.info(self.feat_window) + + self.data_cache = [ + self.cache_load(i) for i in tqdm(range(self.__len__())) + ] if self.cache else [] + + def get_frames_lst(self): + return self._length_lst + + def __getitem__(self, index): + if self.cache: + sample = self.data_cache[index] + return sample + + return self.cache_load(index) + + def cache_load(self, index): + sample = {} + + meta = self._metadata[index] + + sample['utt_id'] = meta[0] + + sample['mel_target'] = np.load(os.path.join( + self._datadir, meta[1]))[:, :self.num_mels] + sample['output_length'] = len(sample['mel_target']) + + lfeat_symbol = meta[3] + sample['ling'] = self.ling_unit.encode_symbol_sequence(lfeat_symbol) + + sample['duration'] = np.load(os.path.join(self._datadir, meta[4])) + + sample['pitch_contour'] = np.load(os.path.join(self._datadir, meta[5])) + + sample['energy_contour'] = np.load( + os.path.join(self._datadir, meta[6])) + + return sample + + def __len__(self): + return len(self._metadata) + + def collate_fn(self, batch): + data_dict = {} + + max_input_length = max((len(x['ling'][0]) for x in batch)) + + # pure linguistic info: sy|tone|syllable_flag|word_segment + + # sy + lfeat_type = self.ling_unit._lfeat_type_list[0] + inputs_sy = self.pad_executor._prepare_scalar_inputs( + [x['ling'][0] for x in batch], max_input_length, + self.ling_unit._sub_unit_pad[lfeat_type]).long() + # tone + lfeat_type = self.ling_unit._lfeat_type_list[1] + inputs_tone = self.pad_executor._prepare_scalar_inputs( + [x['ling'][1] for x in batch], max_input_length, + self.ling_unit._sub_unit_pad[lfeat_type]).long() + + # syllable_flag + lfeat_type = self.ling_unit._lfeat_type_list[2] + inputs_syllable_flag = self.pad_executor._prepare_scalar_inputs( + [x['ling'][2] for x in batch], max_input_length, + self.ling_unit._sub_unit_pad[lfeat_type]).long() + + # word_segment + lfeat_type = self.ling_unit._lfeat_type_list[3] + inputs_ws = self.pad_executor._prepare_scalar_inputs( + [x['ling'][3] for x in batch], max_input_length, + self.ling_unit._sub_unit_pad[lfeat_type]).long() + + # emotion category + lfeat_type = self.ling_unit._lfeat_type_list[4] + data_dict['input_emotions'] = 
self.pad_executor._prepare_scalar_inputs( + [x['ling'][4] for x in batch], max_input_length, + self.ling_unit._sub_unit_pad[lfeat_type]).long() + + # speaker category + lfeat_type = self.ling_unit._lfeat_type_list[5] + data_dict['input_speakers'] = self.pad_executor._prepare_scalar_inputs( + [x['ling'][5] for x in batch], max_input_length, + self.ling_unit._sub_unit_pad[lfeat_type]).long() + + data_dict['input_lings'] = torch.stack( + [inputs_sy, inputs_tone, inputs_syllable_flag, inputs_ws], dim=2) + + data_dict['valid_input_lengths'] = torch.as_tensor( + [len(x['ling'][0]) - 1 for x in batch], dtype=torch.long + ) # There is one '~' in the last of symbol sequence. We put length-1 for calculation. + + data_dict['valid_output_lengths'] = torch.as_tensor( + [x['output_length'] for x in batch], dtype=torch.long) + max_output_length = torch.max(data_dict['valid_output_lengths']).item() + max_output_round_length = self.pad_executor._round_up( + max_output_length, self.r) + + if self.feat_window is not None: + active_feat_len = np.minimum(max_output_round_length, + self.feat_window) + if active_feat_len < self.feat_window: + max_output_round_length = self.pad_executor._round_up( + self.feat_window, self.r) + active_feat_len = self.feat_window + + max_offsets = [x['output_length'] - active_feat_len for x in batch] + feat_offsets = [ + np.random.randint(0, np.maximum(1, offset)) + for offset in max_offsets + ] + feat_offsets = torch.from_numpy( + np.asarray(feat_offsets, dtype=np.int32)).long() + data_dict['feat_offsets'] = feat_offsets + + data_dict['mel_targets'] = self.pad_executor._prepare_targets( + [x['mel_target'] for x in batch], max_output_round_length, 0.0) + data_dict['durations'] = self.pad_executor._prepare_durations( + [x['duration'] for x in batch], max_input_length, + max_output_round_length) + + data_dict['pitch_contours'] = self.pad_executor._prepare_scalar_inputs( + [x['pitch_contour'] for x in batch], max_input_length, + 0.0).float() + data_dict[ + 'energy_contours'] = self.pad_executor._prepare_scalar_inputs( + [x['energy_contour'] for x in batch], max_input_length, + 0.0).float() + + data_dict['utt_ids'] = [x['utt_id'] for x in batch] + + return data_dict + + +class KanTtsText2MelPad(object): + + def __init__(self): + super(KanTtsText2MelPad, self).__init__() + pass + + def _pad1D(self, x, length, pad): + return np.pad( + x, (0, length - x.shape[0]), mode='constant', constant_values=pad) + + def _pad2D(self, x, length, pad): + return np.pad( + x, [(0, length - x.shape[0]), (0, 0)], + mode='constant', + constant_values=pad) + + def _pad_durations(self, duration, max_in_len, max_out_len): + framenum = np.sum(duration) + symbolnum = duration.shape[0] + if framenum < max_out_len: + padframenum = max_out_len - framenum + duration = np.insert( + duration, symbolnum, values=padframenum, axis=0) + duration = np.insert( + duration, + symbolnum + 1, + values=[0] * (max_in_len - symbolnum - 1), + axis=0) + else: + if symbolnum < max_in_len: + duration = np.insert( + duration, + symbolnum, + values=[0] * (max_in_len - symbolnum), + axis=0) + return duration + + def _round_up(self, x, multiple): + remainder = x % multiple + return x if remainder == 0 else x + multiple - remainder + + def _prepare_scalar_inputs(self, inputs, max_len, pad): + return torch.from_numpy( + np.stack([self._pad1D(x, max_len, pad) for x in inputs])) + + def _prepare_targets(self, targets, max_len, pad): + return torch.from_numpy( + np.stack([self._pad2D(t, max_len, pad) for t in targets])).float() + + def 
_prepare_durations(self, durations, max_in_len, max_out_len): + return torch.from_numpy( + np.stack([ + self._pad_durations(t, max_in_len, max_out_len) + for t in durations + ])).long() diff --git a/modelscope/models/audio/tts/models/datasets/samplers.py b/modelscope/models/audio/tts/models/datasets/samplers.py new file mode 100644 index 00000000..0657fa8a --- /dev/null +++ b/modelscope/models/audio/tts/models/datasets/samplers.py @@ -0,0 +1,131 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import math +import random + +import torch +from torch import distributed as dist +from torch.utils.data import Sampler + + +class LenSortGroupPoolSampler(Sampler): + + def __init__(self, data_source, length_lst, group_size): + super(LenSortGroupPoolSampler, self).__init__(data_source) + + self.data_source = data_source + self.length_lst = length_lst + self.group_size = group_size + + self.num = len(self.length_lst) + self.buckets = self.num // group_size + + def __iter__(self): + + def getkey(item): + return item[1] + + random_lst = torch.randperm(self.num).tolist() + random_len_lst = [(i, self.length_lst[i]) for i in random_lst] + + # Bucket examples based on similar output sequence length for efficiency: + groups = [ + random_len_lst[i:i + self.group_size] + for i in range(0, self.num, self.group_size) + ] + if (self.num % self.group_size): + groups.append(random_len_lst[self.buckets * self.group_size:-1]) + + indices = [] + + for group in groups: + group.sort(key=getkey, reverse=True) + for item in group: + indices.append(item[0]) + + return iter(indices) + + def __len__(self): + return len(self.data_source) + + +class DistributedLenSortGroupPoolSampler(Sampler): + + def __init__(self, + dataset, + length_lst, + group_size, + num_replicas=None, + rank=None, + shuffle=True): + super(DistributedLenSortGroupPoolSampler, self).__init__(dataset) + + if num_replicas is None: + if not dist.is_available(): + raise RuntimeError( + 'modelscope error: Requires distributed package to be available' + ) + num_replicas = dist.get_world_size() + if rank is None: + if not dist.is_available(): + raise RuntimeError( + 'modelscope error: Requires distributed package to be available' + ) + rank = dist.get_rank() + self.dataset = dataset + self.length_lst = length_lst + self.group_size = group_size + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.num_samples = int( + math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) + self.total_size = self.num_samples * self.num_replicas + self.buckets = self.num_samples // group_size + self.shuffle = shuffle + + def __iter__(self): + + def getkey(item): + return item[1] + + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch) + if self.shuffle: + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = list(range(len(self.dataset))) + + # add extra samples to make it evenly divisible + indices += indices[:(self.total_size - len(indices))] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank:self.total_size:self.num_replicas] + assert len(indices) == self.num_samples + + random_len_lst = [(i, self.length_lst[i]) for i in indices] + + # Bucket examples based on similar output sequence length for efficiency: + groups = [ + random_len_lst[i:i + self.group_size] + for i in range(0, self.num_samples, self.group_size) + ] + if (self.num_samples % self.group_size): + groups.append(random_len_lst[self.buckets * self.group_size:-1]) + + 
new_indices = [] + + for group in groups: + group.sort(key=getkey, reverse=True) + for item in group: + new_indices.append(item[0]) + + return iter(new_indices) + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + self.epoch = epoch diff --git a/modelscope/models/audio/tts/models/datasets/units/__init__.py b/modelscope/models/audio/tts/models/datasets/units/__init__.py new file mode 100644 index 00000000..4d03df04 --- /dev/null +++ b/modelscope/models/audio/tts/models/datasets/units/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from .ling_unit import * # noqa F403 diff --git a/modelscope/models/audio/tts/models/datasets/units/cleaners.py b/modelscope/models/audio/tts/models/datasets/units/cleaners.py new file mode 100644 index 00000000..07d4fbdb --- /dev/null +++ b/modelscope/models/audio/tts/models/datasets/units/cleaners.py @@ -0,0 +1,88 @@ +# from https://github.com/keithito/tacotron +# Cleaners are transformations that run over the input text at both training and eval time. +# +# Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" +# hyperparameter. Some cleaners are English-specific. You'll typically want to use: +# 1. "english_cleaners" for English text +# 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using +# the Unidecode library (https://pypi.python.org/pypi/Unidecode) +# 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update +# the symbols in symbols.py to match your data). + +import re + +from unidecode import unidecode + +from .numbers import normalize_numbers + +# Regular expression matching whitespace: +_whitespace_re = re.compile(r'\s+') + +# List of (regular expression, replacement) pairs for abbreviations: +_abbreviations = [ + (re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) + for x in [('mrs', 'misess'), + ('mr', 'mister'), + ('dr', 'doctor'), + ('st', 'saint'), + ('co', 'company'), + ('jr', 'junior'), + ('maj', 'major'), + ('gen', 'general'), + ('drs', 'doctors'), + ('rev', 'reverend'), + ('lt', 'lieutenant'), + ('hon', 'honorable'), + ('sgt', 'sergeant'), + ('capt', 'captain'), + ('esq', 'esquire'), + ('ltd', 'limited'), + ('col', 'colonel'), + ('ft', 'fort'), ]] # yapf:disable + + +def expand_abbreviations(text): + for regex, replacement in _abbreviations: + text = re.sub(regex, replacement, text) + return text + + +def expand_numbers(text): + return normalize_numbers(text) + + +def lowercase(text): + return text.lower() + + +def collapse_whitespace(text): + return re.sub(_whitespace_re, ' ', text) + + +def convert_to_ascii(text): + return unidecode(text) + + +def basic_cleaners(text): + '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' + text = lowercase(text) + text = collapse_whitespace(text) + return text + + +def transliteration_cleaners(text): + '''Pipeline for non-English text that transliterates to ASCII.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = collapse_whitespace(text) + return text + + +def english_cleaners(text): + '''Pipeline for English text, including number and abbreviation expansion.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = expand_numbers(text) + text = expand_abbreviations(text) + text = collapse_whitespace(text) + return text diff --git a/modelscope/models/audio/tts/models/datasets/units/ling_unit.py b/modelscope/models/audio/tts/models/datasets/units/ling_unit.py new file mode 100644 index 00000000..3c211cc7 --- /dev/null +++ b/modelscope/models/audio/tts/models/datasets/units/ling_unit.py @@ -0,0 +1,395 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import abc +import codecs +import os +import re +import shutil + +import json +import numpy as np + +from . 
import cleaners as cleaners + +# Regular expression matching text enclosed in curly braces: +_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') + + +def _clean_text(text, cleaner_names): + for name in cleaner_names: + cleaner = getattr(cleaners, name) + if not cleaner: + raise Exception( + 'modelscope error: configuration cleaner unknown: %s' % name) + text = cleaner(text) + return text + + +class LinguisticBaseUnit(abc.ABC): + + def set_config_params(self, config_params): + self.config_params = config_params + + def save(self, config, config_name, path): + t_path = os.path.join(path, config_name) + if config != t_path: + os.makedirs(path, exist_ok=True) + shutil.copyfile(config, os.path.join(path, config_name)) + + +class KanTtsLinguisticUnit(LinguisticBaseUnit): + + def __init__(self, config, path, has_mask=True): + super(KanTtsLinguisticUnit, self).__init__() + + # special symbol + self._pad = '_' + self._eos = '~' + self._mask = '@[MASK]' + self._has_mask = has_mask + self._unit_config = config + self._path = path + + self._cleaner_names = [ + x.strip() for x in self._unit_config['cleaners'].split(',') + ] + self._lfeat_type_list = self._unit_config['lfeat_type_list'].strip( + ).split(',') + + self.build() + + def get_unit_size(self): + ling_unit_size = {} + ling_unit_size['sy'] = len(self.sy) + ling_unit_size['tone'] = len(self.tone) + ling_unit_size['syllable_flag'] = len(self.syllable_flag) + ling_unit_size['word_segment'] = len(self.word_segment) + + if 'emo_category' in self._lfeat_type_list: + ling_unit_size['emotion'] = len(self.emo_category) + if 'speaker_category' in self._lfeat_type_list: + ling_unit_size['speaker'] = len(self.speaker) + + return ling_unit_size + + def build(self): + + self._sub_unit_dim = {} + self._sub_unit_pad = {} + # sy sub-unit + _characters = '' + + _ch_symbols = [] + + sy_path = os.path.join(self._path, self._unit_config['sy']) + f = codecs.open(sy_path, 'r') + for line in f: + line = line.strip('\r\n') + _ch_symbols.append(line) + + _arpabet = ['@' + s for s in _ch_symbols] + + # Export all symbols: + self.sy = list(_characters) + _arpabet + [self._pad, self._eos] + if self._has_mask: + self.sy.append(self._mask) + self._sy_to_id = {s: i for i, s in enumerate(self.sy)} + self._id_to_sy = {i: s for i, s in enumerate(self.sy)} + self._sub_unit_dim['sy'] = len(self.sy) + self._sub_unit_pad['sy'] = self._sy_to_id['_'] + + # tone sub-unit + _characters = '' + + _ch_tones = [] + + tone_path = os.path.join(self._path, self._unit_config['tone']) + f = codecs.open(tone_path, 'r') + for line in f: + line = line.strip('\r\n') + _ch_tones.append(line) + + # Export all tones: + self.tone = list(_characters) + _ch_tones + [self._pad, self._eos] + if self._has_mask: + self.tone.append(self._mask) + self._tone_to_id = {s: i for i, s in enumerate(self.tone)} + self._id_to_tone = {i: s for i, s in enumerate(self.tone)} + self._sub_unit_dim['tone'] = len(self.tone) + self._sub_unit_pad['tone'] = self._tone_to_id['_'] + + # syllable flag sub-unit + _characters = '' + + _ch_syllable_flags = [] + + sy_flag_path = os.path.join(self._path, + self._unit_config['syllable_flag']) + f = codecs.open(sy_flag_path, 'r') + for line in f: + line = line.strip('\r\n') + _ch_syllable_flags.append(line) + + # Export all syllable_flags: + self.syllable_flag = list(_characters) + _ch_syllable_flags + [ + self._pad, self._eos + ] + if self._has_mask: + self.syllable_flag.append(self._mask) + self._syllable_flag_to_id = { + s: i + for i, s in enumerate(self.syllable_flag) + } + 
self._id_to_syllable_flag = { + i: s + for i, s in enumerate(self.syllable_flag) + } + self._sub_unit_dim['syllable_flag'] = len(self.syllable_flag) + self._sub_unit_pad['syllable_flag'] = self._syllable_flag_to_id['_'] + + # word segment sub-unit + _characters = '' + + _ch_word_segments = [] + + ws_path = os.path.join(self._path, self._unit_config['word_segment']) + f = codecs.open(ws_path, 'r') + for line in f: + line = line.strip('\r\n') + _ch_word_segments.append(line) + + # Export all syllable_flags: + self.word_segment = list(_characters) + _ch_word_segments + [ + self._pad, self._eos + ] + if self._has_mask: + self.word_segment.append(self._mask) + self._word_segment_to_id = { + s: i + for i, s in enumerate(self.word_segment) + } + self._id_to_word_segment = { + i: s + for i, s in enumerate(self.word_segment) + } + self._sub_unit_dim['word_segment'] = len(self.word_segment) + self._sub_unit_pad['word_segment'] = self._word_segment_to_id['_'] + + if 'emo_category' in self._lfeat_type_list: + # emotion category sub-unit + _characters = '' + + _ch_emo_types = [] + + emo_path = os.path.join(self._path, + self._unit_config['emo_category']) + f = codecs.open(emo_path, 'r') + for line in f: + line = line.strip('\r\n') + _ch_emo_types.append(line) + + self.emo_category = list(_characters) + _ch_emo_types + [ + self._pad, self._eos + ] + if self._has_mask: + self.emo_category.append(self._mask) + self._emo_category_to_id = { + s: i + for i, s in enumerate(self.emo_category) + } + self._id_to_emo_category = { + i: s + for i, s in enumerate(self.emo_category) + } + self._sub_unit_dim['emo_category'] = len(self.emo_category) + self._sub_unit_pad['emo_category'] = self._emo_category_to_id['_'] + + if 'speaker_category' in self._lfeat_type_list: + # speaker category sub-unit + _characters = '' + + _ch_speakers = [] + + speaker_path = os.path.join(self._path, + self._unit_config['speaker_category']) + f = codecs.open(speaker_path, 'r') + for line in f: + line = line.strip('\r\n') + _ch_speakers.append(line) + + # Export all syllable_flags: + self.speaker = list(_characters) + _ch_speakers + [ + self._pad, self._eos + ] + if self._has_mask: + self.speaker.append(self._mask) + self._speaker_to_id = {s: i for i, s in enumerate(self.speaker)} + self._id_to_speaker = {i: s for i, s in enumerate(self.speaker)} + self._sub_unit_dim['speaker_category'] = len(self._speaker_to_id) + self._sub_unit_pad['speaker_category'] = self._speaker_to_id['_'] + + def encode_symbol_sequence(self, lfeat_symbol): + lfeat_symbol = lfeat_symbol.strip().split(' ') + + lfeat_symbol_separate = [''] * int(len(self._lfeat_type_list)) + for this_lfeat_symbol in lfeat_symbol: + this_lfeat_symbol = this_lfeat_symbol.strip('{').strip('}').split( + '$') + index = 0 + while index < len(lfeat_symbol_separate): + lfeat_symbol_separate[index] = lfeat_symbol_separate[ + index] + this_lfeat_symbol[index] + ' ' + index = index + 1 + + input_and_label_data = [] + index = 0 + while index < len(self._lfeat_type_list): + sequence = self.encode_sub_unit( + lfeat_symbol_separate[index].strip(), + self._lfeat_type_list[index]) + sequence_array = np.asarray(sequence, dtype=np.int32) + input_and_label_data.append(sequence_array) + index = index + 1 + + return input_and_label_data + + def decode_symbol_sequence(self, sequence): + result = [] + for i, lfeat_type in enumerate(self._lfeat_type_list): + s = '' + sequence_item = sequence[i].tolist() + if lfeat_type == 'sy': + s = self.decode_sy(sequence_item) + elif lfeat_type == 'tone': + s = 
self.decode_tone(sequence_item) + elif lfeat_type == 'syllable_flag': + s = self.decode_syllable_flag(sequence_item) + elif lfeat_type == 'word_segment': + s = self.decode_word_segment(sequence_item) + elif lfeat_type == 'emo_category': + s = self.decode_emo_category(sequence_item) + elif lfeat_type == 'speaker_category': + s = self.decode_speaker_category(sequence_item) + else: + raise Exception( + 'modelscope error: configuration lfeat type(%s) unknown.' + % lfeat_type) + result.append('%s:%s' % (lfeat_type, s)) + + return result + + def encode_sub_unit(self, this_lfeat_symbol, lfeat_type): + sequence = [] + if lfeat_type == 'sy': + this_lfeat_symbol = this_lfeat_symbol.strip().split(' ') + this_lfeat_symbol_format = '' + index = 0 + while index < len(this_lfeat_symbol): + this_lfeat_symbol_format = this_lfeat_symbol_format + '{' + this_lfeat_symbol[ + index] + '}' + ' ' + index = index + 1 + sequence = self.encode_text(this_lfeat_symbol_format, + self._cleaner_names) + elif lfeat_type == 'tone': + sequence = self.encode_tone(this_lfeat_symbol) + elif lfeat_type == 'syllable_flag': + sequence = self.encode_syllable_flag(this_lfeat_symbol) + elif lfeat_type == 'word_segment': + sequence = self.encode_word_segment(this_lfeat_symbol) + elif lfeat_type == 'emo_category': + sequence = self.encode_emo_category(this_lfeat_symbol) + elif lfeat_type == 'speaker_category': + sequence = self.encode_speaker_category(this_lfeat_symbol) + else: + raise Exception( + 'modelscope error: configuration lfeat type(%s) unknown.' + % lfeat_type) + + return sequence + + def encode_text(self, text, cleaner_names): + sequence = [] + + # Check for curly braces and treat their contents as ARPAbet: + while len(text): + m = _curly_re.match(text) + if not m: + sequence += self.encode_sy(_clean_text(text, cleaner_names)) + break + sequence += self.encode_sy(_clean_text(m.group(1), cleaner_names)) + sequence += self.encode_arpanet(m.group(2)) + text = m.group(3) + + # Append EOS token + sequence.append(self._sy_to_id['~']) + return sequence + + def encode_sy(self, sy): + return [self._sy_to_id[s] for s in sy if self.should_keep_sy(s)] + + def decode_sy(self, id): + s = self._id_to_sy[id] + if len(s) > 1 and s[0] == '@': + s = s[1:] + return s + + def should_keep_sy(self, s): + return s in self._sy_to_id and s != '_' and s != '~' + + def encode_arpanet(self, text): + return self.encode_sy(['@' + s for s in text.split()]) + + def encode_tone(self, tone): + tones = tone.strip().split(' ') + sequence = [] + for this_tone in tones: + sequence.append(self._tone_to_id[this_tone]) + sequence.append(self._tone_to_id['~']) + return sequence + + def decode_tone(self, id): + return self._id_to_tone[id] + + def encode_syllable_flag(self, syllable_flag): + syllable_flags = syllable_flag.strip().split(' ') + sequence = [] + for this_syllable_flag in syllable_flags: + sequence.append(self._syllable_flag_to_id[this_syllable_flag]) + sequence.append(self._syllable_flag_to_id['~']) + return sequence + + def decode_syllable_flag(self, id): + return self._id_to_syllable_flag[id] + + def encode_word_segment(self, word_segment): + word_segments = word_segment.strip().split(' ') + sequence = [] + for this_word_segment in word_segments: + sequence.append(self._word_segment_to_id[this_word_segment]) + sequence.append(self._word_segment_to_id['~']) + return sequence + + def decode_word_segment(self, id): + return self._id_to_word_segment[id] + + def encode_emo_category(self, emo_type): + emo_categories = emo_type.strip().split(' ') + sequence = 
[] + for this_category in emo_categories: + sequence.append(self._emo_category_to_id[this_category]) + sequence.append(self._emo_category_to_id['~']) + return sequence + + def decode_emo_category(self, id): + return self._id_to_emo_category[id] + + def encode_speaker_category(self, speaker): + speakers = speaker.strip().split(' ') + sequence = [] + for this_speaker in speakers: + sequence.append(self._speaker_to_id[this_speaker]) + sequence.append(self._speaker_to_id['~']) + return sequence + + def decode_speaker_category(self, id): + return self._id_to_speaker[id] diff --git a/modelscope/models/audio/tts/text/numbers.py b/modelscope/models/audio/tts/models/datasets/units/numbers.py old mode 100755 new mode 100644 similarity index 94% rename from modelscope/models/audio/tts/text/numbers.py rename to modelscope/models/audio/tts/models/datasets/units/numbers.py index d9453fee..d8835059 --- a/modelscope/models/audio/tts/text/numbers.py +++ b/modelscope/models/audio/tts/models/datasets/units/numbers.py @@ -1,3 +1,6 @@ +# The implementation is adopted from tacotron, +# made publicly available under the MIT License at https://github.com/keithito/tacotron + import re import inflect diff --git a/modelscope/models/audio/tts/models/fsmn.py b/modelscope/models/audio/tts/models/fsmn.py deleted file mode 100755 index 875c27f0..00000000 --- a/modelscope/models/audio/tts/models/fsmn.py +++ /dev/null @@ -1,273 +0,0 @@ -import tensorflow as tf - - -def build_sequence_mask(sequence_length, - maximum_length=None, - dtype=tf.float32): - """Builds the dot product mask. - - Args: - sequence_length: The sequence length. - maximum_length: Optional size of the returned time dimension. Otherwise - it is the maximum of :obj:`sequence_length`. - dtype: The type of the mask tensor. - - Returns: - A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape - ``[batch_size, max_length]``. - """ - mask = tf.sequence_mask( - sequence_length, maxlen=maximum_length, dtype=dtype) - - return mask - - -def norm(inputs): - """Layer normalizes :obj:`inputs`.""" - return tf.contrib.layers.layer_norm(inputs, begin_norm_axis=-1) - - -def pad_in_time(x, padding_shape): - """Helper function to pad a tensor in the time dimension and retain the static depth dimension. - - Agrs: - x: [Batch, Time, Frequency] - padding_length: padding size of constant value (0) before the time dimension - - return: - padded x - """ - - depth = x.get_shape().as_list()[-1] - x = tf.pad(x, [[0, 0], padding_shape, [0, 0]]) - x.set_shape((None, None, depth)) - - return x - - -def pad_in_time_right(x, padding_length): - """Helper function to pad a tensor in the time dimension and retain the static depth dimension. - - Agrs: - x: [Batch, Time, Frequency] - padding_length: padding size of constant value (0) before the time dimension - - return: - padded x - """ - depth = x.get_shape().as_list()[-1] - x = tf.pad(x, [[0, 0], [0, padding_length], [0, 0]]) - x.set_shape((None, None, depth)) - - return x - - -def feed_forward(x, ffn_dim, memory_units, mode, dropout=0.0): - """Implements the Transformer's "Feed Forward" layer. - - .. math:: - - ffn(x) = max(0, x*W_1 + b_1)*W_2 - - Args: - x: The input. - ffn_dim: The number of units of the nonlinear transformation. - memory_units: the number of units of linear transformation - mode: A ``tf.estimator.ModeKeys`` mode. - dropout: The probability to drop units from the inner transformation. - - Returns: - The transformed input. 
- """ - inner = tf.layers.conv1d(x, ffn_dim, 1, activation=tf.nn.relu) - inner = tf.layers.dropout( - inner, rate=dropout, training=mode == tf.estimator.ModeKeys.TRAIN) - outer = tf.layers.conv1d(inner, memory_units, 1, use_bias=False) - - return outer - - -def drop_and_add(inputs, outputs, mode, dropout=0.0): - """Drops units in the outputs and adds the previous values. - - Args: - inputs: The input of the previous layer. - outputs: The output of the previous layer. - mode: A ``tf.estimator.ModeKeys`` mode. - dropout: The probability to drop units in :obj:`outputs`. - - Returns: - The residual and normalized output. - """ - outputs = tf.layers.dropout(outputs, rate=dropout, training=mode) - - input_dim = inputs.get_shape().as_list()[-1] - output_dim = outputs.get_shape().as_list()[-1] - - if input_dim == output_dim: - outputs += inputs - - return outputs - - -def MemoryBlock( - inputs, - filter_size, - mode, - mask=None, - dropout=0.0, -): - """ - Define the bidirectional memory block in FSMN - - Agrs: - inputs: The output of the previous layer. [Batch, Time, Frequency] - filter_size: memory block filter size - mode: Training or Evaluation - mask: A ``tf.Tensor`` applied to the memory block output - - return: - output: 3-D tensor ([Batch, Time, Frequency]) - """ - static_shape = inputs.get_shape().as_list() - depth = static_shape[-1] - inputs = tf.expand_dims(inputs, axis=1) # [Batch, 1, Time, Frequency] - depthwise_filter = tf.get_variable( - 'depth_conv_w', - shape=[1, filter_size, depth, 1], - initializer=tf.glorot_uniform_initializer(), - dtype=tf.float32) - memory = tf.nn.depthwise_conv2d( - input=inputs, - filter=depthwise_filter, - strides=[1, 1, 1, 1], - padding='SAME', - rate=[1, 1], - data_format='NHWC') - memory = memory + inputs - output = tf.layers.dropout(memory, rate=dropout, training=mode) - output = tf.reshape( - output, - [tf.shape(output)[0], tf.shape(output)[2], depth]) - if mask is not None: - output = output * tf.expand_dims(mask, -1) - - return output - - -def MemoryBlockV2( - inputs, - filter_size, - mode, - shift=0, - mask=None, - dropout=0.0, -): - """ - Define the bidirectional memory block in FSMN - - Agrs: - inputs: The output of the previous layer. 
[Batch, Time, Frequency] - filter_size: memory block filter size - mode: Training or Evaluation - shift: left padding, to control delay - mask: A ``tf.Tensor`` applied to the memory block output - - return: - output: 3-D tensor ([Batch, Time, Frequency]) - """ - if mask is not None: - inputs = inputs * tf.expand_dims(mask, -1) - - static_shape = inputs.get_shape().as_list() - depth = static_shape[-1] - # padding - left_padding = int(round((filter_size - 1) / 2)) - right_padding = int((filter_size - 1) / 2) - if shift > 0: - left_padding = left_padding + shift - right_padding = right_padding - shift - pad_inputs = pad_in_time(inputs, [left_padding, right_padding]) - pad_inputs = tf.expand_dims( - pad_inputs, axis=1) # [Batch, 1, Time, Frequency] - depthwise_filter = tf.get_variable( - 'depth_conv_w', - shape=[1, filter_size, depth, 1], - initializer=tf.glorot_uniform_initializer(), - dtype=tf.float32) - memory = tf.nn.depthwise_conv2d( - input=pad_inputs, - filter=depthwise_filter, - strides=[1, 1, 1, 1], - padding='VALID', - rate=[1, 1], - data_format='NHWC') - memory = tf.reshape( - memory, - [tf.shape(memory)[0], tf.shape(memory)[2], depth]) - memory = memory + inputs - output = tf.layers.dropout(memory, rate=dropout, training=mode) - if mask is not None: - output = output * tf.expand_dims(mask, -1) - - return output - - -def UniMemoryBlock( - inputs, - filter_size, - mode, - cache=None, - mask=None, - dropout=0.0, -): - """ - Define the unidirectional memory block in FSMN - - Agrs: - inputs: The output of the previous layer. [Batch, Time, Frequency] - filter_size: memory block filter size - cache: for streaming inference - mode: Training or Evaluation - mask: A ``tf.Tensor`` applied to the memory block output - dropout: dorpout factor - return: - output: 3-D tensor ([Batch, Time, Frequency]) - """ - if cache is not None: - static_shape = cache['queries'].get_shape().as_list() - depth = static_shape[-1] - queries = tf.slice(cache['queries'], [0, 1, 0], [ - tf.shape(cache['queries'])[0], - tf.shape(cache['queries'])[1] - 1, depth - ]) - queries = tf.concat([queries, inputs], axis=1) - cache['queries'] = queries - else: - padding_length = filter_size - 1 - queries = pad_in_time(inputs, [padding_length, 0]) - - queries = tf.expand_dims(queries, axis=1) # [Batch, 1, Time, Frequency] - static_shape = queries.get_shape().as_list() - depth = static_shape[-1] - depthwise_filter = tf.get_variable( - 'depth_conv_w', - shape=[1, filter_size, depth, 1], - initializer=tf.glorot_uniform_initializer(), - dtype=tf.float32) - memory = tf.nn.depthwise_conv2d( - input=queries, - filter=depthwise_filter, - strides=[1, 1, 1, 1], - padding='VALID', - rate=[1, 1], - data_format='NHWC') - memory = tf.reshape( - memory, - [tf.shape(memory)[0], tf.shape(memory)[2], depth]) - memory = memory + inputs - output = tf.layers.dropout(memory, rate=dropout, training=mode) - if mask is not None: - output = output * tf.expand_dims(mask, -1) - - return output diff --git a/modelscope/models/audio/tts/models/fsmn_encoder.py b/modelscope/models/audio/tts/models/fsmn_encoder.py deleted file mode 100755 index 2c650624..00000000 --- a/modelscope/models/audio/tts/models/fsmn_encoder.py +++ /dev/null @@ -1,178 +0,0 @@ -import tensorflow as tf - -from . import fsmn - - -class FsmnEncoder(): - """Encoder using Fsmn - """ - - def __init__(self, - filter_size, - fsmn_num_layers, - dnn_num_layers, - num_memory_units=512, - ffn_inner_dim=2048, - dropout=0.0, - position_encoder=None): - """Initializes the parameters of the encoder. 
- - Args: - filter_size: the total order of memory block - fsmn_num_layers: The number of fsmn layers. - dnn_num_layers: The number of dnn layers - num_units: The number of memory units. - ffn_inner_dim: The number of units of the inner linear transformation - in the feed forward layer. - dropout: The probability to drop units from the outputs. - position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to - apply on inputs or ``None``. - """ - super(FsmnEncoder, self).__init__() - self.filter_size = filter_size - self.fsmn_num_layers = fsmn_num_layers - self.dnn_num_layers = dnn_num_layers - self.num_memory_units = num_memory_units - self.ffn_inner_dim = ffn_inner_dim - self.dropout = dropout - self.position_encoder = position_encoder - - def encode(self, inputs, sequence_length=None, mode=True): - if self.position_encoder is not None: - inputs = self.position_encoder(inputs) - - inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) - - mask = fsmn.build_sequence_mask( - sequence_length, maximum_length=tf.shape(inputs)[1]) - - state = () - - for layer in range(self.fsmn_num_layers): - with tf.variable_scope('fsmn_layer_{}'.format(layer)): - with tf.variable_scope('ffn'): - context = fsmn.feed_forward( - inputs, - self.ffn_inner_dim, - self.num_memory_units, - mode, - dropout=self.dropout) - - with tf.variable_scope('memory'): - memory = fsmn.MemoryBlock( - context, - self.filter_size, - mode, - mask=mask, - dropout=self.dropout) - - memory = fsmn.drop_and_add( - inputs, memory, mode, dropout=self.dropout) - - inputs = memory - state += (tf.reduce_mean(inputs, axis=1), ) - - for layer in range(self.dnn_num_layers): - with tf.variable_scope('dnn_layer_{}'.format(layer)): - transformed = fsmn.feed_forward( - inputs, - self.ffn_inner_dim, - self.num_memory_units, - mode, - dropout=self.dropout) - - inputs = transformed - state += (tf.reduce_mean(inputs, axis=1), ) - - outputs = inputs - return (outputs, state, sequence_length) - - -class FsmnEncoderV2(): - """Encoder using Fsmn - """ - - def __init__(self, - filter_size, - fsmn_num_layers, - dnn_num_layers, - num_memory_units=512, - ffn_inner_dim=2048, - dropout=0.0, - shift=0, - position_encoder=None): - """Initializes the parameters of the encoder. - - Args: - filter_size: the total order of memory block - fsmn_num_layers: The number of fsmn layers. - dnn_num_layers: The number of dnn layers - num_units: The number of memory units. - ffn_inner_dim: The number of units of the inner linear transformation - in the feed forward layer. - dropout: The probability to drop units from the outputs. - shift: left padding, to control delay - position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to - apply on inputs or ``None``. 
- """ - super(FsmnEncoderV2, self).__init__() - self.filter_size = filter_size - self.fsmn_num_layers = fsmn_num_layers - self.dnn_num_layers = dnn_num_layers - self.num_memory_units = num_memory_units - self.ffn_inner_dim = ffn_inner_dim - self.dropout = dropout - self.shift = shift - if not isinstance(shift, list): - self.shift = [shift for _ in range(self.fsmn_num_layers)] - self.position_encoder = position_encoder - - def encode(self, inputs, sequence_length=None, mode=True): - if self.position_encoder is not None: - inputs = self.position_encoder(inputs) - - inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) - - mask = fsmn.build_sequence_mask( - sequence_length, maximum_length=tf.shape(inputs)[1]) - - state = () - for layer in range(self.fsmn_num_layers): - with tf.variable_scope('fsmn_layer_{}'.format(layer)): - with tf.variable_scope('ffn'): - context = fsmn.feed_forward( - inputs, - self.ffn_inner_dim, - self.num_memory_units, - mode, - dropout=self.dropout) - - with tf.variable_scope('memory'): - memory = fsmn.MemoryBlockV2( - context, - self.filter_size, - mode, - shift=self.shift[layer], - mask=mask, - dropout=self.dropout) - - memory = fsmn.drop_and_add( - inputs, memory, mode, dropout=self.dropout) - - inputs = memory - state += (tf.reduce_mean(inputs, axis=1), ) - - for layer in range(self.dnn_num_layers): - with tf.variable_scope('dnn_layer_{}'.format(layer)): - transformed = fsmn.feed_forward( - inputs, - self.ffn_inner_dim, - self.num_memory_units, - mode, - dropout=self.dropout) - - inputs = transformed - state += (tf.reduce_mean(inputs, axis=1), ) - - outputs = inputs - return (outputs, state, sequence_length) diff --git a/modelscope/models/audio/tts/models/helpers.py b/modelscope/models/audio/tts/models/helpers.py deleted file mode 100755 index 371000a4..00000000 --- a/modelscope/models/audio/tts/models/helpers.py +++ /dev/null @@ -1,159 +0,0 @@ -import numpy as np -import tensorflow as tf - - -class VarTestHelper(tf.contrib.seq2seq.Helper): - - def __init__(self, batch_size, inputs, dim): - with tf.name_scope('VarTestHelper'): - self._batch_size = batch_size - self._inputs = inputs - self._dim = dim - - num_steps = tf.shape(self._inputs)[1] - self._lengths = tf.tile([num_steps], [self._batch_size]) - - self._inputs = tf.roll(inputs, shift=-1, axis=1) - self._init_inputs = inputs[:, 0, :] - - @property - def batch_size(self): - return self._batch_size - - @property - def sample_ids_shape(self): - return tf.TensorShape([]) - - @property - def sample_ids_dtype(self): - return np.int32 - - def initialize(self, name=None): - return (tf.tile([False], [self._batch_size]), - _go_frames(self._batch_size, self._dim, self._init_inputs)) - - def sample(self, time, outputs, state, name=None): - return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them - - def next_inputs(self, time, outputs, state, sample_ids, name=None): - with tf.name_scope('VarTestHelper'): - finished = (time + 1 >= self._lengths) - next_inputs = tf.concat([outputs, self._inputs[:, time, :]], - axis=-1) - return (finished, next_inputs, state) - - -class VarTrainingHelper(tf.contrib.seq2seq.Helper): - - def __init__(self, targets, inputs, dim): - with tf.name_scope('VarTrainingHelper'): - self._targets = targets # [N, T_in, 1] - self._batch_size = tf.shape(inputs)[0] # N - self._inputs = inputs - self._dim = dim - - num_steps = tf.shape(self._targets)[1] - self._lengths = tf.tile([num_steps], [self._batch_size]) - - self._inputs = tf.roll(inputs, shift=-1, axis=1) - 
self._init_inputs = inputs[:, 0, :] - - @property - def batch_size(self): - return self._batch_size - - @property - def sample_ids_shape(self): - return tf.TensorShape([]) - - @property - def sample_ids_dtype(self): - return np.int32 - - def initialize(self, name=None): - return (tf.tile([False], [self._batch_size]), - _go_frames(self._batch_size, self._dim, self._init_inputs)) - - def sample(self, time, outputs, state, name=None): - return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them - - def next_inputs(self, time, outputs, state, sample_ids, name=None): - with tf.name_scope(name or 'VarTrainingHelper'): - finished = (time + 1 >= self._lengths) - next_inputs = tf.concat( - [self._targets[:, time, :], self._inputs[:, time, :]], axis=-1) - return (finished, next_inputs, state) - - -class VarTrainingSSHelper(tf.contrib.seq2seq.Helper): - - def __init__(self, targets, inputs, dim, global_step, schedule_begin, - alpha, decay_steps): - with tf.name_scope('VarTrainingSSHelper'): - self._targets = targets # [N, T_in, 1] - self._batch_size = tf.shape(inputs)[0] # N - self._inputs = inputs - self._dim = dim - - num_steps = tf.shape(self._targets)[1] - self._lengths = tf.tile([num_steps], [self._batch_size]) - - self._inputs = tf.roll(inputs, shift=-1, axis=1) - self._init_inputs = inputs[:, 0, :] - - # for schedule sampling - self._global_step = global_step - self._schedule_begin = schedule_begin - self._alpha = alpha - self._decay_steps = decay_steps - - @property - def batch_size(self): - return self._batch_size - - @property - def sample_ids_shape(self): - return tf.TensorShape([]) - - @property - def sample_ids_dtype(self): - return np.int32 - - def initialize(self, name=None): - self._ratio = _tf_decay(self._global_step, self._schedule_begin, - self._alpha, self._decay_steps) - return (tf.tile([False], [self._batch_size]), - _go_frames(self._batch_size, self._dim, self._init_inputs)) - - def sample(self, time, outputs, state, name=None): - return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them - - def next_inputs(self, time, outputs, state, sample_ids, name=None): - with tf.name_scope(name or 'VarTrainingHelper'): - finished = (time + 1 >= self._lengths) - next_inputs_tmp = tf.cond( - tf.less( - tf.random_uniform([], minval=0, maxval=1, - dtype=tf.float32), self._ratio), - lambda: self._targets[:, time, :], lambda: outputs) - next_inputs = tf.concat( - [next_inputs_tmp, self._inputs[:, time, :]], axis=-1) - return (finished, next_inputs, state) - - -def _go_frames(batch_size, dim, init_inputs): - '''Returns all-zero frames for a given batch size and output dimension''' - return tf.concat([tf.tile([[0.0]], [batch_size, dim]), init_inputs], - axis=-1) - - -def _tf_decay(global_step, schedule_begin, alpha, decay_steps): - tfr = tf.train.exponential_decay( - 1.0, - global_step=global_step - schedule_begin, - decay_steps=decay_steps, - decay_rate=alpha, - name='tfr_decay') - final_tfr = tf.cond( - tf.less(global_step, schedule_begin), lambda: 1.0, lambda: tfr) - return final_tfr diff --git a/modelscope/models/audio/tts/models/models/__init__.py b/modelscope/models/audio/tts/models/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/audio/tts/models/models/hifigan/__init__.py b/modelscope/models/audio/tts/models/models/hifigan/__init__.py new file mode 100644 index 00000000..ae9d10ea --- /dev/null +++ b/modelscope/models/audio/tts/models/models/hifigan/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Alibaba, Inc. 
and its affiliates. + +from .hifigan import * # noqa F403 diff --git a/modelscope/models/audio/tts/models/models/hifigan/hifigan.py b/modelscope/models/audio/tts/models/models/hifigan/hifigan.py new file mode 100755 index 00000000..0f950539 --- /dev/null +++ b/modelscope/models/audio/tts/models/models/hifigan/hifigan.py @@ -0,0 +1,238 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Part of the implementation is borrowed from https://github.com/jik876/hifi-gan + +from distutils.version import LooseVersion + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d +from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm + +from modelscope.models.audio.tts.models.utils import get_padding, init_weights +from modelscope.utils.logger import get_logger + +logger = get_logger() +is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion('1.7') + + +def stft(x, fft_size, hop_size, win_length, window): + """Perform STFT and convert to magnitude spectrogram. + + Args: + x (Tensor): Input signal tensor (B, T). + fft_size (int): FFT size. + hop_size (int): Hop size. + win_length (int): Window length. + window (str): Window function type. + + Returns: + Tensor: Magnitude spectrogram (B). + + """ + if is_pytorch_17plus: + x_stft = torch.stft( + x, fft_size, hop_size, win_length, window, return_complex=False) + else: + x_stft = torch.stft(x, fft_size, hop_size, win_length, window) + real = x_stft[..., 0] + imag = x_stft[..., 1] + + # NOTE(kan-bayashi): clamp is needed to avoid nan or inf + return torch.sqrt(torch.clamp(real**2 + imag**2, min=1e-7)).transpose(2, 1) + + +LRELU_SLOPE = 0.1 + + +def get_padding_casual(kernel_size, dilation=1): + return int(kernel_size * dilation - dilation) + + +class Conv1dCasual(torch.nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + padding_mode='zeros'): + super(Conv1dCasual, self).__init__() + self.pad = padding + self.conv1d = weight_norm( + Conv1d( + in_channels, + out_channels, + kernel_size, + stride, + padding=0, + dilation=dilation, + groups=groups, + bias=bias, + padding_mode=padding_mode)) + self.conv1d.apply(init_weights) + + def forward(self, x): # bdt + # described starting from the last dimension and moving forward. + x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), 'constant') + x = self.conv1d(x) + return x + + def remove_weight_norm(self): + remove_weight_norm(self.conv1d) + + +class ConvTranspose1dCausal(torch.nn.Module): + """CausalConvTranspose1d module with customized initialization.""" + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding=0): + """Initialize CausalConvTranspose1d module.""" + super(ConvTranspose1dCausal, self).__init__() + self.deconv = weight_norm( + ConvTranspose1d(in_channels, out_channels, kernel_size, stride)) + self.stride = stride + self.deconv.apply(init_weights) + self.pad = kernel_size - stride + + def forward(self, x): + """Calculate forward propagation. + Args: + x (Tensor): Input tensor (B, in_channels, T_in). + Returns: + Tensor: Output tensor (B, out_channels, T_out). 
+ """ + # x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), "constant") + return self.deconv(x)[:, :, :-self.pad] + + def remove_weight_norm(self): + remove_weight_norm(self.deconv) + + +class ResBlock1(torch.nn.Module): + + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.h = h + self.convs1 = nn.ModuleList([ + Conv1dCasual( + channels, + channels, + kernel_size, + 1, + dilation=dilation[i], + padding=get_padding_casual(kernel_size, dilation[i])) + for i in range(len(dilation)) + ]) + + self.convs2 = nn.ModuleList([ + Conv1dCasual( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding_casual(kernel_size, 1)) + for i in range(len(dilation)) + ]) + + def forward(self, x): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + xt = c2(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for layer in self.convs1: + layer.remove_weight_norm() + for layer in self.convs2: + layer.remove_weight_norm() + + +class Generator(torch.nn.Module): + + def __init__(self, h): + super(Generator, self).__init__() + self.h = h + self.num_kernels = len(h.resblock_kernel_sizes) + self.num_upsamples = len(h.upsample_rates) + logger.info('num_kernels={}, num_upsamples={}'.format( + self.num_kernels, self.num_upsamples)) + self.conv_pre = Conv1dCasual( + 80, h.upsample_initial_channel, 7, 1, padding=7 - 1) + resblock = ResBlock1 if h.resblock == '1' else ResBlock2 + + self.ups = nn.ModuleList() + self.repeat_ups = nn.ModuleList() + for i, (u, k) in enumerate( + zip(h.upsample_rates, h.upsample_kernel_sizes)): + upsample = nn.Sequential( + nn.Upsample(mode='nearest', scale_factor=u), + nn.LeakyReLU(LRELU_SLOPE), + Conv1dCasual( + h.upsample_initial_channel // (2**i), + h.upsample_initial_channel // (2**(i + 1)), + kernel_size=7, + stride=1, + padding=7 - 1)) + self.repeat_ups.append(upsample) + self.ups.append( + ConvTranspose1dCausal( + h.upsample_initial_channel // (2**i), + h.upsample_initial_channel // (2**(i + 1)), + k, + u, + padding=(k - u) // 2)) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = h.upsample_initial_channel // (2**(i + 1)) + for j, (k, d) in enumerate( + zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): + self.resblocks.append(resblock(h, ch, k, d)) + + self.conv_post = Conv1dCasual(ch, 1, 7, 1, padding=7 - 1) + + def forward(self, x): + x = self.conv_pre(x) + for i in range(self.num_upsamples): + x = torch.sin(x) + x + # transconv + x1 = F.leaky_relu(x, LRELU_SLOPE) + x1 = self.ups[i](x1) + # repeat + x2 = self.repeat_ups[i](x) + x = x1 + x2 + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + return x + + def remove_weight_norm(self): + logger.info('Removing weight norm...') + for layer in self.ups: + layer.remove_weight_norm() + for layer in self.repeat_ups: + layer[-1].remove_weight_norm() + for layer in self.resblocks: + layer.remove_weight_norm() + self.conv_pre.remove_weight_norm() + self.conv_post.remove_weight_norm() diff --git a/modelscope/models/audio/tts/models/models/sambert/__init__.py b/modelscope/models/audio/tts/models/models/sambert/__init__.py new file mode 100644 index 00000000..f0bf5290 --- /dev/null +++ 
b/modelscope/models/audio/tts/models/models/sambert/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from .kantts_sambert import * # noqa F403 diff --git a/modelscope/models/audio/tts/models/models/sambert/adaptors.py b/modelscope/models/audio/tts/models/models/sambert/adaptors.py new file mode 100644 index 00000000..c171a1db --- /dev/null +++ b/modelscope/models/audio/tts/models/models/sambert/adaptors.py @@ -0,0 +1,131 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .base import Prenet +from .fsmn import FsmnEncoderV2 + + +class LengthRegulator(nn.Module): + + def __init__(self, r=1): + super(LengthRegulator, self).__init__() + + self.r = r + + def forward(self, inputs, durations, masks=None): + reps = (durations + 0.5).long() + output_lens = reps.sum(dim=1) + max_len = output_lens.max() + reps_cumsum = torch.cumsum( + F.pad(reps.float(), (1, 0, 0, 0), value=0.0), dim=1)[:, None, :] + range_ = torch.arange(max_len).to(inputs.device)[None, :, None] + mult = ((reps_cumsum[:, :, :-1] <= range_) + & (reps_cumsum[:, :, 1:] > range_)) # yapf:disable + mult = mult.float() + out = torch.matmul(mult, inputs) + + if masks is not None: + out = out.masked_fill(masks.unsqueeze(-1), 0.0) + + seq_len = out.size(1) + padding = self.r - int(seq_len) % self.r + if (padding < self.r): + out = F.pad( + out.transpose(1, 2), (0, padding, 0, 0, 0, 0), value=0.0) + out = out.transpose(1, 2) + + return out, output_lens + + +class VarRnnARPredictor(nn.Module): + + def __init__(self, cond_units, prenet_units, rnn_units): + super(VarRnnARPredictor, self).__init__() + + self.prenet = Prenet(1, prenet_units) + self.lstm = nn.LSTM( + prenet_units[-1] + cond_units, + rnn_units, + num_layers=2, + batch_first=True, + bidirectional=False) + self.fc = nn.Linear(rnn_units, 1) + + def forward(self, inputs, cond, h=None, masks=None): + x = torch.cat([self.prenet(inputs), cond], dim=-1) + # The input can also be a packed variable length sequence, + # here we just omit it for simplicity due to the mask and uni-directional lstm. 
+ x, h_new = self.lstm(x, h) + + x = self.fc(x).squeeze(-1) + x = F.relu(x) + + if masks is not None: + x = x.masked_fill(masks, 0.0) + + return x, h_new + + def infer(self, cond, masks=None): + batch_size, length = cond.size(0), cond.size(1) + + output = [] + x = torch.zeros((batch_size, 1)).to(cond.device) + h = None + + for i in range(length): + x, h = self.forward(x.unsqueeze(1), cond[:, i:i + 1, :], h=h) + output.append(x) + + output = torch.cat(output, dim=-1) + + if masks is not None: + output = output.masked_fill(masks, 0.0) + + return output + + +class VarFsmnRnnNARPredictor(nn.Module): + + def __init__(self, in_dim, filter_size, fsmn_num_layers, num_memory_units, + ffn_inner_dim, dropout, shift, lstm_units): + super(VarFsmnRnnNARPredictor, self).__init__() + + self.fsmn = FsmnEncoderV2(filter_size, fsmn_num_layers, in_dim, + num_memory_units, ffn_inner_dim, dropout, + shift) + self.blstm = nn.LSTM( + num_memory_units, + lstm_units, + num_layers=1, + batch_first=True, + bidirectional=True) + self.fc = nn.Linear(2 * lstm_units, 1) + + def forward(self, inputs, masks=None): + input_lengths = None + if masks is not None: + input_lengths = torch.sum((~masks).float(), dim=1).long() + + x = self.fsmn(inputs, masks) + + if input_lengths is not None: + x = nn.utils.rnn.pack_padded_sequence( + x, + input_lengths.tolist(), + batch_first=True, + enforce_sorted=False) + x, _ = self.blstm(x) + x, _ = nn.utils.rnn.pad_packed_sequence( + x, batch_first=True, total_length=inputs.size(1)) + else: + x, _ = self.blstm(x) + + x = self.fc(x).squeeze(-1) + + if masks is not None: + x = x.masked_fill(masks, 0.0) + + return x diff --git a/modelscope/models/audio/tts/models/models/sambert/base.py b/modelscope/models/audio/tts/models/models/sambert/base.py new file mode 100644 index 00000000..873aecbf --- /dev/null +++ b/modelscope/models/audio/tts/models/models/sambert/base.py @@ -0,0 +1,369 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ScaledDotProductAttention(nn.Module): + """ Scaled Dot-Product Attention """ + + def __init__(self, temperature, dropatt=0.0): + super().__init__() + self.temperature = temperature + self.softmax = nn.Softmax(dim=2) + self.dropatt = nn.Dropout(dropatt) + + def forward(self, q, k, v, mask=None): + + attn = torch.bmm(q, k.transpose(1, 2)) + attn = attn / self.temperature + + if mask is not None: + attn = attn.masked_fill(mask, -np.inf) + + attn = self.softmax(attn) + attn = self.dropatt(attn) + output = torch.bmm(attn, v) + + return output, attn + + +class Prenet(nn.Module): + + def __init__(self, in_units, prenet_units, out_units=0): + super(Prenet, self).__init__() + + self.fcs = nn.ModuleList() + for in_dim, out_dim in zip([in_units] + prenet_units[:-1], + prenet_units): + self.fcs.append(nn.Linear(in_dim, out_dim)) + self.fcs.append(nn.ReLU()) + self.fcs.append(nn.Dropout(0.5)) + + if (out_units): + self.fcs.append(nn.Linear(prenet_units[-1], out_units)) + + def forward(self, input): + output = input + for layer in self.fcs: + output = layer(output) + return output + + +class MultiHeadSelfAttention(nn.Module): + """ Multi-Head SelfAttention module """ + + def __init__(self, n_head, d_in, d_model, d_head, dropout, dropatt=0.0): + super().__init__() + + self.n_head = n_head + self.d_head = d_head + self.d_in = d_in + self.d_model = d_model + + self.layer_norm = nn.LayerNorm(d_in, eps=1e-6) + self.w_qkv = nn.Linear(d_in, 3 * n_head * d_head) + + self.attention = ScaledDotProductAttention( + temperature=np.power(d_head, 0.5), dropatt=dropatt) + + self.fc = nn.Linear(n_head * d_head, d_model) + + self.dropout = nn.Dropout(dropout) + + def forward(self, input, mask=None): + d_head, n_head = self.d_head, self.n_head + + sz_b, len_in, _ = input.size() + + residual = input + + x = self.layer_norm(input) + qkv = self.w_qkv(x) + q, k, v = qkv.chunk(3, -1) + + q = q.view(sz_b, len_in, n_head, d_head) + k = k.view(sz_b, len_in, n_head, d_head) + v = v.view(sz_b, len_in, n_head, d_head) + + q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_in, + d_head) # (n*b) x l x d + k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_in, + d_head) # (n*b) x l x d + v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_in, + d_head) # (n*b) x l x d + + if mask is not None: + mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x .. 
+ output, attn = self.attention(q, k, v, mask=mask) + + output = output.view(n_head, sz_b, len_in, d_head) + output = (output.permute(1, 2, 0, + 3).contiguous().view(sz_b, len_in, + -1)) # b x l x (n*d) + + output = self.dropout(self.fc(output)) + if (output.size(-1) == residual.size(-1)): + output = output + residual + + return output, attn + + +class PositionwiseConvFeedForward(nn.Module): + """ A two-feed-forward-layer module """ + + def __init__(self, + d_in, + d_hid, + kernel_size=(3, 1), + dropout_inner=0.1, + dropout=0.1): + super().__init__() + # Use Conv1D + # position-wise + self.w_1 = nn.Conv1d( + d_in, + d_hid, + kernel_size=kernel_size[0], + padding=(kernel_size[0] - 1) // 2, + ) + # position-wise + self.w_2 = nn.Conv1d( + d_hid, + d_in, + kernel_size=kernel_size[1], + padding=(kernel_size[1] - 1) // 2, + ) + + self.layer_norm = nn.LayerNorm(d_in, eps=1e-6) + self.dropout_inner = nn.Dropout(dropout_inner) + self.dropout = nn.Dropout(dropout) + + def forward(self, x, mask=None): + residual = x + x = self.layer_norm(x) + + output = x.transpose(1, 2) + output = F.relu(self.w_1(output)) + if mask is not None: + output = output.masked_fill(mask.unsqueeze(1), 0) + output = self.dropout_inner(output) + output = self.w_2(output) + output = output.transpose(1, 2) + output = self.dropout(output) + + output = output + residual + + return output + + +class FFTBlock(nn.Module): + """FFT Block""" + + def __init__(self, + d_in, + d_model, + n_head, + d_head, + d_inner, + kernel_size, + dropout, + dropout_attn=0.0, + dropout_relu=0.0): + super(FFTBlock, self).__init__() + self.slf_attn = MultiHeadSelfAttention( + n_head, + d_in, + d_model, + d_head, + dropout=dropout, + dropatt=dropout_attn) + self.pos_ffn = PositionwiseConvFeedForward( + d_model, + d_inner, + kernel_size, + dropout_inner=dropout_relu, + dropout=dropout) + + def forward(self, input, mask=None, slf_attn_mask=None): + output, slf_attn = self.slf_attn(input, mask=slf_attn_mask) + if mask is not None: + output = output.masked_fill(mask.unsqueeze(-1), 0) + + output = self.pos_ffn(output, mask=mask) + if mask is not None: + output = output.masked_fill(mask.unsqueeze(-1), 0) + + return output, slf_attn + + +class MultiHeadPNCAAttention(nn.Module): + """ Multi-Head Attention PNCA module """ + + def __init__(self, n_head, d_model, d_mem, d_head, dropout, dropatt=0.0): + super().__init__() + + self.n_head = n_head + self.d_head = d_head + self.d_model = d_model + self.d_mem = d_mem + + self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) + + self.w_x_qkv = nn.Linear(d_model, 3 * n_head * d_head) + self.fc_x = nn.Linear(n_head * d_head, d_model) + + self.w_h_kv = nn.Linear(d_mem, 2 * n_head * d_head) + self.fc_h = nn.Linear(n_head * d_head, d_model) + + self.attention = ScaledDotProductAttention( + temperature=np.power(d_head, 0.5), dropatt=dropatt) + + self.dropout = nn.Dropout(dropout) + + def update_x_state(self, x): + d_head, n_head = self.d_head, self.n_head + + sz_b, len_x, _ = x.size() + + x_qkv = self.w_x_qkv(x) + x_q, x_k, x_v = x_qkv.chunk(3, -1) + + x_q = x_q.view(sz_b, len_x, n_head, d_head) + x_k = x_k.view(sz_b, len_x, n_head, d_head) + x_v = x_v.view(sz_b, len_x, n_head, d_head) + + x_q = x_q.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_head) + x_k = x_k.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_head) + x_v = x_v.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_head) + + if (self.x_state_size): + self.x_k = torch.cat([self.x_k, x_k], dim=1) + self.x_v = torch.cat([self.x_v, x_v], dim=1) + else: + self.x_k = 
x_k + self.x_v = x_v + + self.x_state_size += len_x + + return x_q, x_k, x_v + + def update_h_state(self, h): + if (self.h_state_size == h.size(1)): + return None, None + + d_head, n_head = self.d_head, self.n_head + + # H + sz_b, len_h, _ = h.size() + + h_kv = self.w_h_kv(h) + h_k, h_v = h_kv.chunk(2, -1) + + h_k = h_k.view(sz_b, len_h, n_head, d_head) + h_v = h_v.view(sz_b, len_h, n_head, d_head) + + self.h_k = h_k.permute(2, 0, 1, 3).contiguous().view(-1, len_h, d_head) + self.h_v = h_v.permute(2, 0, 1, 3).contiguous().view(-1, len_h, d_head) + + self.h_state_size += len_h + + return h_k, h_v + + def reset_state(self): + self.h_k = None + self.h_v = None + self.h_state_size = 0 + self.x_k = None + self.x_v = None + self.x_state_size = 0 + + def forward(self, x, h, mask_x=None, mask_h=None): + residual = x + self.update_h_state(h) + x_q, x_k, x_v = self.update_x_state(self.layer_norm(x)) + + d_head, n_head = self.d_head, self.n_head + + sz_b, len_in, _ = x.size() + + # X + if mask_x is not None: + mask_x = mask_x.repeat(n_head, 1, 1) # (n*b) x .. x .. + output_x, attn_x = self.attention(x_q, self.x_k, self.x_v, mask=mask_x) + + output_x = output_x.view(n_head, sz_b, len_in, d_head) + output_x = (output_x.permute(1, 2, 0, + 3).contiguous().view(sz_b, len_in, + -1)) # b x l x (n*d) + output_x = self.fc_x(output_x) + + # H + if mask_h is not None: + mask_h = mask_h.repeat(n_head, 1, 1) + output_h, attn_h = self.attention(x_q, self.h_k, self.h_v, mask=mask_h) + + output_h = output_h.view(n_head, sz_b, len_in, d_head) + output_h = (output_h.permute(1, 2, 0, + 3).contiguous().view(sz_b, len_in, + -1)) # b x l x (n*d) + output_h = self.fc_h(output_h) + + output = output_x + output_h + + output = self.dropout(output) + + output = output + residual + + return output, attn_x, attn_h + + +class PNCABlock(nn.Module): + """PNCA Block""" + + def __init__(self, + d_model, + d_mem, + n_head, + d_head, + d_inner, + kernel_size, + dropout, + dropout_attn=0.0, + dropout_relu=0.0): + super(PNCABlock, self).__init__() + self.pnca_attn = MultiHeadPNCAAttention( + n_head, + d_model, + d_mem, + d_head, + dropout=dropout, + dropatt=dropout_attn) + self.pos_ffn = PositionwiseConvFeedForward( + d_model, + d_inner, + kernel_size, + dropout_inner=dropout_relu, + dropout=dropout) + + def forward(self, + input, + memory, + mask=None, + pnca_x_attn_mask=None, + pnca_h_attn_mask=None): + output, pnca_attn_x, pnca_attn_h = self.pnca_attn( + input, memory, pnca_x_attn_mask, pnca_h_attn_mask) + if mask is not None: + output = output.masked_fill(mask.unsqueeze(-1), 0) + + output = self.pos_ffn(output, mask=mask) + if mask is not None: + output = output.masked_fill(mask.unsqueeze(-1), 0) + + return output, pnca_attn_x, pnca_attn_h + + def reset_state(self): + self.pnca_attn.reset_state() diff --git a/modelscope/models/audio/tts/models/models/sambert/fsmn.py b/modelscope/models/audio/tts/models/models/sambert/fsmn.py new file mode 100644 index 00000000..c070ef35 --- /dev/null +++ b/modelscope/models/audio/tts/models/models/sambert/fsmn.py @@ -0,0 +1,126 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+""" +FSMN Pytorch Version +""" +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class FeedForwardNet(nn.Module): + """ A two-feed-forward-layer module """ + + def __init__(self, d_in, d_hid, d_out, kernel_size=[1, 1], dropout=0.1): + super().__init__() + + # Use Conv1D + # position-wise + self.w_1 = nn.Conv1d( + d_in, + d_hid, + kernel_size=kernel_size[0], + padding=(kernel_size[0] - 1) // 2, + ) + # position-wise + self.w_2 = nn.Conv1d( + d_hid, + d_out, + kernel_size=kernel_size[1], + padding=(kernel_size[1] - 1) // 2, + bias=False) + + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + output = x.transpose(1, 2) + output = F.relu(self.w_1(output)) + output = self.dropout(output) + output = self.w_2(output) + output = output.transpose(1, 2) + + return output + + +class MemoryBlockV2(nn.Module): + + def __init__(self, d, filter_size, shift, dropout=0.0): + super(MemoryBlockV2, self).__init__() + + left_padding = int(round((filter_size - 1) / 2)) + right_padding = int((filter_size - 1) / 2) + if shift > 0: + left_padding += shift + right_padding -= shift + + self.lp, self.rp = left_padding, right_padding + + self.conv_dw = nn.Conv1d(d, d, filter_size, 1, 0, groups=d, bias=False) + self.dropout = nn.Dropout(dropout) + + def forward(self, input, mask=None): + if mask is not None: + input = input.masked_fill(mask.unsqueeze(-1), 0) + + x = F.pad( + input, (0, 0, self.lp, self.rp, 0, 0), mode='constant', value=0.0) + output = self.conv_dw(x.contiguous().transpose( + 1, 2)).contiguous().transpose(1, 2) + output += input + output = self.dropout(output) + + if mask is not None: + output = output.masked_fill(mask.unsqueeze(-1), 0) + + return output + + +class FsmnEncoderV2(nn.Module): + + def __init__(self, + filter_size, + fsmn_num_layers, + input_dim, + num_memory_units, + ffn_inner_dim, + dropout=0.0, + shift=0): + super(FsmnEncoderV2, self).__init__() + + self.filter_size = filter_size + self.fsmn_num_layers = fsmn_num_layers + self.num_memory_units = num_memory_units + self.ffn_inner_dim = ffn_inner_dim + self.dropout = dropout + self.shift = shift + if not isinstance(shift, list): + self.shift = [shift for _ in range(self.fsmn_num_layers)] + + self.ffn_lst = nn.ModuleList() + self.ffn_lst.append( + FeedForwardNet( + input_dim, ffn_inner_dim, num_memory_units, dropout=dropout)) + for i in range(1, fsmn_num_layers): + self.ffn_lst.append( + FeedForwardNet( + num_memory_units, + ffn_inner_dim, + num_memory_units, + dropout=dropout)) + + self.memory_block_lst = nn.ModuleList() + for i in range(fsmn_num_layers): + self.memory_block_lst.append( + MemoryBlockV2(num_memory_units, filter_size, self.shift[i], + dropout)) + + def forward(self, input, mask=None): + x = F.dropout(input, self.dropout, self.training) + for (ffn, memory_block) in zip(self.ffn_lst, self.memory_block_lst): + context = ffn(x) + memory = memory_block(context, mask) + memory = F.dropout(memory, self.dropout, self.training) + if (memory.size(-1) == x.size(-1)): + memory += x + x = memory + + return x diff --git a/modelscope/models/audio/tts/models/models/sambert/kantts_sambert.py b/modelscope/models/audio/tts/models/models/sambert/kantts_sambert.py new file mode 100644 index 00000000..3837a2e8 --- /dev/null +++ b/modelscope/models/audio/tts/models/models/sambert/kantts_sambert.py @@ -0,0 +1,718 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
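+# KAN-TTS SAMBERT acoustic model: a self-attention (FFT-block) text encoder,
+# a variance adaptor (duration/pitch/energy prediction with length regulation),
+# a PNCA-attention mel decoder and an FSMN+LSTM postnet, assembled in
+# KanTtsSAMBERT below.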
+ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.models.audio.tts.models.utils import get_mask_from_lengths +from .adaptors import (LengthRegulator, VarFsmnRnnNARPredictor, + VarRnnARPredictor) +from .base import FFTBlock, PNCABlock, Prenet +from .fsmn import FsmnEncoderV2 +from .positions import DurSinusoidalPositionEncoder, SinusoidalPositionEncoder + + +class SelfAttentionEncoder(nn.Module): + + def __init__(self, n_layer, d_in, d_model, n_head, d_head, d_inner, + dropout, dropout_att, dropout_relu, position_encoder): + super(SelfAttentionEncoder, self).__init__() + + self.d_in = d_in + self.d_model = d_model + self.dropout = dropout + d_in_lst = [d_in] + [d_model] * (n_layer - 1) + self.fft = nn.ModuleList([ + FFTBlock(d, d_model, n_head, d_head, d_inner, (3, 1), dropout, + dropout_att, dropout_relu) for d in d_in_lst + ]) + self.ln = nn.LayerNorm(d_model, eps=1e-6) + self.position_enc = position_encoder + + def forward(self, input, mask=None, return_attns=False): + input *= self.d_model**0.5 + if (isinstance(self.position_enc, SinusoidalPositionEncoder)): + input = self.position_enc(input) + else: + raise NotImplementedError('modelscope error: position_enc invalid') + + input = F.dropout(input, p=self.dropout, training=self.training) + + enc_slf_attn_list = [] + max_len = input.size(1) + if mask is not None: + slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1) + else: + slf_attn_mask = None + + enc_output = input + for id, layer in enumerate(self.fft): + enc_output, enc_slf_attn = layer( + enc_output, mask=mask, slf_attn_mask=slf_attn_mask) + if return_attns: + enc_slf_attn_list += [enc_slf_attn] + + enc_output = self.ln(enc_output) + + return enc_output, enc_slf_attn_list + + +class HybridAttentionDecoder(nn.Module): + + def __init__(self, d_in, prenet_units, n_layer, d_model, d_mem, n_head, + d_head, d_inner, dropout, dropout_att, dropout_relu, d_out): + super(HybridAttentionDecoder, self).__init__() + + self.d_model = d_model + self.dropout = dropout + self.prenet = Prenet(d_in, prenet_units, d_model) + self.dec_in_proj = nn.Linear(d_model + d_mem, d_model) + self.pnca = nn.ModuleList([ + PNCABlock(d_model, d_mem, n_head, d_head, d_inner, (1, 1), dropout, + dropout_att, dropout_relu) for _ in range(n_layer) + ]) + self.ln = nn.LayerNorm(d_model, eps=1e-6) + self.dec_out_proj = nn.Linear(d_model, d_out) + + def reset_state(self): + for layer in self.pnca: + layer.reset_state() + + def get_pnca_attn_mask(self, + device, + max_len, + x_band_width, + h_band_width, + mask=None): + if mask is not None: + pnca_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1) + else: + pnca_attn_mask = None + + range_ = torch.arange(max_len).to(device) + x_start = torch.clamp_min(range_ - x_band_width, 0)[None, None, :] + x_end = (range_ + 1)[None, None, :] + h_start = range_[None, None, :] + h_end = torch.clamp_max(range_ + h_band_width + 1, + max_len + 1)[None, None, :] + + pnca_x_attn_mask = ~((x_start <= range_[None, :, None]) + & (x_end > range_[None, :, None])).transpose(1, 2) # yapf:disable + pnca_h_attn_mask = ~((h_start <= range_[None, :, None]) + & (h_end > range_[None, :, None])).transpose(1, 2) # yapf:disable + + if pnca_attn_mask is not None: + pnca_x_attn_mask = (pnca_x_attn_mask | pnca_attn_mask) + pnca_h_attn_mask = (pnca_h_attn_mask | pnca_attn_mask) + pnca_x_attn_mask = pnca_x_attn_mask.masked_fill( + pnca_attn_mask.transpose(1, 2), False) + pnca_h_attn_mask = pnca_h_attn_mask.masked_fill( + 
pnca_attn_mask.transpose(1, 2), False) + + return pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask + + # must call reset_state before + def forward(self, + input, + memory, + x_band_width, + h_band_width, + mask=None, + return_attns=False): + input = self.prenet(input) + input = torch.cat([memory, input], dim=-1) + input = self.dec_in_proj(input) + + if mask is not None: + input = input.masked_fill(mask.unsqueeze(-1), 0) + + input *= self.d_model**0.5 + input = F.dropout(input, p=self.dropout, training=self.training) + + max_len = input.size(1) + pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask = self.get_pnca_attn_mask( + input.device, max_len, x_band_width, h_band_width, mask) + + dec_pnca_attn_x_list = [] + dec_pnca_attn_h_list = [] + dec_output = input + for id, layer in enumerate(self.pnca): + dec_output, dec_pnca_attn_x, dec_pnca_attn_h = layer( + dec_output, + memory, + mask=mask, + pnca_x_attn_mask=pnca_x_attn_mask, + pnca_h_attn_mask=pnca_h_attn_mask) + if return_attns: + dec_pnca_attn_x_list += [dec_pnca_attn_x] + dec_pnca_attn_h_list += [dec_pnca_attn_h] + + dec_output = self.ln(dec_output) + dec_output = self.dec_out_proj(dec_output) + + return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list + + # must call reset_state before when step == 0 + def infer(self, + step, + input, + memory, + x_band_width, + h_band_width, + mask=None, + return_attns=False): + max_len = memory.size(1) + + input = self.prenet(input) + input = torch.cat([memory[:, step:step + 1, :], input], dim=-1) + input = self.dec_in_proj(input) + + input *= self.d_model**0.5 + input = F.dropout(input, p=self.dropout, training=self.training) + + pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask = self.get_pnca_attn_mask( + input.device, max_len, x_band_width, h_band_width, mask) + + dec_pnca_attn_x_list = [] + dec_pnca_attn_h_list = [] + dec_output = input + for id, layer in enumerate(self.pnca): + if mask is not None: + mask_step = mask[:, step:step + 1] + else: + mask_step = None + dec_output, dec_pnca_attn_x, dec_pnca_attn_h = layer( + dec_output, + memory, + mask=mask_step, + pnca_x_attn_mask=pnca_x_attn_mask[:, + step:step + 1, :(step + 1)], + pnca_h_attn_mask=pnca_h_attn_mask[:, step:step + 1, :]) + if return_attns: + dec_pnca_attn_x_list += [dec_pnca_attn_x] + dec_pnca_attn_h_list += [dec_pnca_attn_h] + + dec_output = self.ln(dec_output) + dec_output = self.dec_out_proj(dec_output) + + return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list + + +class TextFftEncoder(nn.Module): + + def __init__(self, config, ling_unit_size): + super(TextFftEncoder, self).__init__() + + # linguistic unit lookup table + nb_ling_sy = ling_unit_size['sy'] + nb_ling_tone = ling_unit_size['tone'] + nb_ling_syllable_flag = ling_unit_size['syllable_flag'] + nb_ling_ws = ling_unit_size['word_segment'] + + max_len = config['am']['max_len'] + + d_emb = config['am']['embedding_dim'] + nb_layers = config['am']['encoder_num_layers'] + nb_heads = config['am']['encoder_num_heads'] + d_model = config['am']['encoder_num_units'] + d_head = d_model // nb_heads + d_inner = config['am']['encoder_ffn_inner_dim'] + dropout = config['am']['encoder_dropout'] + dropout_attn = config['am']['encoder_attention_dropout'] + dropout_relu = config['am']['encoder_relu_dropout'] + d_proj = config['am']['encoder_projection_units'] + + self.d_model = d_model + + self.sy_emb = nn.Embedding(nb_ling_sy, d_emb) + self.tone_emb = nn.Embedding(nb_ling_tone, d_emb) + self.syllable_flag_emb = nn.Embedding(nb_ling_syllable_flag, d_emb) + self.ws_emb = 
nn.Embedding(nb_ling_ws, d_emb) + + position_enc = SinusoidalPositionEncoder(max_len, d_emb) + + self.ling_enc = SelfAttentionEncoder(nb_layers, d_emb, d_model, + nb_heads, d_head, d_inner, + dropout, dropout_attn, + dropout_relu, position_enc) + + self.ling_proj = nn.Linear(d_model, d_proj, bias=False) + + def forward(self, inputs_ling, masks=None, return_attns=False): + # Parse inputs_ling_seq + inputs_sy = inputs_ling[:, :, 0] + inputs_tone = inputs_ling[:, :, 1] + inputs_syllable_flag = inputs_ling[:, :, 2] + inputs_ws = inputs_ling[:, :, 3] + + # Lookup table + sy_embedding = self.sy_emb(inputs_sy) + tone_embedding = self.tone_emb(inputs_tone) + syllable_flag_embedding = self.syllable_flag_emb(inputs_syllable_flag) + ws_embedding = self.ws_emb(inputs_ws) + + ling_embedding = sy_embedding + tone_embedding + syllable_flag_embedding + ws_embedding + + enc_output, enc_slf_attn_list = self.ling_enc(ling_embedding, masks, + return_attns) + + enc_output = self.ling_proj(enc_output) + + return enc_output, enc_slf_attn_list + + +class VarianceAdaptor(nn.Module): + + def __init__(self, config): + super(VarianceAdaptor, self).__init__() + + input_dim = config['am']['encoder_projection_units'] + config['am'][ + 'emotion_units'] + config['am']['speaker_units'] + filter_size = config['am']['predictor_filter_size'] + fsmn_num_layers = config['am']['predictor_fsmn_num_layers'] + num_memory_units = config['am']['predictor_num_memory_units'] + ffn_inner_dim = config['am']['predictor_ffn_inner_dim'] + dropout = config['am']['predictor_dropout'] + shift = config['am']['predictor_shift'] + lstm_units = config['am']['predictor_lstm_units'] + + dur_pred_prenet_units = config['am']['dur_pred_prenet_units'] + dur_pred_lstm_units = config['am']['dur_pred_lstm_units'] + + self.pitch_predictor = VarFsmnRnnNARPredictor(input_dim, filter_size, + fsmn_num_layers, + num_memory_units, + ffn_inner_dim, dropout, + shift, lstm_units) + self.energy_predictor = VarFsmnRnnNARPredictor(input_dim, filter_size, + fsmn_num_layers, + num_memory_units, + ffn_inner_dim, dropout, + shift, lstm_units) + self.duration_predictor = VarRnnARPredictor(input_dim, + dur_pred_prenet_units, + dur_pred_lstm_units) + + self.length_regulator = LengthRegulator( + config['am']['outputs_per_step']) + self.dur_position_encoder = DurSinusoidalPositionEncoder( + config['am']['encoder_projection_units'], + config['am']['outputs_per_step']) + + self.pitch_emb = nn.Conv1d( + 1, + config['am']['encoder_projection_units'], + kernel_size=9, + padding=4) + self.energy_emb = nn.Conv1d( + 1, + config['am']['encoder_projection_units'], + kernel_size=9, + padding=4) + + def forward(self, + inputs_text_embedding, + inputs_emo_embedding, + inputs_spk_embedding, + masks=None, + output_masks=None, + duration_targets=None, + pitch_targets=None, + energy_targets=None): + + batch_size = inputs_text_embedding.size(0) + + variance_predictor_inputs = torch.cat([ + inputs_text_embedding, inputs_spk_embedding, inputs_emo_embedding + ], dim=-1) # yapf:disable + + pitch_predictions = self.pitch_predictor(variance_predictor_inputs, + masks) + energy_predictions = self.energy_predictor(variance_predictor_inputs, + masks) + + if pitch_targets is not None: + pitch_embeddings = self.pitch_emb( + pitch_targets.unsqueeze(1)).transpose(1, 2) + else: + pitch_embeddings = self.pitch_emb( + pitch_predictions.unsqueeze(1)).transpose(1, 2) + + if energy_targets is not None: + energy_embeddings = self.energy_emb( + energy_targets.unsqueeze(1)).transpose(1, 2) + else: + energy_embeddings 
= self.energy_emb( + energy_predictions.unsqueeze(1)).transpose(1, 2) + + inputs_text_embedding_aug = inputs_text_embedding + pitch_embeddings + energy_embeddings + duration_predictor_cond = torch.cat([ + inputs_text_embedding_aug, inputs_spk_embedding, + inputs_emo_embedding + ], dim=-1) # yapf:disable + if duration_targets is not None: + duration_predictor_go_frame = torch.zeros(batch_size, 1).to( + inputs_text_embedding.device) + duration_predictor_input = torch.cat([ + duration_predictor_go_frame, duration_targets[:, :-1].float() + ], dim=-1) # yapf:disable + duration_predictor_input = torch.log(duration_predictor_input + 1) + log_duration_predictions, _ = self.duration_predictor( + duration_predictor_input.unsqueeze(-1), + duration_predictor_cond, + masks=masks) + duration_predictions = torch.exp(log_duration_predictions) - 1 + else: + log_duration_predictions = self.duration_predictor.infer( + duration_predictor_cond, masks=masks) + duration_predictions = torch.exp(log_duration_predictions) - 1 + + if duration_targets is not None: + LR_text_outputs, LR_length_rounded = self.length_regulator( + inputs_text_embedding_aug, + duration_targets, + masks=output_masks) + LR_position_embeddings = self.dur_position_encoder( + duration_targets, masks=output_masks) + LR_emo_outputs, _ = self.length_regulator( + inputs_emo_embedding, duration_targets, masks=output_masks) + LR_spk_outputs, _ = self.length_regulator( + inputs_spk_embedding, duration_targets, masks=output_masks) + + else: + LR_text_outputs, LR_length_rounded = self.length_regulator( + inputs_text_embedding_aug, + duration_predictions, + masks=output_masks) + LR_position_embeddings = self.dur_position_encoder( + duration_predictions, masks=output_masks) + LR_emo_outputs, _ = self.length_regulator( + inputs_emo_embedding, duration_predictions, masks=output_masks) + LR_spk_outputs, _ = self.length_regulator( + inputs_spk_embedding, duration_predictions, masks=output_masks) + + LR_text_outputs = LR_text_outputs + LR_position_embeddings + + return (LR_text_outputs, LR_emo_outputs, LR_spk_outputs, + LR_length_rounded, log_duration_predictions, pitch_predictions, + energy_predictions) + + +class MelPNCADecoder(nn.Module): + + def __init__(self, config): + super(MelPNCADecoder, self).__init__() + + prenet_units = config['am']['decoder_prenet_units'] + nb_layers = config['am']['decoder_num_layers'] + nb_heads = config['am']['decoder_num_heads'] + d_model = config['am']['decoder_num_units'] + d_head = d_model // nb_heads + d_inner = config['am']['decoder_ffn_inner_dim'] + dropout = config['am']['decoder_dropout'] + dropout_attn = config['am']['decoder_attention_dropout'] + dropout_relu = config['am']['decoder_relu_dropout'] + outputs_per_step = config['am']['outputs_per_step'] + + d_mem = config['am'][ + 'encoder_projection_units'] * outputs_per_step + config['am'][ + 'emotion_units'] + config['am']['speaker_units'] + d_mel = config['am']['num_mels'] + + self.d_mel = d_mel + self.r = outputs_per_step + self.nb_layers = nb_layers + + self.mel_dec = HybridAttentionDecoder(d_mel, prenet_units, nb_layers, + d_model, d_mem, nb_heads, d_head, + d_inner, dropout, dropout_attn, + dropout_relu, + d_mel * outputs_per_step) + + def forward(self, + memory, + x_band_width, + h_band_width, + target=None, + mask=None, + return_attns=False): + batch_size = memory.size(0) + go_frame = torch.zeros((batch_size, 1, self.d_mel)).to(memory.device) + + if target is not None: + self.mel_dec.reset_state() + input = target[:, self.r - 1::self.r, :] + input = 
torch.cat([go_frame, input], dim=1)[:, :-1, :] + dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list = self.mel_dec( + input, + memory, + x_band_width, + h_band_width, + mask=mask, + return_attns=return_attns) + + else: + dec_output = [] + dec_pnca_attn_x_list = [[] for _ in range(self.nb_layers)] + dec_pnca_attn_h_list = [[] for _ in range(self.nb_layers)] + self.mel_dec.reset_state() + input = go_frame + for step in range(memory.size(1)): + dec_output_step, dec_pnca_attn_x_step, dec_pnca_attn_h_step = self.mel_dec.infer( + step, + input, + memory, + x_band_width, + h_band_width, + mask=mask, + return_attns=return_attns) + input = dec_output_step[:, :, -self.d_mel:] + + dec_output.append(dec_output_step) + for layer_id, (pnca_x_attn, pnca_h_attn) in enumerate( + zip(dec_pnca_attn_x_step, dec_pnca_attn_h_step)): + left = memory.size(1) - pnca_x_attn.size(-1) + if (left > 0): + padding = torch.zeros( + (pnca_x_attn.size(0), 1, left)).to(pnca_x_attn) + pnca_x_attn = torch.cat([pnca_x_attn, padding], dim=-1) + dec_pnca_attn_x_list[layer_id].append(pnca_x_attn) + dec_pnca_attn_h_list[layer_id].append(pnca_h_attn) + + dec_output = torch.cat(dec_output, dim=1) + for layer_id in range(self.nb_layers): + dec_pnca_attn_x_list[layer_id] = torch.cat( + dec_pnca_attn_x_list[layer_id], dim=1) + dec_pnca_attn_h_list[layer_id] = torch.cat( + dec_pnca_attn_h_list[layer_id], dim=1) + + return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list + + +class PostNet(nn.Module): + + def __init__(self, config): + super(PostNet, self).__init__() + + self.filter_size = config['am']['postnet_filter_size'] + self.fsmn_num_layers = config['am']['postnet_fsmn_num_layers'] + self.num_memory_units = config['am']['postnet_num_memory_units'] + self.ffn_inner_dim = config['am']['postnet_ffn_inner_dim'] + self.dropout = config['am']['postnet_dropout'] + self.shift = config['am']['postnet_shift'] + self.lstm_units = config['am']['postnet_lstm_units'] + self.num_mels = config['am']['num_mels'] + + self.fsmn = FsmnEncoderV2(self.filter_size, self.fsmn_num_layers, + self.num_mels, self.num_memory_units, + self.ffn_inner_dim, self.dropout, self.shift) + self.lstm = nn.LSTM( + self.num_memory_units, + self.lstm_units, + num_layers=1, + batch_first=True) + self.fc = nn.Linear(self.lstm_units, self.num_mels) + + def forward(self, x, mask=None): + postnet_fsmn_output = self.fsmn(x, mask) + # The input can also be a packed variable length sequence, + # here we just omit it for simpliciy due to the mask and uni-directional lstm. 
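+        # Padding sits at the tail of each sequence and the LSTM is uni-directional,
+        # so padded frames only affect outputs that the caller masks out afterwards.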
+ postnet_lstm_output, _ = self.lstm(postnet_fsmn_output) + mel_residual_output = self.fc(postnet_lstm_output) + + return mel_residual_output + + +def mel_recon_loss_fn(output_lengths, + mel_targets, + dec_outputs, + postnet_outputs=None): + mae_loss = nn.L1Loss(reduction='none') + + output_masks = get_mask_from_lengths( + output_lengths, max_len=mel_targets.size(1)) + output_masks = ~output_masks + valid_outputs = output_masks.sum() + + mel_loss_ = torch.sum( + mae_loss(mel_targets, dec_outputs) * output_masks.unsqueeze(-1)) / ( + valid_outputs * mel_targets.size(-1)) + + if postnet_outputs is not None: + mel_loss = torch.sum( + mae_loss(mel_targets, postnet_outputs) + * output_masks.unsqueeze(-1)) / ( + valid_outputs * mel_targets.size(-1)) + else: + mel_loss = 0.0 + + return mel_loss_, mel_loss + + +def prosody_recon_loss_fn(input_lengths, duration_targets, pitch_targets, + energy_targets, log_duration_predictions, + pitch_predictions, energy_predictions): + mae_loss = nn.L1Loss(reduction='none') + + input_masks = get_mask_from_lengths( + input_lengths, max_len=duration_targets.size(1)) + input_masks = ~input_masks + valid_inputs = input_masks.sum() + + dur_loss = torch.sum( + mae_loss( + torch.log(duration_targets.float() + 1), log_duration_predictions) + * input_masks) / valid_inputs + pitch_loss = torch.sum( + mae_loss(pitch_targets, pitch_predictions) + * input_masks) / valid_inputs + energy_loss = torch.sum( + mae_loss(energy_targets, energy_predictions) + * input_masks) / valid_inputs + + return dur_loss, pitch_loss, energy_loss + + +class KanTtsSAMBERT(nn.Module): + + def __init__(self, config, ling_unit_size): + super(KanTtsSAMBERT, self).__init__() + + self.text_encoder = TextFftEncoder(config, ling_unit_size) + self.spk_tokenizer = nn.Embedding(ling_unit_size['speaker'], + config['am']['speaker_units']) + self.emo_tokenizer = nn.Embedding(ling_unit_size['emotion'], + config['am']['emotion_units']) + self.variance_adaptor = VarianceAdaptor(config) + self.mel_decoder = MelPNCADecoder(config) + self.mel_postnet = PostNet(config) + + def get_lfr_mask_from_lengths(self, lengths, max_len): + batch_size = lengths.size(0) + # padding according to the outputs_per_step + padded_lr_lengths = torch.zeros_like(lengths) + for i in range(batch_size): + len_item = int(lengths[i].item()) + padding = self.mel_decoder.r - len_item % self.mel_decoder.r + if (padding < self.mel_decoder.r): + padded_lr_lengths[i] = (len_item + + padding) // self.mel_decoder.r + else: + padded_lr_lengths[i] = len_item // self.mel_decoder.r + + return get_mask_from_lengths( + padded_lr_lengths, max_len=max_len // self.mel_decoder.r) + + def forward(self, + inputs_ling, + inputs_emotion, + inputs_speaker, + input_lengths, + output_lengths=None, + mel_targets=None, + duration_targets=None, + pitch_targets=None, + energy_targets=None): + + batch_size = inputs_ling.size(0) + + input_masks = get_mask_from_lengths( + input_lengths, max_len=inputs_ling.size(1)) + + text_hid, enc_sla_attn_lst = self.text_encoder( + inputs_ling, input_masks, return_attns=True) + + emo_hid = self.emo_tokenizer(inputs_emotion) + spk_hid = self.spk_tokenizer(inputs_speaker) + + if output_lengths is not None: + output_masks = get_mask_from_lengths( + output_lengths, max_len=mel_targets.size(1)) + else: + output_masks = None + + (LR_text_outputs, LR_emo_outputs, LR_spk_outputs, LR_length_rounded, + log_duration_predictions, pitch_predictions, + energy_predictions) = self.variance_adaptor( + text_hid, + emo_hid, + spk_hid, + masks=input_masks, + 
output_masks=output_masks, + duration_targets=duration_targets, + pitch_targets=pitch_targets, + energy_targets=energy_targets) + + if output_lengths is not None: + lfr_masks = self.get_lfr_mask_from_lengths( + output_lengths, max_len=LR_text_outputs.size(1)) + else: + output_masks = get_mask_from_lengths( + LR_length_rounded, max_len=LR_text_outputs.size(1)) + lfr_masks = None + + # LFR with the factor of outputs_per_step + LFR_text_inputs = LR_text_outputs.contiguous().view( + batch_size, -1, self.mel_decoder.r * text_hid.shape[-1]) + LFR_emo_inputs = LR_emo_outputs.contiguous().view( + batch_size, -1, + self.mel_decoder.r * emo_hid.shape[-1])[:, :, :emo_hid.shape[-1]] + LFR_spk_inputs = LR_spk_outputs.contiguous().view( + batch_size, -1, + self.mel_decoder.r * spk_hid.shape[-1])[:, :, :spk_hid.shape[-1]] + + memory = torch.cat([LFR_text_inputs, LFR_spk_inputs, LFR_emo_inputs], + dim=-1) + + if duration_targets is not None: + x_band_width = int( + duration_targets.float().masked_fill(input_masks, 0).max() + / self.mel_decoder.r + 0.5) + h_band_width = x_band_width + else: + x_band_width = int((torch.exp(log_duration_predictions) - 1).max() + / self.mel_decoder.r + 0.5) + h_band_width = x_band_width + + dec_outputs, pnca_x_attn_lst, pnca_h_attn_lst = self.mel_decoder( + memory, + x_band_width, + h_band_width, + target=mel_targets, + mask=lfr_masks, + return_attns=True) + + # De-LFR with the factor of outputs_per_step + dec_outputs = dec_outputs.contiguous().view(batch_size, -1, + self.mel_decoder.d_mel) + + if output_masks is not None: + dec_outputs = dec_outputs.masked_fill( + output_masks.unsqueeze(-1), 0) + + postnet_outputs = self.mel_postnet(dec_outputs, + output_masks) + dec_outputs + if output_masks is not None: + postnet_outputs = postnet_outputs.masked_fill( + output_masks.unsqueeze(-1), 0) + + res = { + 'x_band_width': x_band_width, + 'h_band_width': h_band_width, + 'enc_slf_attn_lst': enc_sla_attn_lst, + 'pnca_x_attn_lst': pnca_x_attn_lst, + 'pnca_h_attn_lst': pnca_h_attn_lst, + 'dec_outputs': dec_outputs, + 'postnet_outputs': postnet_outputs, + 'LR_length_rounded': LR_length_rounded, + 'log_duration_predictions': log_duration_predictions, + 'pitch_predictions': pitch_predictions, + 'energy_predictions': energy_predictions + } + + res['LR_text_outputs'] = LR_text_outputs + res['LR_emo_outputs'] = LR_emo_outputs + res['LR_spk_outputs'] = LR_spk_outputs + + return res diff --git a/modelscope/models/audio/tts/models/models/sambert/positions.py b/modelscope/models/audio/tts/models/models/sambert/positions.py new file mode 100644 index 00000000..9d1e375d --- /dev/null +++ b/modelscope/models/audio/tts/models/models/sambert/positions.py @@ -0,0 +1,101 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
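+# Sinusoidal position encodings: SinusoidalPositionEncoder adds fixed sin/cos
+# embeddings to encoder inputs, while DurSinusoidalPositionEncoder derives
+# frame-level positions from durations for the length-regulated decoder inputs.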
+ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class SinusoidalPositionEncoder(nn.Module): + + def __init__(self, max_len, depth): + super(SinusoidalPositionEncoder, self).__init__() + + self.max_len = max_len + self.depth = depth + self.position_enc = nn.Parameter( + self.get_sinusoid_encoding_table(max_len, depth).unsqueeze(0), + requires_grad=False) + + def forward(self, input): + bz_in, len_in, _ = input.size() + if len_in > self.max_len: + self.max_len = len_in + self.position_enc.data = self.get_sinusoid_encoding_table( + self.max_len, self.depth).unsqueeze(0).to(input.device) + + output = input + self.position_enc[:, :len_in, :].expand(bz_in, -1, -1) + + return output + + @staticmethod + def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): + """ Sinusoid position encoding table """ + + def cal_angle(position, hid_idx): + return position / np.power(10000, hid_idx / float(d_hid / 2 - 1)) + + def get_posi_angle_vec(position): + return [cal_angle(position, hid_j) for hid_j in range(d_hid // 2)] + + scaled_time_table = np.array( + [get_posi_angle_vec(pos_i + 1) for pos_i in range(n_position)]) + + sinusoid_table = np.zeros((n_position, d_hid)) + sinusoid_table[:, :d_hid // 2] = np.sin(scaled_time_table) + sinusoid_table[:, d_hid // 2:] = np.cos(scaled_time_table) + + if padding_idx is not None: + # zero vector for padding dimension + sinusoid_table[padding_idx] = 0.0 + + return torch.FloatTensor(sinusoid_table) + + +class DurSinusoidalPositionEncoder(nn.Module): + + def __init__(self, depth, outputs_per_step): + super(DurSinusoidalPositionEncoder, self).__init__() + + self.depth = depth + self.outputs_per_step = outputs_per_step + + inv_timescales = [ + np.power(10000, 2 * (hid_idx // 2) / depth) + for hid_idx in range(depth) + ] + self.inv_timescales = nn.Parameter( + torch.FloatTensor(inv_timescales), requires_grad=False) + + def forward(self, durations, masks=None): + reps = (durations + 0.5).long() + output_lens = reps.sum(dim=1) + max_len = output_lens.max() + reps_cumsum = torch.cumsum( + F.pad(reps.float(), (1, 0, 0, 0), value=0.0), dim=1)[:, None, :] + range_ = torch.arange(max_len).to(durations.device)[None, :, None] + mult = ((reps_cumsum[:, :, :-1] <= range_) + & (reps_cumsum[:, :, 1:] > range_)) # yapf:disable + mult = mult.float() + offsets = torch.matmul(mult, + reps_cumsum[:, + 0, :-1].unsqueeze(-1)).squeeze(-1) + dur_pos = range_[:, :, 0] - offsets + 1 + + if masks is not None: + assert masks.size(1) == dur_pos.size(1) + dur_pos = dur_pos.masked_fill(masks, 0.0) + + seq_len = dur_pos.size(1) + padding = self.outputs_per_step - int(seq_len) % self.outputs_per_step + if (padding < self.outputs_per_step): + dur_pos = F.pad(dur_pos, (0, padding, 0, 0), value=0.0) + + position_embedding = dur_pos[:, :, None] / self.inv_timescales[None, + None, :] + position_embedding[:, :, 0::2] = torch.sin(position_embedding[:, :, + 0::2]) + position_embedding[:, :, 1::2] = torch.cos(position_embedding[:, :, + 1::2]) + + return position_embedding diff --git a/modelscope/models/audio/tts/models/position.py b/modelscope/models/audio/tts/models/position.py deleted file mode 100755 index bca658dd..00000000 --- a/modelscope/models/audio/tts/models/position.py +++ /dev/null @@ -1,174 +0,0 @@ -"""Define position encoder classes.""" - -import abc -import math - -import tensorflow as tf - -from .reducer import SumReducer - - -class PositionEncoder(tf.keras.layers.Layer): - """Base class for position encoders.""" - - def __init__(self, 
reducer=None, **kwargs): - """Initializes the position encoder. - Args: - reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position - encodings. Defaults to :class:`opennmt.layers.SumReducer`. - **kwargs: Additional layer keyword arguments. - """ - super(PositionEncoder, self).__init__(**kwargs) - if reducer is None: - reducer = SumReducer(dtype=kwargs.get('dtype')) - self.reducer = reducer - - def call(self, inputs, position=None): # pylint: disable=arguments-differ - """Add position encodings to :obj:`inputs`. - Args: - inputs: The inputs to encode. - position: The single position to encode, to use when this layer is called - step by step. - Returns: - A ``tf.Tensor`` whose shape depends on the configured ``reducer``. - """ - batch_size = tf.shape(inputs)[0] - timesteps = tf.shape(inputs)[1] - input_dim = inputs.shape[-1].value - positions = tf.range(timesteps) + 1 if position is None else [position] - position_encoding = self._encode([positions], input_dim) - position_encoding = tf.tile(position_encoding, [batch_size, 1, 1]) - return self.reducer([inputs, position_encoding]) - - @abc.abstractmethod - def _encode(self, positions, depth): - """Creates position encodings. - Args: - positions: The positions to encode of shape :math:`[B, ...]`. - depth: The encoding depth :math:`D`. - Returns: - A ``tf.Tensor`` of shape :math:`[B, ..., D]`. - """ - raise NotImplementedError() - - -class PositionEmbedder(PositionEncoder): - """Encodes position with a lookup table.""" - - def __init__(self, maximum_position=128, reducer=None, **kwargs): - """Initializes the position encoder. - Args: - maximum_position: The maximum position to embed. Positions greater - than this value will be set to :obj:`maximum_position`. - reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position - encodings. Defaults to :class:`opennmt.layers.SumReducer`. - **kwargs: Additional layer keyword arguments. - """ - super(PositionEmbedder, self).__init__(reducer=reducer, **kwargs) - self.maximum_position = maximum_position - self.embedding = None - - def build(self, input_shape): - shape = [self.maximum_position + 1, input_shape[-1]] - self.embedding = self.add_weight('position_embedding', shape) - super(PositionEmbedder, self).build(input_shape) - - def _encode(self, positions, depth): - positions = tf.minimum(positions, self.maximum_position) - return tf.nn.embedding_lookup(self.embedding, positions) - - -class SinusoidalPositionEncoder(PositionEncoder): - """Encodes positions with sine waves as described in - https://arxiv.org/abs/1706.03762. 
- """ - - def _encode(self, positions, depth): - if depth % 2 != 0: - raise ValueError( - 'SinusoidalPositionEncoder expects the depth to be divisble ' - 'by 2 but got %d' % depth) - - batch_size = tf.shape(positions)[0] - positions = tf.cast(positions, tf.float32) - - log_timescale_increment = math.log(10000) / (depth / 2 - 1) - inv_timescales = tf.exp( - tf.range(depth / 2, dtype=tf.float32) * -log_timescale_increment) - inv_timescales = tf.reshape( - tf.tile(inv_timescales, [batch_size]), [batch_size, depth // 2]) - scaled_time = tf.expand_dims(positions, -1) * tf.expand_dims( - inv_timescales, 1) - encoding = tf.concat( - [tf.sin(scaled_time), tf.cos(scaled_time)], axis=2) - return tf.cast(encoding, self.dtype) - - -class SinusodalPositionalEncoding(tf.keras.layers.Layer): - - def __init__(self, name='SinusodalPositionalEncoding'): - super(SinusodalPositionalEncoding, self).__init__(name=name) - - @staticmethod - def positional_encoding(len, dim, step=1.): - """ - :param len: int scalar - :param dim: int scalar - :param step: - :return: position embedding - """ - pos_mat = tf.tile( - tf.expand_dims( - tf.range(0, tf.cast(len, dtype=tf.float32), dtype=tf.float32) - * step, - axis=-1), [1, dim]) - dim_mat = tf.tile( - tf.expand_dims( - tf.range(0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32), - axis=0), [len, 1]) - dim_mat_int = tf.cast(dim_mat, dtype=tf.int32) - pos_encoding = tf.where( # [time, dims] - tf.math.equal(tf.math.mod(dim_mat_int, 2), 0), - x=tf.math.sin( - pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))), - y=tf.math.cos(pos_mat - / tf.pow(10000., - (dim_mat - 1) / tf.cast(dim, tf.float32)))) - return pos_encoding - - -class BatchSinusodalPositionalEncoding(tf.keras.layers.Layer): - - def __init__(self, name='BatchSinusodalPositionalEncoding'): - super(BatchSinusodalPositionalEncoding, self).__init__(name=name) - - @staticmethod - def positional_encoding(batch_size, len, dim, pos_mat, step=1.): - """ - :param len: int scalar - :param dim: int scalar - :param step: - :param pos_mat: [B, len] = [len, 1] * dim - :return: position embedding - """ - pos_mat = tf.tile( - tf.expand_dims(tf.cast(pos_mat, dtype=tf.float32) * step, axis=-1), - [1, 1, dim]) # [B, len, dim] - - dim_mat = tf.tile( - tf.expand_dims( - tf.expand_dims( - tf.range( - 0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32), - axis=0), - axis=0), [batch_size, len, 1]) # [B, len, dim] - - dim_mat_int = tf.cast(dim_mat, dtype=tf.int32) - pos_encoding = tf.where( # [B, time, dims] - tf.math.equal(tf.mod(dim_mat_int, 2), 0), - x=tf.math.sin( - pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))), - y=tf.math.cos(pos_mat - / tf.pow(10000., - (dim_mat - 1) / tf.cast(dim, tf.float32)))) - return pos_encoding diff --git a/modelscope/models/audio/tts/models/reducer.py b/modelscope/models/audio/tts/models/reducer.py deleted file mode 100755 index a4c9ae17..00000000 --- a/modelscope/models/audio/tts/models/reducer.py +++ /dev/null @@ -1,155 +0,0 @@ -"""Define reducers: objects that merge inputs.""" - -import abc -import functools - -import tensorflow as tf - - -def pad_in_time(x, padding_length): - """Helper function to pad a tensor in the time dimension and retain the static depth dimension.""" - return tf.pad(x, [[0, 0], [0, padding_length], [0, 0]]) - - -def align_in_time(x, length): - """Aligns the time dimension of :obj:`x` with :obj:`length`.""" - time_dim = tf.shape(x)[1] - return tf.cond( - tf.less(time_dim, length), - true_fn=lambda: pad_in_time(x, length - time_dim), - false_fn=lambda: 
x[:, :length]) - - -def pad_with_identity(x, - sequence_length, - max_sequence_length, - identity_values=0, - maxlen=None): - """Pads a tensor with identity values up to :obj:`max_sequence_length`. - Args: - x: A ``tf.Tensor`` of shape ``[batch_size, time, depth]``. - sequence_length: The true sequence length of :obj:`x`. - max_sequence_length: The sequence length up to which the tensor must contain - :obj:`identity values`. - identity_values: The identity value. - maxlen: Size of the output time dimension. Default is the maximum value in - obj:`max_sequence_length`. - Returns: - A ``tf.Tensor`` of shape ``[batch_size, maxlen, depth]``. - """ - if maxlen is None: - maxlen = tf.reduce_max(max_sequence_length) - - mask = tf.sequence_mask(sequence_length, maxlen=maxlen, dtype=x.dtype) - mask = tf.expand_dims(mask, axis=-1) - mask_combined = tf.sequence_mask( - max_sequence_length, maxlen=maxlen, dtype=x.dtype) - mask_combined = tf.expand_dims(mask_combined, axis=-1) - - identity_mask = mask_combined * (1.0 - mask) - - x = pad_in_time(x, maxlen - tf.shape(x)[1]) - x = x * mask + (identity_mask * identity_values) - - return x - - -def pad_n_with_identity(inputs, sequence_lengths, identity_values=0): - """Pads each input tensors with identity values up to - ``max(sequence_lengths)`` for each batch. - Args: - inputs: A list of ``tf.Tensor``. - sequence_lengths: A list of sequence length. - identity_values: The identity value. - Returns: - A tuple ``(padded, max_sequence_length)`` which are respectively a list of - ``tf.Tensor`` where each tensor are padded with identity and the combined - sequence length. - """ - max_sequence_length = tf.reduce_max(sequence_lengths, axis=0) - maxlen = tf.reduce_max([tf.shape(x)[1] for x in inputs]) - padded = [ - pad_with_identity( - x, - length, - max_sequence_length, - identity_values=identity_values, - maxlen=maxlen) for x, length in zip(inputs, sequence_lengths) - ] - return padded, max_sequence_length - - -class Reducer(tf.keras.layers.Layer): - """Base class for reducers.""" - - def zip_and_reduce(self, x, y): - """Zips the :obj:`x` with :obj:`y` structures together and reduces all - elements. If the structures are nested, they will be flattened first. - Args: - x: The first structure. - y: The second structure. - Returns: - The same structure as :obj:`x` and :obj:`y` where each element from - :obj:`x` is reduced with the correspond element from :obj:`y`. - Raises: - ValueError: if the two structures are not the same. - """ - tf.nest.assert_same_structure(x, y) - x_flat = tf.nest.flatten(x) - y_flat = tf.nest.flatten(y) - reduced = list(map(self, zip(x_flat, y_flat))) - return tf.nest.pack_sequence_as(x, reduced) - - def call(self, inputs, sequence_length=None): # pylint: disable=arguments-differ - """Reduces all input elements. - Args: - inputs: A list of ``tf.Tensor``. - sequence_length: The length of each input, if reducing sequences. - Returns: - If :obj:`sequence_length` is set, a tuple - ``(reduced_input, reduced_length)``, otherwise a reduced ``tf.Tensor`` - only. 
- """ - if sequence_length is None: - return self.reduce(inputs) - else: - return self.reduce_sequence( - inputs, sequence_lengths=sequence_length) - - @abc.abstractmethod - def reduce(self, inputs): - """See :meth:`opennmt.layers.Reducer.__call__`.""" - raise NotImplementedError() - - @abc.abstractmethod - def reduce_sequence(self, inputs, sequence_lengths): - """See :meth:`opennmt.layers.Reducer.__call__`.""" - raise NotImplementedError() - - -class SumReducer(Reducer): - """A reducer that sums the inputs.""" - - def reduce(self, inputs): - if len(inputs) == 1: - return inputs[0] - if len(inputs) == 2: - return inputs[0] + inputs[1] - return tf.add_n(inputs) - - def reduce_sequence(self, inputs, sequence_lengths): - padded, combined_length = pad_n_with_identity( - inputs, sequence_lengths, identity_values=0) - return self.reduce(padded), combined_length - - -class MultiplyReducer(Reducer): - """A reducer that multiplies the inputs.""" - - def reduce(self, inputs): - return functools.reduce(lambda a, x: a * x, inputs) - - def reduce_sequence(self, inputs, sequence_lengths): - padded, combined_length = pad_n_with_identity( - inputs, sequence_lengths, identity_values=1) - return self.reduce(padded), combined_length diff --git a/modelscope/models/audio/tts/models/rnn_wrappers.py b/modelscope/models/audio/tts/models/rnn_wrappers.py deleted file mode 100755 index 6c487bab..00000000 --- a/modelscope/models/audio/tts/models/rnn_wrappers.py +++ /dev/null @@ -1,237 +0,0 @@ -import tensorflow as tf -from tensorflow.python.ops import rnn_cell_impl - -from .am_models import prenet - - -class VarPredictorCell(tf.contrib.rnn.RNNCell): - """Wrapper wrapper knock knock.""" - - def __init__(self, var_predictor_cell, is_training, dim, prenet_units): - super(VarPredictorCell, self).__init__() - self._var_predictor_cell = var_predictor_cell - self._is_training = is_training - self._dim = dim - self._prenet_units = prenet_units - - @property - def state_size(self): - return tuple([self.output_size, self._var_predictor_cell.state_size]) - - @property - def output_size(self): - return self._dim - - def zero_state(self, batch_size, dtype): - return tuple([ - rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, - dtype), - self._var_predictor_cell.zero_state(batch_size, dtype) - ]) - - def call(self, inputs, state): - """Run the Tacotron2 super decoder cell.""" - super_cell_out, decoder_state = state - - # split - prenet_input = inputs[:, 0:self._dim] - encoder_output = inputs[:, self._dim:] - - # prenet and concat - prenet_output = prenet( - prenet_input, - self._prenet_units, - self._is_training, - scope='var_prenet') - decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) - - # decoder LSTM/GRU - new_super_cell_out, new_decoder_state = self._var_predictor_cell( - decoder_input, decoder_state) - - # projection - new_super_cell_out = tf.layers.dense( - new_super_cell_out, units=self._dim) - - new_states = tuple([new_super_cell_out, new_decoder_state]) - - return new_super_cell_out, new_states - - -class DurPredictorCell(tf.contrib.rnn.RNNCell): - """Wrapper wrapper knock knock.""" - - def __init__(self, var_predictor_cell, is_training, dim, prenet_units): - super(DurPredictorCell, self).__init__() - self._var_predictor_cell = var_predictor_cell - self._is_training = is_training - self._dim = dim - self._prenet_units = prenet_units - - @property - def state_size(self): - return tuple([self.output_size, self._var_predictor_cell.state_size]) - - @property - def output_size(self): - return 
self._dim - - def zero_state(self, batch_size, dtype): - return tuple([ - rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, - dtype), - self._var_predictor_cell.zero_state(batch_size, dtype) - ]) - - def call(self, inputs, state): - """Run the Tacotron2 super decoder cell.""" - super_cell_out, decoder_state = state - - # split - prenet_input = inputs[:, 0:self._dim] - encoder_output = inputs[:, self._dim:] - - # prenet and concat - prenet_output = prenet( - prenet_input, - self._prenet_units, - self._is_training, - scope='dur_prenet') - decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) - - # decoder LSTM/GRU - new_super_cell_out, new_decoder_state = self._var_predictor_cell( - decoder_input, decoder_state) - - # projection - new_super_cell_out = tf.layers.dense( - new_super_cell_out, units=self._dim) - new_super_cell_out = tf.nn.relu(new_super_cell_out) - # new_super_cell_out = tf.log(tf.cast(tf.round(tf.exp(new_super_cell_out) - 1), tf.float32) + 1) - - new_states = tuple([new_super_cell_out, new_decoder_state]) - - return new_super_cell_out, new_states - - -class DurPredictorCECell(tf.contrib.rnn.RNNCell): - """Wrapper wrapper knock knock.""" - - def __init__(self, var_predictor_cell, is_training, dim, prenet_units, - max_dur, dur_embedding_dim): - super(DurPredictorCECell, self).__init__() - self._var_predictor_cell = var_predictor_cell - self._is_training = is_training - self._dim = dim - self._prenet_units = prenet_units - self._max_dur = max_dur - self._dur_embedding_dim = dur_embedding_dim - - @property - def state_size(self): - return tuple([self.output_size, self._var_predictor_cell.state_size]) - - @property - def output_size(self): - return self._max_dur - - def zero_state(self, batch_size, dtype): - return tuple([ - rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, - dtype), - self._var_predictor_cell.zero_state(batch_size, dtype) - ]) - - def call(self, inputs, state): - """Run the Tacotron2 super decoder cell.""" - super_cell_out, decoder_state = state - - # split - prenet_input = tf.squeeze( - tf.cast(inputs[:, 0:self._dim], tf.int32), axis=-1) # [N] - prenet_input = tf.one_hot( - prenet_input, self._max_dur, on_value=1.0, off_value=0.0, - axis=-1) # [N, 120] - prenet_input = tf.layers.dense( - prenet_input, units=self._dur_embedding_dim) - encoder_output = inputs[:, self._dim:] - - # prenet and concat - prenet_output = prenet( - prenet_input, - self._prenet_units, - self._is_training, - scope='dur_prenet') - decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) - - # decoder LSTM/GRU - new_super_cell_out, new_decoder_state = self._var_predictor_cell( - decoder_input, decoder_state) - - # projection - new_super_cell_out = tf.layers.dense( - new_super_cell_out, units=self._max_dur) # [N, 120] - new_super_cell_out = tf.nn.softmax(new_super_cell_out) # [N, 120] - - new_states = tuple([new_super_cell_out, new_decoder_state]) - - return new_super_cell_out, new_states - - -class VarPredictorCell2(tf.contrib.rnn.RNNCell): - """Wrapper wrapper knock knock.""" - - def __init__(self, var_predictor_cell, is_training, dim, prenet_units): - super(VarPredictorCell2, self).__init__() - self._var_predictor_cell = var_predictor_cell - self._is_training = is_training - self._dim = dim - self._prenet_units = prenet_units - - @property - def state_size(self): - return tuple([self.output_size, self._var_predictor_cell.state_size]) - - @property - def output_size(self): - return self._dim - - def zero_state(self, batch_size, dtype): - 
return tuple([ - rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, - dtype), - self._var_predictor_cell.zero_state(batch_size, dtype) - ]) - - def call(self, inputs, state): - '''Run the Tacotron2 super decoder cell.''' - super_cell_out, decoder_state = state - - # split - prenet_input = inputs[:, 0:self._dim] - encoder_output = inputs[:, self._dim:] - - # prenet and concat - prenet_output = prenet( - prenet_input, - self._prenet_units, - self._is_training, - scope='var_prenet') - decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) - - # decoder LSTM/GRU - new_super_cell_out, new_decoder_state = self._var_predictor_cell( - decoder_input, decoder_state) - - # projection - new_super_cell_out = tf.layers.dense( - new_super_cell_out, units=self._dim) - - # split and relu - new_super_cell_out = tf.concat([ - tf.nn.relu(new_super_cell_out[:, 0:1]), new_super_cell_out[:, 1:] - ], axis=-1) # yapf:disable - - new_states = tuple([new_super_cell_out, new_decoder_state]) - - return new_super_cell_out, new_states diff --git a/modelscope/models/audio/tts/models/robutrans.py b/modelscope/models/audio/tts/models/robutrans.py deleted file mode 100755 index ab9fdfcc..00000000 --- a/modelscope/models/audio/tts/models/robutrans.py +++ /dev/null @@ -1,760 +0,0 @@ -import tensorflow as tf -from tensorflow.python.ops.ragged.ragged_util import repeat - -from .fsmn_encoder import FsmnEncoderV2 -from .position import BatchSinusodalPositionalEncoding -from .self_attention_decoder import SelfAttentionDecoder -from .self_attention_encoder import SelfAttentionEncoder - - -class RobuTrans(): - - def __init__(self, hparams): - self._hparams = hparams - - def initialize(self, - inputs, - inputs_emotion, - inputs_speaker, - input_lengths, - output_lengths=None, - mel_targets=None, - durations=None, - pitch_contours=None, - uv_masks=None, - pitch_scales=None, - duration_scales=None, - energy_contours=None, - energy_scales=None): - """Initializes the model for inference. - - Sets "mel_outputs", "linear_outputs", "stop_token_outputs", and "alignments" fields. - - Args: - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of - steps in the input time series, and values are character IDs - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths - of each sequence in inputs. - output_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths - of each sequence in outputs. - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number - of steps in the output time series, M is num_mels, and values are entries in the mel - spectrogram. Only needed for training. 
- """ - from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell - from tensorflow.contrib.seq2seq import BasicDecoder - - with tf.variable_scope('inference') as _: - is_training = mel_targets is not None - batch_size = tf.shape(inputs)[0] - hp = self._hparams - - input_mask = None - if input_lengths is not None and is_training: - input_mask = tf.sequence_mask( - input_lengths, tf.shape(inputs)[1], dtype=tf.float32) - - if input_mask is not None: - inputs = inputs * tf.expand_dims(input_mask, -1) - - # speaker embedding - embedded_inputs_speaker = tf.layers.dense( - inputs_speaker, - 32, - activation=None, - use_bias=False, - kernel_initializer=tf.truncated_normal_initializer(stddev=0.5)) - - # emotion embedding - embedded_inputs_emotion = tf.layers.dense( - inputs_emotion, - 32, - activation=None, - use_bias=False, - kernel_initializer=tf.truncated_normal_initializer(stddev=0.5)) - - # symbol embedding - with tf.variable_scope('Embedding'): - embedded_inputs = tf.layers.dense( - inputs, - hp.embedding_dim, - activation=None, - use_bias=False, - kernel_initializer=tf.truncated_normal_initializer( - stddev=0.5)) - - # Encoder - with tf.variable_scope('Encoder'): - Encoder = SelfAttentionEncoder( - num_layers=hp.encoder_num_layers, - num_units=hp.encoder_num_units, - num_heads=hp.encoder_num_heads, - ffn_inner_dim=hp.encoder_ffn_inner_dim, - dropout=hp.encoder_dropout, - attention_dropout=hp.encoder_attention_dropout, - relu_dropout=hp.encoder_relu_dropout) - encoder_outputs, state_mo, sequence_length_mo, attns = Encoder.encode( - embedded_inputs, - sequence_length=input_lengths, - mode=is_training) - encoder_outputs = tf.layers.dense( - encoder_outputs, - hp.encoder_projection_units, - activation=None, - use_bias=False, - kernel_initializer=tf.truncated_normal_initializer( - stddev=0.5)) - - # pitch and energy - var_inputs = tf.concat([ - encoder_outputs, embedded_inputs_speaker, - embedded_inputs_emotion - ], 2) - if input_mask is not None: - var_inputs = var_inputs * tf.expand_dims(input_mask, -1) - - with tf.variable_scope('Pitch_Predictor'): - Pitch_Predictor_FSMN = FsmnEncoderV2( - filter_size=hp.predictor_filter_size, - fsmn_num_layers=hp.predictor_fsmn_num_layers, - dnn_num_layers=hp.predictor_dnn_num_layers, - num_memory_units=hp.predictor_num_memory_units, - ffn_inner_dim=hp.predictor_ffn_inner_dim, - dropout=hp.predictor_dropout, - shift=hp.predictor_shift, - position_encoder=None) - pitch_contour_outputs, _, _ = Pitch_Predictor_FSMN.encode( - tf.concat([ - encoder_outputs, embedded_inputs_speaker, - embedded_inputs_emotion - ], 2), - sequence_length=input_lengths, - mode=is_training) - pitch_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn( - LSTMBlockCell(hp.predictor_lstm_units), - LSTMBlockCell(hp.predictor_lstm_units), - pitch_contour_outputs, - sequence_length=input_lengths, - dtype=tf.float32) - pitch_contour_outputs = tf.concat( - pitch_contour_outputs, axis=-1) - pitch_contour_outputs = tf.layers.dense( - pitch_contour_outputs, units=1) # [N, T_in, 1] - pitch_contour_outputs = tf.squeeze( - pitch_contour_outputs, axis=2) # [N, T_in] - - with tf.variable_scope('Energy_Predictor'): - Energy_Predictor_FSMN = FsmnEncoderV2( - filter_size=hp.predictor_filter_size, - fsmn_num_layers=hp.predictor_fsmn_num_layers, - dnn_num_layers=hp.predictor_dnn_num_layers, - num_memory_units=hp.predictor_num_memory_units, - ffn_inner_dim=hp.predictor_ffn_inner_dim, - dropout=hp.predictor_dropout, - shift=hp.predictor_shift, - position_encoder=None) - energy_contour_outputs, _, _ = 
Energy_Predictor_FSMN.encode( - tf.concat([ - encoder_outputs, embedded_inputs_speaker, - embedded_inputs_emotion - ], 2), - sequence_length=input_lengths, - mode=is_training) - energy_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn( - LSTMBlockCell(hp.predictor_lstm_units), - LSTMBlockCell(hp.predictor_lstm_units), - energy_contour_outputs, - sequence_length=input_lengths, - dtype=tf.float32) - energy_contour_outputs = tf.concat( - energy_contour_outputs, axis=-1) - energy_contour_outputs = tf.layers.dense( - energy_contour_outputs, units=1) # [N, T_in, 1] - energy_contour_outputs = tf.squeeze( - energy_contour_outputs, axis=2) # [N, T_in] - - if is_training: - pitch_embeddings = tf.expand_dims( - pitch_contours, axis=2) # [N, T_in, 1] - pitch_embeddings = tf.layers.conv1d( - pitch_embeddings, - filters=hp.encoder_projection_units, - kernel_size=9, - padding='same', - name='pitch_embeddings') # [N, T_in, 32] - - energy_embeddings = tf.expand_dims( - energy_contours, axis=2) # [N, T_in, 1] - energy_embeddings = tf.layers.conv1d( - energy_embeddings, - filters=hp.encoder_projection_units, - kernel_size=9, - padding='same', - name='energy_embeddings') # [N, T_in, 32] - else: - pitch_contour_outputs *= pitch_scales - pitch_embeddings = tf.expand_dims( - pitch_contour_outputs, axis=2) # [N, T_in, 1] - pitch_embeddings = tf.layers.conv1d( - pitch_embeddings, - filters=hp.encoder_projection_units, - kernel_size=9, - padding='same', - name='pitch_embeddings') # [N, T_in, 32] - - energy_contour_outputs *= energy_scales - energy_embeddings = tf.expand_dims( - energy_contour_outputs, axis=2) # [N, T_in, 1] - energy_embeddings = tf.layers.conv1d( - energy_embeddings, - filters=hp.encoder_projection_units, - kernel_size=9, - padding='same', - name='energy_embeddings') # [N, T_in, 32] - - encoder_outputs_ = encoder_outputs + pitch_embeddings + energy_embeddings - - # duration - dur_inputs = tf.concat([ - encoder_outputs_, embedded_inputs_speaker, - embedded_inputs_emotion - ], 2) - if input_mask is not None: - dur_inputs = dur_inputs * tf.expand_dims(input_mask, -1) - with tf.variable_scope('Duration_Predictor'): - duration_predictor_cell = MultiRNNCell([ - LSTMBlockCell(hp.predictor_lstm_units), - LSTMBlockCell(hp.predictor_lstm_units) - ], state_is_tuple=True) # yapf:disable - from .rnn_wrappers import DurPredictorCell - duration_output_cell = DurPredictorCell( - duration_predictor_cell, is_training, 1, - hp.predictor_prenet_units) - duration_predictor_init_state = duration_output_cell.zero_state( - batch_size=batch_size, dtype=tf.float32) - if is_training: - from .helpers import VarTrainingHelper - duration_helper = VarTrainingHelper( - tf.expand_dims( - tf.log(tf.cast(durations, tf.float32) + 1), - axis=2), dur_inputs, 1) - else: - from .helpers import VarTestHelper - duration_helper = VarTestHelper(batch_size, dur_inputs, 1) - ( - duration_outputs, _ - ), final_duration_predictor_state, _ = tf.contrib.seq2seq.dynamic_decode( - BasicDecoder(duration_output_cell, duration_helper, - duration_predictor_init_state), - maximum_iterations=1000) - duration_outputs = tf.squeeze( - duration_outputs, axis=2) # [N, T_in] - if input_mask is not None: - duration_outputs = duration_outputs * input_mask - duration_outputs_ = tf.exp(duration_outputs) - 1 - - # Length Regulator - with tf.variable_scope('Length_Regulator'): - if is_training: - i = tf.constant(1) - # position embedding - j = tf.constant(1) - dur_len = tf.shape(durations)[-1] - embedded_position_i = tf.range(1, durations[0, 0] + 1) - - def 
condition_pos(j, e): - return tf.less(j, dur_len) - - def loop_body_pos(j, embedded_position_i): - embedded_position_i = tf.concat([ - embedded_position_i, - tf.range(1, durations[0, j] + 1) - ], axis=0) # yapf:disable - return [j + 1, embedded_position_i] - - j, embedded_position_i = tf.while_loop( - condition_pos, - loop_body_pos, [j, embedded_position_i], - shape_invariants=[ - j.get_shape(), - tf.TensorShape([None]) - ]) - embedded_position = tf.reshape(embedded_position_i, - (1, -1)) - - # others - LR_outputs = repeat( - encoder_outputs_[0:1, :, :], durations[0, :], axis=1) - embedded_outputs_speaker = repeat( - embedded_inputs_speaker[0:1, :, :], - durations[0, :], - axis=1) - embedded_outputs_emotion = repeat( - embedded_inputs_emotion[0:1, :, :], - durations[0, :], - axis=1) - - def condition(i, pos, layer, s, e): - return tf.less(i, tf.shape(mel_targets)[0]) - - def loop_body(i, embedded_position, LR_outputs, - embedded_outputs_speaker, - embedded_outputs_emotion): - # position embedding - jj = tf.constant(1) - embedded_position_i = tf.range(1, durations[i, 0] + 1) - - def condition_pos_i(j, e): - return tf.less(j, dur_len) - - def loop_body_pos_i(j, embedded_position_i): - embedded_position_i = tf.concat([ - embedded_position_i, - tf.range(1, durations[i, j] + 1) - ], axis=0) # yapf:disable - return [j + 1, embedded_position_i] - - jj, embedded_position_i = tf.while_loop( - condition_pos_i, - loop_body_pos_i, [jj, embedded_position_i], - shape_invariants=[ - jj.get_shape(), - tf.TensorShape([None]) - ]) - embedded_position = tf.concat([ - embedded_position, - tf.reshape(embedded_position_i, (1, -1)) - ], 0) - - # others - LR_outputs = tf.concat([ - LR_outputs, - repeat( - encoder_outputs_[i:i + 1, :, :], - durations[i, :], - axis=1) - ], 0) - embedded_outputs_speaker = tf.concat([ - embedded_outputs_speaker, - repeat( - embedded_inputs_speaker[i:i + 1, :, :], - durations[i, :], - axis=1) - ], 0) - embedded_outputs_emotion = tf.concat([ - embedded_outputs_emotion, - repeat( - embedded_inputs_emotion[i:i + 1, :, :], - durations[i, :], - axis=1) - ], 0) - return [ - i + 1, embedded_position, LR_outputs, - embedded_outputs_speaker, embedded_outputs_emotion - ] - - i, embedded_position, LR_outputs, - embedded_outputs_speaker, - embedded_outputs_emotion = tf.while_loop( - condition, - loop_body, [ - i, embedded_position, LR_outputs, - embedded_outputs_speaker, embedded_outputs_emotion - ], - shape_invariants=[ - i.get_shape(), - tf.TensorShape([None, None]), - tf.TensorShape([None, None, None]), - tf.TensorShape([None, None, None]), - tf.TensorShape([None, None, None]) - ], - parallel_iterations=hp.batch_size) - - ori_framenum = tf.shape(mel_targets)[1] - else: - # position - j = tf.constant(1) - dur_len = tf.shape(duration_outputs_)[-1] - embedded_position_i = tf.range( - 1, - tf.cast(tf.round(duration_outputs_)[0, 0], tf.int32) - + 1) - - def condition_pos(j, e): - return tf.less(j, dur_len) - - def loop_body_pos(j, embedded_position_i): - embedded_position_i = tf.concat([ - embedded_position_i, - tf.range( - 1, - tf.cast( - tf.round(duration_outputs_)[0, j], - tf.int32) + 1) - ], axis=0) # yapf:disable - return [j + 1, embedded_position_i] - - j, embedded_position_i = tf.while_loop( - condition_pos, - loop_body_pos, [j, embedded_position_i], - shape_invariants=[ - j.get_shape(), - tf.TensorShape([None]) - ]) - embedded_position = tf.reshape(embedded_position_i, - (1, -1)) - # others - duration_outputs_ *= duration_scales - LR_outputs = repeat( - encoder_outputs_[0:1, :, :], - 
tf.cast(tf.round(duration_outputs_)[0, :], tf.int32), - axis=1) - embedded_outputs_speaker = repeat( - embedded_inputs_speaker[0:1, :, :], - tf.cast(tf.round(duration_outputs_)[0, :], tf.int32), - axis=1) - embedded_outputs_emotion = repeat( - embedded_inputs_emotion[0:1, :, :], - tf.cast(tf.round(duration_outputs_)[0, :], tf.int32), - axis=1) - ori_framenum = tf.shape(LR_outputs)[1] - - left = hp.outputs_per_step - tf.mod( - ori_framenum, hp.outputs_per_step) - LR_outputs = tf.cond( - tf.equal(left, - hp.outputs_per_step), lambda: LR_outputs, - lambda: tf.pad(LR_outputs, [[0, 0], [0, left], [0, 0]], - 'CONSTANT')) - embedded_outputs_speaker = tf.cond( - tf.equal(left, hp.outputs_per_step), - lambda: embedded_outputs_speaker, lambda: tf.pad( - embedded_outputs_speaker, [[0, 0], [0, left], - [0, 0]], 'CONSTANT')) - embedded_outputs_emotion = tf.cond( - tf.equal(left, hp.outputs_per_step), - lambda: embedded_outputs_emotion, lambda: tf.pad( - embedded_outputs_emotion, [[0, 0], [0, left], - [0, 0]], 'CONSTANT')) - embedded_position = tf.cond( - tf.equal(left, hp.outputs_per_step), - lambda: embedded_position, - lambda: tf.pad(embedded_position, [[0, 0], [0, left]], - 'CONSTANT')) - - # Pos_Embedding - with tf.variable_scope('Position_Embedding'): - Pos_Embedding = BatchSinusodalPositionalEncoding() - position_embeddings = Pos_Embedding.positional_encoding( - batch_size, - tf.shape(LR_outputs)[1], hp.encoder_projection_units, - embedded_position) - LR_outputs += position_embeddings - - # multi-frame - LR_outputs = tf.reshape(LR_outputs, [ - batch_size, -1, - hp.outputs_per_step * hp.encoder_projection_units - ]) - embedded_outputs_speaker = tf.reshape( - embedded_outputs_speaker, - [batch_size, -1, hp.outputs_per_step * 32])[:, :, :32] - embedded_outputs_emotion = tf.reshape( - embedded_outputs_emotion, - [batch_size, -1, hp.outputs_per_step * 32])[:, :, :32] - # [N, T_out, D_LR_outputs] (D_LR_outputs = hp.outputs_per_step * hp.encoder_projection_units + 64) - LR_outputs = tf.concat([ - LR_outputs, embedded_outputs_speaker, embedded_outputs_emotion - ], -1) - - # auto bandwidth - if is_training: - durations_mask = tf.cast(durations, - tf.float32) * input_mask # [N, T_in] - else: - durations_mask = duration_outputs_ - X_band_width = tf.cast( - tf.round(tf.reduce_max(durations_mask) / hp.outputs_per_step), - tf.int32) - H_band_width = X_band_width - - with tf.variable_scope('Decoder'): - Decoder = SelfAttentionDecoder( - num_layers=hp.decoder_num_layers, - num_units=hp.decoder_num_units, - num_heads=hp.decoder_num_heads, - ffn_inner_dim=hp.decoder_ffn_inner_dim, - dropout=hp.decoder_dropout, - attention_dropout=hp.decoder_attention_dropout, - relu_dropout=hp.decoder_relu_dropout, - prenet_units=hp.prenet_units, - dense_units=hp.prenet_proj_units, - num_mels=hp.num_mels, - outputs_per_step=hp.outputs_per_step, - X_band_width=X_band_width, - H_band_width=H_band_width, - position_encoder=None) - if is_training: - if hp.free_run: - r = hp.outputs_per_step - init_decoder_input = tf.expand_dims( - tf.tile([[0.0]], [batch_size, hp.num_mels]), - axis=1) # [N, 1, hp.num_mels] - decoder_input_lengths = tf.cast( - output_lengths / r, tf.int32) - decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search( - init_decoder_input, - maximum_iterations=tf.shape(LR_outputs)[1], - mode=is_training, - memory=LR_outputs, - memory_sequence_length=decoder_input_lengths) - else: - r = hp.outputs_per_step - decoder_input = mel_targets[:, r - 1:: - r, :] # [N, T_out / r, hp.num_mels] - 
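# Illustrative sketch (not part of the deleted file): the teacher-forcing branch
# above keeps only every r-th target frame (r = hp.outputs_per_step), so each
# decoder step predicts r mel frames at once. Minimal NumPy check of that
# subsampling; numpy and the toy shapes are assumptions.
import numpy as np

r = 3
mel_targets_np = np.random.rand(2, 12, 80)            # [N, T_out, num_mels]
decoder_input_np = mel_targets_np[:, r - 1::r, :]     # frames 2, 5, 8, 11
assert decoder_input_np.shape == (2, 12 // r, 80)     # [N, T_out / r, num_mels]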
init_decoder_input = tf.expand_dims( - tf.tile([[0.0]], [batch_size, hp.num_mels]), - axis=1) # [N, 1, hp.num_mels] - decoder_input = tf.concat( - [init_decoder_input, decoder_input], - axis=1) # [N, T_out / r + 1, hp.num_mels] - decoder_input = decoder_input[:, : - -1, :] # [N, T_out / r, hp.num_mels] - decoder_input_lengths = tf.cast( - output_lengths / r, tf.int32) - decoder_outputs, attention_x, attention_h = Decoder.decode_from_inputs( - decoder_input, - decoder_input_lengths, - mode=is_training, - memory=LR_outputs, - memory_sequence_length=decoder_input_lengths) - else: - init_decoder_input = tf.expand_dims( - tf.tile([[0.0]], [batch_size, hp.num_mels]), - axis=1) # [N, 1, hp.num_mels] - decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search( - init_decoder_input, - maximum_iterations=tf.shape(LR_outputs)[1], - mode=is_training, - memory=LR_outputs, - memory_sequence_length=tf.expand_dims( - tf.shape(LR_outputs)[1], axis=0)) - - if is_training: - mel_outputs_ = tf.reshape(decoder_outputs, - [batch_size, -1, hp.num_mels]) - else: - mel_outputs_ = tf.reshape( - decoder_outputs, - [batch_size, -1, hp.num_mels])[:, :ori_framenum, :] - mel_outputs = mel_outputs_ - - with tf.variable_scope('Postnet'): - Postnet_FSMN = FsmnEncoderV2( - filter_size=hp.postnet_filter_size, - fsmn_num_layers=hp.postnet_fsmn_num_layers, - dnn_num_layers=hp.postnet_dnn_num_layers, - num_memory_units=hp.postnet_num_memory_units, - ffn_inner_dim=hp.postnet_ffn_inner_dim, - dropout=hp.postnet_dropout, - shift=hp.postnet_shift, - position_encoder=None) - if is_training: - postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode( - mel_outputs, - sequence_length=output_lengths, - mode=is_training) - hidden_lstm_outputs, _ = tf.nn.dynamic_rnn( - LSTMBlockCell(hp.postnet_lstm_units), - postnet_fsmn_outputs, - sequence_length=output_lengths, - dtype=tf.float32) - else: - postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode( - mel_outputs, - sequence_length=[tf.shape(mel_outputs_)[1]], - mode=is_training) - hidden_lstm_outputs, _ = tf.nn.dynamic_rnn( - LSTMBlockCell(hp.postnet_lstm_units), - postnet_fsmn_outputs, - sequence_length=[tf.shape(mel_outputs_)[1]], - dtype=tf.float32) - - mel_residual_outputs = tf.layers.dense( - hidden_lstm_outputs, units=hp.num_mels) - mel_outputs += mel_residual_outputs - - self.inputs = inputs - self.inputs_speaker = inputs_speaker - self.inputs_emotion = inputs_emotion - self.input_lengths = input_lengths - self.durations = durations - self.output_lengths = output_lengths - self.mel_outputs_ = mel_outputs_ - self.mel_outputs = mel_outputs - self.mel_targets = mel_targets - self.duration_outputs = duration_outputs - self.duration_outputs_ = duration_outputs_ - self.duration_scales = duration_scales - self.pitch_contour_outputs = pitch_contour_outputs - self.pitch_contours = pitch_contours - self.pitch_scales = pitch_scales - self.energy_contour_outputs = energy_contour_outputs - self.energy_contours = energy_contours - self.energy_scales = energy_scales - self.uv_masks_ = uv_masks - - self.embedded_inputs_emotion = embedded_inputs_emotion - self.embedding_fsmn_outputs = embedded_inputs - self.encoder_outputs = encoder_outputs - self.encoder_outputs_ = encoder_outputs_ - self.LR_outputs = LR_outputs - self.postnet_fsmn_outputs = postnet_fsmn_outputs - - self.pitch_embeddings = pitch_embeddings - self.energy_embeddings = energy_embeddings - - self.attns = attns - self.attention_x = attention_x - self.attention_h = attention_h - self.X_band_width = X_band_width - self.H_band_width 
= H_band_width - - def add_loss(self): - '''Adds loss to the model. Sets "loss" field. initialize must have been called.''' - with tf.variable_scope('loss') as _: - hp = self._hparams - mask = tf.sequence_mask( - self.output_lengths, - tf.shape(self.mel_targets)[1], - dtype=tf.float32) - valid_outputs = tf.reduce_sum(mask) - - mask_input = tf.sequence_mask( - self.input_lengths, - tf.shape(self.durations)[1], - dtype=tf.float32) - valid_inputs = tf.reduce_sum(mask_input) - - # mel loss - if self.uv_masks_ is not None: - valid_outputs_mask = tf.reduce_sum( - tf.expand_dims(mask, -1) * self.uv_masks_) - self.mel_loss_ = tf.reduce_sum( - tf.abs(self.mel_targets - self.mel_outputs_) - * tf.expand_dims(mask, -1) * self.uv_masks_) / ( - valid_outputs_mask * hp.num_mels) - self.mel_loss = tf.reduce_sum( - tf.abs(self.mel_targets - self.mel_outputs) - * tf.expand_dims(mask, -1) * self.uv_masks_) / ( - valid_outputs_mask * hp.num_mels) - else: - self.mel_loss_ = tf.reduce_sum( - tf.abs(self.mel_targets - self.mel_outputs_) - * tf.expand_dims(mask, -1)) / ( - valid_outputs * hp.num_mels) - self.mel_loss = tf.reduce_sum( - tf.abs(self.mel_targets - self.mel_outputs) - * tf.expand_dims(mask, -1)) / ( - valid_outputs * hp.num_mels) - - # duration loss - self.duration_loss = tf.reduce_sum( - tf.abs( - tf.log(tf.cast(self.durations, tf.float32) + 1) - - self.duration_outputs) * mask_input) / valid_inputs - - # pitch contour loss - self.pitch_contour_loss = tf.reduce_sum( - tf.abs(self.pitch_contours - self.pitch_contour_outputs) - * mask_input) / valid_inputs - - # energy contour loss - self.energy_contour_loss = tf.reduce_sum( - tf.abs(self.energy_contours - self.energy_contour_outputs) - * mask_input) / valid_inputs - - # final loss - self.loss = self.mel_loss_ + self.mel_loss + self.duration_loss \ - + self.pitch_contour_loss + self.energy_contour_loss - - # guided attention loss - self.guided_attention_loss = tf.constant(0.0) - if hp.guided_attention: - i0 = tf.constant(0) - loss0 = tf.constant(0.0) - - def c(i, _): - return tf.less(i, tf.shape(mel_targets)[0]) - - def loop_body(i, loss): - decoder_input_lengths = tf.cast( - self.output_lengths / hp.outputs_per_step, tf.int32) - input_len = decoder_input_lengths[i] - output_len = decoder_input_lengths[i] - input_w = tf.expand_dims( - tf.range(tf.cast(input_len, dtype=tf.float32)), - axis=1) / tf.cast( - input_len, dtype=tf.float32) # [T_in, 1] - output_w = tf.expand_dims( - tf.range(tf.cast(output_len, dtype=tf.float32)), - axis=0) / tf.cast( - output_len, dtype=tf.float32) # [1, T_out] - guided_attention_w = 1.0 - tf.exp( - -(1 / hp.guided_attention_2g_squared) - * tf.square(input_w - output_w)) # [T_in, T_out] - guided_attention_w = tf.expand_dims( - guided_attention_w, axis=0) # [1, T_in, T_out] - # [hp.decoder_num_heads, T_in, T_out] - guided_attention_w = tf.tile(guided_attention_w, - [hp.decoder_num_heads, 1, 1]) - loss_i = tf.constant(0.0) - for j in range(hp.decoder_num_layers): - loss_i += tf.reduce_mean( - self.attention_h[j][i, :, :input_len, :output_len] - * guided_attention_w) - - return [tf.add(i, 1), tf.add(loss, loss_i)] - - _, loss = tf.while_loop( - c, - loop_body, - loop_vars=[i0, loss0], - parallel_iterations=hp.batch_size) - self.guided_attention_loss = loss / hp.batch_size - self.loss += hp.guided_attention_loss_weight * self.guided_attention_loss - - def add_optimizer(self, global_step): - '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called. 
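# Illustrative sketch (not part of the deleted file): the guided-attention term
# above builds a soft diagonal penalty W[i, j] = 1 - exp(-(i/T_in - j/T_out)^2 / g2)
# and averages it against the decoder attention maps, pushing the alignment toward
# the diagonal. NumPy version of the weight matrix; numpy and the toy values of
# T_in, T_out and g2 are assumptions.
import numpy as np

T_in, T_out, g2 = 5, 8, 0.4                                      # g2 ~ hp.guided_attention_2g_squared
input_w = np.arange(T_in, dtype=np.float32)[:, None] / T_in      # [T_in, 1]
output_w = np.arange(T_out, dtype=np.float32)[None, :] / T_out   # [1, T_out]
guided_w = 1.0 - np.exp(-(1.0 / g2) * np.square(input_w - output_w))  # [T_in, T_out]
# guided_w is close to 0 near the diagonal i/T_in == j/T_out and grows toward 1 off it.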
- - Args: - global_step: int32 scalar Tensor representing current global step in training - ''' - with tf.variable_scope('optimizer') as _: - hp = self._hparams - if hp.decay_learning_rate: - self.learning_rate = _learning_rate_decay( - hp.initial_learning_rate, global_step) - else: - self.learning_rate = tf.convert_to_tensor( - hp.initial_learning_rate) - optimizer = tf.train.AdamOptimizer(self.learning_rate, - hp.adam_beta1, hp.adam_beta2) - gradients, variables = zip(*optimizer.compute_gradients(self.loss)) - self.gradients = gradients - clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0) - - # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See: - # https://github.com/tensorflow/tensorflow/issues/1122 - with tf.control_dependencies( - tf.get_collection(tf.GraphKeys.UPDATE_OPS)): - self.optimize = optimizer.apply_gradients( - zip(clipped_gradients, variables), global_step=global_step) - - -def _learning_rate_decay(init_lr, global_step): - # Noam scheme from tensor2tensor: - warmup_steps = 4000.0 - step = tf.cast(global_step + 1, dtype=tf.float32) - return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5, - step**-0.5) diff --git a/modelscope/models/audio/tts/models/self_attention_decoder.py b/modelscope/models/audio/tts/models/self_attention_decoder.py deleted file mode 100755 index 9cf3fcaa..00000000 --- a/modelscope/models/audio/tts/models/self_attention_decoder.py +++ /dev/null @@ -1,817 +0,0 @@ -"""Define self-attention decoder.""" - -import sys - -import tensorflow as tf - -from . import compat, transformer -from .am_models import decoder_prenet -from .position import SinusoidalPositionEncoder - - -class SelfAttentionDecoder(): - """Decoder using self-attention as described in - https://arxiv.org/abs/1706.03762. - """ - - def __init__(self, - num_layers, - num_units=512, - num_heads=8, - ffn_inner_dim=2048, - dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - prenet_units=256, - dense_units=128, - num_mels=80, - outputs_per_step=3, - X_band_width=None, - H_band_width=None, - position_encoder=SinusoidalPositionEncoder(), - self_attention_type='scaled_dot'): - """Initializes the parameters of the decoder. - - Args: - num_layers: The number of layers. - num_units: The number of hidden units. - num_heads: The number of heads in the multi-head attention. - ffn_inner_dim: The number of units of the inner linear transformation - in the feed forward layer. - dropout: The probability to drop units from the outputs. - attention_dropout: The probability to drop units from the attention. - relu_dropout: The probability to drop units from the ReLU activation in - the feed forward layer. - position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to - apply on inputs or ``None``. - self_attention_type: Type of self attention, "scaled_dot" or "average" (case - insensitive). - - Raises: - ValueError: if :obj:`self_attention_type` is invalid. 
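# Illustrative sketch (not part of the deleted file): _learning_rate_decay above is
# the Noam schedule -- roughly linear warm-up for 4000 steps, then inverse-sqrt
# decay. Plain-Python version for a quick sanity check; the init_lr value used
# here is an assumption.
def noam_lr(init_lr, global_step, warmup_steps=4000.0):
    step = float(global_step + 1)
    return init_lr * warmup_steps ** 0.5 * min(step * warmup_steps ** -1.5,
                                               step ** -0.5)

assert noam_lr(1e-3, 100) < noam_lr(1e-3, 3999)     # still warming up: lr rising
assert noam_lr(1e-3, 20000) < noam_lr(1e-3, 3999)   # past warm-up: lr decaying as 1/sqrt(step)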
- """ - super(SelfAttentionDecoder, self).__init__() - self.num_layers = num_layers - self.num_units = num_units - self.num_heads = num_heads - self.ffn_inner_dim = ffn_inner_dim - self.dropout = dropout - self.attention_dropout = attention_dropout - self.relu_dropout = relu_dropout - self.position_encoder = position_encoder - self.self_attention_type = self_attention_type.lower() - if self.self_attention_type not in ('scaled_dot', 'average'): - raise ValueError('invalid attention type %s' - % self.self_attention_type) - if self.self_attention_type == 'average': - tf.logging.warning( - 'Support for average attention network is experimental ' - 'and may change in future versions.') - self.prenet_units = prenet_units - self.dense_units = dense_units - self.num_mels = num_mels - self.outputs_per_step = outputs_per_step - self.X_band_width = X_band_width - self.H_band_width = H_band_width - - @property - def output_size(self): - """Returns the decoder output size.""" - return self.num_units - - @property - def support_alignment_history(self): - return True - - @property - def support_multi_source(self): - return True - - def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1): - cache = {} - - for layer in range(self.num_layers): - proj_cache_shape = [ - batch_size, self.num_heads, 0, self.num_units // self.num_heads - ] - layer_cache = {} - layer_cache['memory'] = [{ - 'memory_keys': - tf.zeros(proj_cache_shape, dtype=dtype), - 'memory_values': - tf.zeros(proj_cache_shape, dtype=dtype) - } for _ in range(num_sources)] - if self.self_attention_type == 'scaled_dot': - layer_cache['self_keys'] = tf.zeros( - proj_cache_shape, dtype=dtype) - layer_cache['self_values'] = tf.zeros( - proj_cache_shape, dtype=dtype) - elif self.self_attention_type == 'average': - layer_cache['prev_g'] = tf.zeros( - [batch_size, 1, self.num_units], dtype=dtype) - cache['layer_{}'.format(layer)] = layer_cache - - return cache - - def _init_attn(self, dtype=tf.float32): - attn = [] - for layer in range(self.num_layers): - attn.append(tf.TensorArray(tf.float32, size=0, dynamic_size=True)) - return attn - - def _self_attention_stack(self, - inputs, - sequence_length=None, - mode=True, - cache=None, - memory=None, - memory_sequence_length=None, - step=None): - - # [N, T_out, self.dense_units] or [N, 1, self.dense_units] - prenet_outputs = decoder_prenet(inputs, self.prenet_units, - self.dense_units, mode) - if step is None: - decoder_inputs = tf.concat( - [memory, prenet_outputs], - axis=-1) # [N, T_out, memory_size + self.dense_units] - else: - decoder_inputs = tf.concat( - [memory[:, step:step + 1, :], prenet_outputs], - axis=-1) # [N, 1, memory_size + self.dense_units] - decoder_inputs = tf.layers.dense( - decoder_inputs, units=self.dense_units) - - inputs = decoder_inputs - inputs *= self.num_units**0.5 - if self.position_encoder is not None: - inputs = self.position_encoder( - inputs, position=step + 1 if step is not None else None) - - inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) - - decoder_mask = None - memory_mask = None - # last_attention = None - - X_band_width_tmp = -1 - H_band_width_tmp = -1 - if self.X_band_width is not None: - X_band_width_tmp = tf.cast( - tf.cond( - tf.less(tf.shape(memory)[1], self.X_band_width), - lambda: -1, lambda: self.X_band_width), - dtype=tf.int64) - if self.H_band_width is not None: - H_band_width_tmp = tf.cast( - tf.cond( - tf.less(tf.shape(memory)[1], self.H_band_width), - lambda: -1, lambda: self.H_band_width), - dtype=tf.int64) - - if 
self.self_attention_type == 'scaled_dot': - if sequence_length is not None: - decoder_mask = transformer.build_future_mask( - sequence_length, - num_heads=self.num_heads, - maximum_length=tf.shape(inputs)[1], - band=X_band_width_tmp) # [N, 1, T_out, T_out] - elif self.self_attention_type == 'average': - if cache is None: - if sequence_length is None: - sequence_length = tf.fill([tf.shape(inputs)[0]], - tf.shape(inputs)[1]) - decoder_mask = transformer.cumulative_average_mask( - sequence_length, - maximum_length=tf.shape(inputs)[1], - dtype=inputs.dtype) - - if memory is not None and not tf.contrib.framework.nest.is_sequence( - memory): - memory = (memory, ) - if memory_sequence_length is not None: - if not tf.contrib.framework.nest.is_sequence( - memory_sequence_length): - memory_sequence_length = (memory_sequence_length, ) - if step is None: - memory_mask = [ - transformer.build_history_mask( - length, - num_heads=self.num_heads, - maximum_length=tf.shape(m)[1], - band=H_band_width_tmp) - for m, length in zip(memory, memory_sequence_length) - ] - else: - memory_mask = [ - transformer.build_history_mask( - length, - num_heads=self.num_heads, - maximum_length=tf.shape(m)[1], - band=H_band_width_tmp)[:, :, step:step + 1, :] - for m, length in zip(memory, memory_sequence_length) - ] - - # last_attention = None - attns_x = [] - attns_h = [] - for layer in range(self.num_layers): - layer_name = 'layer_{}'.format(layer) - layer_cache = cache[layer_name] if cache is not None else None - with tf.variable_scope(layer_name): - if memory is not None: - for i, (mem, mask) in enumerate(zip(memory, memory_mask)): - memory_cache = None - if layer_cache is not None: - memory_cache = layer_cache['memory'][i] - scope_name = 'multi_head_{}'.format(i) - if i == 0: - scope_name = 'multi_head' - with tf.variable_scope(scope_name): - encoded, attn_x, attn_h = transformer.multi_head_attention_PNCA( - self.num_heads, - transformer.norm(inputs), - mem, - mode, - num_units=self.num_units, - mask=decoder_mask, - mask_h=mask, - cache=layer_cache, - cache_h=memory_cache, - dropout=self.attention_dropout, - return_attention=True, - layer_name=layer_name, - X_band_width=self.X_band_width) - attns_x.append(attn_x) - attns_h.append(attn_h) - context = transformer.drop_and_add( - inputs, encoded, mode, dropout=self.dropout) - - with tf.variable_scope('ffn'): - transformed = transformer.feed_forward_ori( - transformer.norm(context), - self.ffn_inner_dim, - mode, - dropout=self.relu_dropout) - transformed = transformer.drop_and_add( - context, transformed, mode, dropout=self.dropout) - - inputs = transformed - - outputs = transformer.norm(inputs) - outputs = tf.layers.dense( - outputs, units=self.num_mels * self.outputs_per_step) - return outputs, attns_x, attns_h - - def decode_from_inputs(self, - inputs, - sequence_length, - initial_state=None, - mode=True, - memory=None, - memory_sequence_length=None): - outputs, attention_x, attention_h = self._self_attention_stack( - inputs, - sequence_length=sequence_length, - mode=mode, - memory=memory, - memory_sequence_length=memory_sequence_length) - return outputs, attention_x, attention_h - - def step_fn(self, - mode, - batch_size, - initial_state=None, - memory=None, - memory_sequence_length=None, - dtype=tf.float32): - if memory is None: - num_sources = 0 - elif tf.contrib.framework.nest.is_sequence(memory): - num_sources = len(memory) - else: - num_sources = 1 - cache = self._init_cache( - batch_size, dtype=dtype, num_sources=num_sources) - attention_x = 
self._init_attn(dtype=dtype) - attention_h = self._init_attn(dtype=dtype) - - def _fn(step, inputs, cache): - outputs, attention_x, attention_h = self._self_attention_stack( - inputs, - mode=mode, - cache=cache, - memory=memory, - memory_sequence_length=memory_sequence_length, - step=step) - attention_x_tmp = [] - for layer in range(len(attention_h)): - attention_x_tmp_l = tf.zeros_like(attention_h[layer]) - if self.X_band_width is not None: - pred = tf.less(step, self.X_band_width + 1) - attention_x_tmp_l_1 = tf.cond(pred, # yapf:disable - lambda: attention_x_tmp_l[:, :, :, :step + 1] + attention_x[layer], - lambda: tf.concat([ - attention_x_tmp_l[:, :, :, - :step - self.X_band_width], - attention_x_tmp_l[:, :, :, - step - self.X_band_width:step + 1] - + attention_x[layer]], - axis=-1)) # yapf:disable - attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:] - attention_x_tmp.append( - tf.concat([attention_x_tmp_l_1, attention_x_tmp_l_2], - axis=-1)) - else: - attention_x_tmp_l_1 = attention_x_tmp_l[:, :, :, :step + 1] - attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:] - attention_x_tmp.append( - tf.concat([ - attention_x_tmp_l_1 + attention_x[layer], - attention_x_tmp_l_2 - ], axis=-1)) # yapf:disable - attention_x = attention_x_tmp - return outputs, cache, attention_x, attention_h - - return _fn, cache, attention_x, attention_h - - def dynamic_decode_and_search(self, init_decoder_input, maximum_iterations, - mode, memory, memory_sequence_length): - batch_size = tf.shape(init_decoder_input)[0] - step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn( - mode, - batch_size, - memory=memory, - memory_sequence_length=memory_sequence_length) - - outputs, attention_x, attention_h, cache = self.dynamic_decode( - step_fn, - init_decoder_input, - init_cache=init_cache, - init_attn_x=init_attn_x, - init_attn_h=init_attn_h, - maximum_iterations=maximum_iterations, - batch_size=batch_size) - return outputs, attention_x, attention_h - - def dynamic_decode_and_search_teacher_forcing(self, decoder_input, - maximum_iterations, mode, - memory, - memory_sequence_length): - batch_size = tf.shape(decoder_input)[0] - step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn( - mode, - batch_size, - memory=memory, - memory_sequence_length=memory_sequence_length) - - outputs, attention_x, attention_h, cache = self.dynamic_decode_teacher_forcing( - step_fn, - decoder_input, - init_cache=init_cache, - init_attn_x=init_attn_x, - init_attn_h=init_attn_h, - maximum_iterations=maximum_iterations, - batch_size=batch_size) - return outputs, attention_x, attention_h - - def dynamic_decode(self, - step_fn, - init_decoder_input, - init_cache=None, - init_attn_x=None, - init_attn_h=None, - maximum_iterations=None, - batch_size=None): - - def _cond(step, cache, inputs, outputs, attention_x, attention_h): # pylint: disable=unused-argument - return tf.less(step, maximum_iterations) - - def _body(step, cache, inputs, outputs, attention_x, attention_h): - # output: [1, 1, num_mels * r] - # attn: [1, 1, T_out] - output, cache, attn_x, attn_h = step_fn( - step, inputs, cache) # outputs, cache, attention, attns - for layer in range(len(attention_x)): - attention_x[layer] = attention_x[layer].write( - step, tf.cast(attn_x[layer], tf.float32)) - - for layer in range(len(attention_h)): - attention_h[layer] = attention_h[layer].write( - step, tf.cast(attn_h[layer], tf.float32)) - - outputs = outputs.write(step, tf.cast(output, tf.float32)) - return step + 1, cache, output[:, :, -self. 
- num_mels:], outputs, attention_x, attention_h - - step = tf.constant(0, dtype=tf.int32) - outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True) - - _, cache, _, outputs, attention_x, attention_h = tf.while_loop( - _cond, - _body, - loop_vars=(step, init_cache, init_decoder_input, outputs, - init_attn_x, init_attn_h), - shape_invariants=(step.shape, - compat.nest.map_structure( - self._get_shape_invariants, init_cache), - compat.nest.map_structure( - self._get_shape_invariants, - init_decoder_input), tf.TensorShape(None), - compat.nest.map_structure( - self._get_shape_invariants, init_attn_x), - compat.nest.map_structure( - self._get_shape_invariants, init_attn_h)), - parallel_iterations=1, - back_prop=False, - maximum_iterations=maximum_iterations) - # element of outputs: [N, 1, num_mels * r] - outputs_stack = outputs.stack() # [T_out, N, 1, num_mels * r] - outputs_stack = tf.transpose( - outputs_stack, perm=[2, 1, 0, 3]) # [1, N, T_out, num_mels * r] - outputs_stack = tf.squeeze( - outputs_stack, axis=0) # [N, T_out, num_mels * r] - - attention_x_stack = [] - for layer in range(len(attention_x)): - attention_x_stack_tmp = attention_x[layer].stack( - ) # [T_out, N, H, 1, T_out] - attention_x_stack_tmp = tf.transpose( - attention_x_stack_tmp, perm=[3, 1, 2, 0, - 4]) # [1, N, H, T_out, T_out] - attention_x_stack_tmp = tf.squeeze( - attention_x_stack_tmp, axis=0) # [N, H, T_out, T_out] - attention_x_stack.append(attention_x_stack_tmp) - - attention_h_stack = [] - for layer in range(len(attention_h)): - attention_h_stack_tmp = attention_h[layer].stack( - ) # [T_out, N, H, 1, T_out] - attention_h_stack_tmp = tf.transpose( - attention_h_stack_tmp, perm=[3, 1, 2, 0, - 4]) # [1, N, H, T_out, T_out] - attention_h_stack_tmp = tf.squeeze( - attention_h_stack_tmp, axis=0) # [N, H, T_out, T_out] - attention_h_stack.append(attention_h_stack_tmp) - - return outputs_stack, attention_x_stack, attention_h_stack, cache - - def dynamic_decode_teacher_forcing(self, - step_fn, - decoder_input, - init_cache=None, - init_attn_x=None, - init_attn_h=None, - maximum_iterations=None, - batch_size=None): - - def _cond(step, cache, inputs, outputs, attention_x, attention_h): # pylint: disable=unused-argument - return tf.less(step, maximum_iterations) - - def _body(step, cache, inputs, outputs, attention_x, attention_h): - # output: [1, 1, num_mels * r] - # attn: [1, 1, T_out] - output, cache, attn_x, attn_h = step_fn( - step, inputs[:, step:step + 1, :], - cache) # outputs, cache, attention, attns - for layer in range(len(attention_x)): - attention_x[layer] = attention_x[layer].write( - step, tf.cast(attn_x[layer], tf.float32)) - - for layer in range(len(attention_h)): - attention_h[layer] = attention_h[layer].write( - step, tf.cast(attn_h[layer], tf.float32)) - outputs = outputs.write(step, tf.cast(output, tf.float32)) - return step + 1, cache, inputs, outputs, attention_x, attention_h - - step = tf.constant(0, dtype=tf.int32) - outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True) - - _, cache, _, outputs, attention_x, attention_h = tf.while_loop( - _cond, - _body, - loop_vars=(step, init_cache, decoder_input, outputs, init_attn_x, - init_attn_h), - shape_invariants=(step.shape, - compat.nest.map_structure( - self._get_shape_invariants, - init_cache), decoder_input.shape, - tf.TensorShape(None), - compat.nest.map_structure( - self._get_shape_invariants, init_attn_x), - compat.nest.map_structure( - self._get_shape_invariants, init_attn_h)), - parallel_iterations=1, - back_prop=False, - 
maximum_iterations=maximum_iterations) - # element of outputs: [N, 1, num_mels * r] - outputs_stack = outputs.stack() # [T_out, N, 1, num_mels * r] - outputs_stack = tf.transpose( - outputs_stack, perm=[2, 1, 0, 3]) # [1, N, T_out, num_mels * r] - outputs_stack = tf.squeeze( - outputs_stack, axis=0) # [N, T_out, num_mels * r] - - attention_x_stack = [] - for layer in range(len(attention_x)): - attention_x_stack_tmp = attention_x[layer].stack( - ) # [T_out, N, H, 1, T_out] - attention_x_stack_tmp = tf.transpose( - attention_x_stack_tmp, perm=[3, 1, 2, 0, - 4]) # [1, N, H, T_out, T_out] - attention_x_stack_tmp = tf.squeeze( - attention_x_stack_tmp, axis=0) # [N, H, T_out, T_out] - attention_x_stack.append(attention_x_stack_tmp) - - attention_h_stack = [] - for layer in range(len(attention_h)): - attention_h_stack_tmp = attention_h[layer].stack( - ) # [T_out, N, H, 1, T_out] - attention_h_stack_tmp = tf.transpose( - attention_h_stack_tmp, perm=[3, 1, 2, 0, - 4]) # [1, N, H, T_out, T_out] - attention_h_stack_tmp = tf.squeeze( - attention_h_stack_tmp, axis=0) # [N, H, T_out, T_out] - attention_h_stack.append(attention_h_stack_tmp) - - return outputs_stack, attention_x_stack, attention_h_stack, cache - - def _get_shape_invariants(self, tensor): - """Returns the shape of the tensor but sets middle dims to None.""" - if isinstance(tensor, tf.TensorArray): - shape = None - else: - shape = tensor.shape.as_list() - for i in range(1, len(shape) - 1): - shape[i] = None - return tf.TensorShape(shape) - - -class SelfAttentionDecoderOri(): - """Decoder using self-attention as described in - https://arxiv.org/abs/1706.03762. - """ - - def __init__(self, - num_layers, - num_units=512, - num_heads=8, - ffn_inner_dim=2048, - dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - position_encoder=SinusoidalPositionEncoder(), - self_attention_type='scaled_dot'): - """Initializes the parameters of the decoder. - - Args: - num_layers: The number of layers. - num_units: The number of hidden units. - num_heads: The number of heads in the multi-head attention. - ffn_inner_dim: The number of units of the inner linear transformation - in the feed forward layer. - dropout: The probability to drop units from the outputs. - attention_dropout: The probability to drop units from the attention. - relu_dropout: The probability to drop units from the ReLU activation in - the feed forward layer. - position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to - apply on inputs or ``None``. - self_attention_type: Type of self attention, "scaled_dot" or "average" (case - insensitive). - - Raises: - ValueError: if :obj:`self_attention_type` is invalid. 
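# Illustrative sketch (not part of the deleted file): the dynamic_decode helpers
# above collect one [N, 1, num_mels * r] frame per step in a TensorArray, then
# stack and reorder the result into [N, T_out, num_mels * r]. The same reshuffle
# in NumPy; numpy and the toy sizes are assumptions.
import numpy as np

T_out, N, D = 7, 2, 80 * 3                        # D = num_mels * outputs_per_step
stacked = np.zeros((T_out, N, 1, D))              # [T_out, N, 1, D] after .stack()
reordered = np.transpose(stacked, (2, 1, 0, 3))   # [1, N, T_out, D]
final = np.squeeze(reordered, axis=0)             # [N, T_out, D]
assert final.shape == (N, T_out, D)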
- """ - super(SelfAttentionDecoderOri, self).__init__() - self.num_layers = num_layers - self.num_units = num_units - self.num_heads = num_heads - self.ffn_inner_dim = ffn_inner_dim - self.dropout = dropout - self.attention_dropout = attention_dropout - self.relu_dropout = relu_dropout - self.position_encoder = position_encoder - self.self_attention_type = self_attention_type.lower() - if self.self_attention_type not in ('scaled_dot', 'average'): - raise ValueError('invalid attention type %s' - % self.self_attention_type) - if self.self_attention_type == 'average': - tf.logging.warning( - 'Support for average attention network is experimental ' - 'and may change in future versions.') - - @property - def output_size(self): - """Returns the decoder output size.""" - return self.num_units - - @property - def support_alignment_history(self): - return True - - @property - def support_multi_source(self): - return True - - def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1): - cache = {} - - for layer in range(self.num_layers): - proj_cache_shape = [ - batch_size, self.num_heads, 0, self.num_units // self.num_heads - ] - layer_cache = {} - layer_cache['memory'] = [{ - 'memory_keys': - tf.zeros(proj_cache_shape, dtype=dtype), - 'memory_values': - tf.zeros(proj_cache_shape, dtype=dtype) - } for _ in range(num_sources)] - if self.self_attention_type == 'scaled_dot': - layer_cache['self_keys'] = tf.zeros( - proj_cache_shape, dtype=dtype) - layer_cache['self_values'] = tf.zeros( - proj_cache_shape, dtype=dtype) - elif self.self_attention_type == 'average': - layer_cache['prev_g'] = tf.zeros( - [batch_size, 1, self.num_units], dtype=dtype) - cache['layer_{}'.format(layer)] = layer_cache - - return cache - - def _self_attention_stack(self, - inputs, - sequence_length=None, - mode=True, - cache=None, - memory=None, - memory_sequence_length=None, - step=None): - inputs *= self.num_units**0.5 - if self.position_encoder is not None: - inputs = self.position_encoder( - inputs, position=step + 1 if step is not None else None) - - inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) - - decoder_mask = None - memory_mask = None - last_attention = None - - if self.self_attention_type == 'scaled_dot': - if sequence_length is not None: - decoder_mask = transformer.build_future_mask( - sequence_length, - num_heads=self.num_heads, - maximum_length=tf.shape(inputs)[1]) - elif self.self_attention_type == 'average': - if cache is None: - if sequence_length is None: - sequence_length = tf.fill([tf.shape(inputs)[0]], - tf.shape(inputs)[1]) - decoder_mask = transformer.cumulative_average_mask( - sequence_length, - maximum_length=tf.shape(inputs)[1], - dtype=inputs.dtype) - - if memory is not None and not tf.contrib.framework.nest.is_sequence( - memory): - memory = (memory, ) - if memory_sequence_length is not None: - if not tf.contrib.framework.nest.is_sequence( - memory_sequence_length): - memory_sequence_length = (memory_sequence_length, ) - memory_mask = [ - transformer.build_sequence_mask( - length, - num_heads=self.num_heads, - maximum_length=tf.shape(m)[1]) - for m, length in zip(memory, memory_sequence_length) - ] - - for layer in range(self.num_layers): - layer_name = 'layer_{}'.format(layer) - layer_cache = cache[layer_name] if cache is not None else None - with tf.variable_scope(layer_name): - if self.self_attention_type == 'scaled_dot': - with tf.variable_scope('masked_multi_head'): - encoded = transformer.multi_head_attention( - self.num_heads, - transformer.norm(inputs), - None, 
- mode, - num_units=self.num_units, - mask=decoder_mask, - cache=layer_cache, - dropout=self.attention_dropout) - last_context = transformer.drop_and_add( - inputs, encoded, mode, dropout=self.dropout) - elif self.self_attention_type == 'average': - with tf.variable_scope('average_attention'): - # Cumulative average. - x = transformer.norm(inputs) - y = transformer.cumulative_average( - x, - decoder_mask if cache is None else step, - cache=layer_cache) - # FFN. - y = transformer.feed_forward( - y, - self.ffn_inner_dim, - mode, - dropout=self.relu_dropout) - # Gating layer. - z = tf.layers.dense( - tf.concat([x, y], -1), self.num_units * 2) - i, f = tf.split(z, 2, axis=-1) - y = tf.sigmoid(i) * x + tf.sigmoid(f) * y - last_context = transformer.drop_and_add( - inputs, y, mode, dropout=self.dropout) - - if memory is not None: - for i, (mem, mask) in enumerate(zip(memory, memory_mask)): - memory_cache = layer_cache['memory'][i] if layer_cache is not None else None # yapf:disable - with tf.variable_scope('multi_head' if i - == 0 else 'multi_head_%d' % i): # yapf:disable - context, last_attention = transformer.multi_head_attention( - self.num_heads, - transformer.norm(last_context), - mem, - mode, - mask=mask, - cache=memory_cache, - dropout=self.attention_dropout, - return_attention=True) - last_context = transformer.drop_and_add( - last_context, - context, - mode, - dropout=self.dropout) - if i > 0: # Do not return attention in case of multi source. - last_attention = None - - with tf.variable_scope('ffn'): - transformed = transformer.feed_forward_ori( - transformer.norm(last_context), - self.ffn_inner_dim, - mode, - dropout=self.relu_dropout) - transformed = transformer.drop_and_add( - last_context, transformed, mode, dropout=self.dropout) - - inputs = transformed - - if last_attention is not None: - # The first head of the last layer is returned. - first_head_attention = last_attention[:, 0] - else: - first_head_attention = None - - outputs = transformer.norm(inputs) - return outputs, first_head_attention - - def decode_from_inputs(self, - inputs, - sequence_length, - initial_state=None, - mode=True, - memory=None, - memory_sequence_length=None): - outputs, attention = self._self_attention_stack( - inputs, - sequence_length=sequence_length, - mode=mode, - memory=memory, - memory_sequence_length=memory_sequence_length) - return outputs, None, attention - - def step_fn(self, - mode, - batch_size, - initial_state=None, - memory=None, - memory_sequence_length=None, - dtype=tf.float32): - if memory is None: - num_sources = 0 - elif tf.contrib.framework.nest.is_sequence(memory): - num_sources = len(memory) - else: - num_sources = 1 - cache = self._init_cache( - batch_size, dtype=dtype, num_sources=num_sources) - - def _fn(step, inputs, cache, mode): - inputs = tf.expand_dims(inputs, 1) - outputs, attention = self._self_attention_stack( - inputs, - mode=mode, - cache=cache, - memory=memory, - memory_sequence_length=memory_sequence_length, - step=step) - outputs = tf.squeeze(outputs, axis=1) - if attention is not None: - attention = tf.squeeze(attention, axis=1) - return outputs, cache, attention - - return _fn, cache diff --git a/modelscope/models/audio/tts/models/self_attention_encoder.py b/modelscope/models/audio/tts/models/self_attention_encoder.py deleted file mode 100755 index ce4193dc..00000000 --- a/modelscope/models/audio/tts/models/self_attention_encoder.py +++ /dev/null @@ -1,182 +0,0 @@ -"""Define the self-attention encoder.""" - -import tensorflow as tf - -from . 
import transformer -from .position import SinusoidalPositionEncoder - - -class SelfAttentionEncoder(): - """Encoder using self-attention as described in - https://arxiv.org/abs/1706.03762. - """ - - def __init__(self, - num_layers, - num_units=512, - num_heads=8, - ffn_inner_dim=2048, - dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - position_encoder=SinusoidalPositionEncoder()): - """Initializes the parameters of the encoder. - - Args: - num_layers: The number of layers. - num_units: The number of hidden units. - num_heads: The number of heads in the multi-head attention. - ffn_inner_dim: The number of units of the inner linear transformation - in the feed forward layer. - dropout: The probability to drop units from the outputs. - attention_dropout: The probability to drop units from the attention. - relu_dropout: The probability to drop units from the ReLU activation in - the feed forward layer. - position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to - apply on inputs or ``None``. - """ - super(SelfAttentionEncoder, self).__init__() - self.num_layers = num_layers - self.num_units = num_units - self.num_heads = num_heads - self.ffn_inner_dim = ffn_inner_dim - self.dropout = dropout - self.attention_dropout = attention_dropout - self.relu_dropout = relu_dropout - self.position_encoder = position_encoder - - def encode(self, inputs, sequence_length=None, mode=True): - inputs *= self.num_units**0.5 - if self.position_encoder is not None: - inputs = self.position_encoder(inputs) - - inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) - mask = transformer.build_sequence_mask( - sequence_length, - num_heads=self.num_heads, - maximum_length=tf.shape(inputs)[1]) - - mask_FF = tf.squeeze( - transformer.build_sequence_mask( - sequence_length, maximum_length=tf.shape(inputs)[1]), - axis=1) - - state = () - - attns = [] - for layer in range(self.num_layers): - with tf.variable_scope('layer_{}'.format(layer)): - with tf.variable_scope('multi_head'): - context, attn = transformer.multi_head_attention( - self.num_heads, - transformer.norm(inputs), - None, - mode, - num_units=self.num_units, - mask=mask, - dropout=self.attention_dropout, - return_attention=True) - attns.append(attn) - context = transformer.drop_and_add( - inputs, context, mode, dropout=self.dropout) - - with tf.variable_scope('ffn'): - transformed = transformer.feed_forward( - transformer.norm(context), - self.ffn_inner_dim, - mode, - dropout=self.relu_dropout, - mask=mask_FF) - transformed = transformer.drop_and_add( - context, transformed, mode, dropout=self.dropout) - - inputs = transformed - state += (tf.reduce_mean(inputs, axis=1), ) - - outputs = transformer.norm(inputs) - return (outputs, state, sequence_length, attns) - - -class SelfAttentionEncoderOri(): - """Encoder using self-attention as described in - https://arxiv.org/abs/1706.03762. - """ - - def __init__(self, - num_layers, - num_units=512, - num_heads=8, - ffn_inner_dim=2048, - dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - position_encoder=SinusoidalPositionEncoder()): - """Initializes the parameters of the encoder. - - Args: - num_layers: The number of layers. - num_units: The number of hidden units. - num_heads: The number of heads in the multi-head attention. - ffn_inner_dim: The number of units of the inner linear transformation - in the feed forward layer. - dropout: The probability to drop units from the outputs. - attention_dropout: The probability to drop units from the attention. 
- relu_dropout: The probability to drop units from the ReLU activation in - the feed forward layer. - position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to - apply on inputs or ``None``. - """ - super(SelfAttentionEncoderOri, self).__init__() - self.num_layers = num_layers - self.num_units = num_units - self.num_heads = num_heads - self.ffn_inner_dim = ffn_inner_dim - self.dropout = dropout - self.attention_dropout = attention_dropout - self.relu_dropout = relu_dropout - self.position_encoder = position_encoder - - def encode(self, inputs, sequence_length=None, mode=True): - inputs *= self.num_units**0.5 - if self.position_encoder is not None: - inputs = self.position_encoder(inputs) - - inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) - mask = transformer.build_sequence_mask( - sequence_length, - num_heads=self.num_heads, - maximum_length=tf.shape(inputs)[1]) # [N, 1, 1, T_out] - - state = () - - attns = [] - for layer in range(self.num_layers): - with tf.variable_scope('layer_{}'.format(layer)): - with tf.variable_scope('multi_head'): - context, attn = transformer.multi_head_attention( - self.num_heads, - transformer.norm(inputs), - None, - mode, - num_units=self.num_units, - mask=mask, - dropout=self.attention_dropout, - return_attention=True) - attns.append(attn) - context = transformer.drop_and_add( - inputs, context, mode, dropout=self.dropout) - - with tf.variable_scope('ffn'): - transformed = transformer.feed_forward_ori( - transformer.norm(context), - self.ffn_inner_dim, - mode, - dropout=self.relu_dropout) - transformed = transformer.drop_and_add( - context, transformed, mode, dropout=self.dropout) - - inputs = transformed - state += (tf.reduce_mean(inputs, axis=1), ) - - outputs = transformer.norm(inputs) - return (outputs, state, sequence_length, attns) diff --git a/modelscope/models/audio/tts/models/transformer.py b/modelscope/models/audio/tts/models/transformer.py deleted file mode 100755 index a9f0bedc..00000000 --- a/modelscope/models/audio/tts/models/transformer.py +++ /dev/null @@ -1,1157 +0,0 @@ -"""Define layers related to the Google's Transformer model.""" - -import tensorflow as tf - -from . import compat, fsmn - - -def tile_sequence_length(sequence_length, num_heads): - """Tiles lengths :obj:`num_heads` times. - - Args: - sequence_length: The sequence length. - num_heads: The number of heads. - - Returns: - A ``tf.Tensor`` where each length is replicated :obj:`num_heads` times. - """ - sequence_length = tf.tile(sequence_length, [num_heads]) - sequence_length = tf.reshape(sequence_length, [num_heads, -1]) - sequence_length = tf.transpose(sequence_length, perm=[1, 0]) - sequence_length = tf.reshape(sequence_length, [-1]) - return sequence_length - - -def build_sequence_mask(sequence_length, - num_heads=None, - maximum_length=None, - dtype=tf.float32): - """Builds the dot product mask. - - Args: - sequence_length: The sequence length. - num_heads: The number of heads. - maximum_length: Optional size of the returned time dimension. Otherwise - it is the maximum of :obj:`sequence_length`. - dtype: The type of the mask tensor. - - Returns: - A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape - ``[batch_size, 1, 1, max_length]``. 
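# Illustrative example (not part of the deleted file): for lengths [2, 3] with
# maximum_length=3 and num_heads set, build_sequence_mask returns a mask that
# broadcasts as [batch_size, 1, 1, max_length]. NumPy stand-in for the TF call;
# numpy is an assumption.
import numpy as np

lengths = np.array([2, 3])
base = (np.arange(3)[None, :] < lengths[:, None]).astype(np.float32)  # [[1,1,0],[1,1,1]]
mask = base[:, None, None, :]                                         # [2, 1, 1, 3]
assert mask.shape == (2, 1, 1, 3)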
- """ - mask = tf.sequence_mask( - sequence_length, maxlen=maximum_length, dtype=dtype) - mask = tf.expand_dims(mask, axis=1) - if num_heads is not None: - mask = tf.expand_dims(mask, axis=1) - return mask - - -def build_sequence_mask_window(sequence_length, - left_window_size=-1, - right_window_size=-1, - num_heads=None, - maximum_length=None, - dtype=tf.float32): - """Builds the dot product mask. - - Args: - sequence_length: The sequence length. - num_heads: The number of heads. - maximum_length: Optional size of the returned time dimension. Otherwise - it is the maximum of :obj:`sequence_length`. - dtype: The type of the mask tensor. - - Returns: - A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape - ``[batch_size, 1, 1, max_length]``. - """ - sequence_mask = tf.sequence_mask( - sequence_length, maxlen=maximum_length, dtype=dtype) - mask = _window_mask( - sequence_length, - left_window_size=left_window_size, - right_window_size=right_window_size, - maximum_length=maximum_length, - dtype=dtype) - mask *= tf.expand_dims(sequence_mask, axis=1) - if num_heads is not None: - mask = tf.expand_dims(mask, axis=1) - return mask - - -def _lower_triangle_mask(sequence_length, - maximum_length=None, - dtype=tf.float32, - band=-1): - batch_size = tf.shape(sequence_length)[0] - if maximum_length is None: - maximum_length = tf.reduce_max(sequence_length) - mask = tf.ones([batch_size, maximum_length, maximum_length], dtype=dtype) - mask = compat.tf_compat( - v2='linalg.band_part', v1='matrix_band_part')(mask, band, 0) - return mask - - -def _higher_triangle_mask(sequence_length, - maximum_length=None, - dtype=tf.float32, - band=-1): - batch_size = tf.shape(sequence_length)[0] - if maximum_length is None: - maximum_length = tf.reduce_max(sequence_length) - mask = tf.ones([batch_size, maximum_length, maximum_length], dtype=dtype) - mask = compat.tf_compat( - v2='linalg.band_part', v1='matrix_band_part')(mask, 0, band) - return mask - - -def _window_mask(sequence_length, - left_window_size=-1, - right_window_size=-1, - maximum_length=None, - dtype=tf.float32): - batch_size = tf.shape(sequence_length)[0] - if maximum_length is None: - maximum_length = tf.reduce_max(sequence_length) - mask = tf.ones([batch_size, maximum_length, maximum_length], dtype=dtype) - left_window_size = tf.minimum( - tf.cast(left_window_size, tf.int64), - tf.cast(maximum_length - 1, tf.int64)) - right_window_size = tf.minimum( - tf.cast(right_window_size, tf.int64), - tf.cast(maximum_length - 1, tf.int64)) - mask = tf.matrix_band_part(mask, left_window_size, right_window_size) - return mask - - -def build_future_mask(sequence_length, - num_heads=None, - maximum_length=None, - dtype=tf.float32, - band=-1): - """Builds the dot product mask for future positions. - - Args: - sequence_length: The sequence length. - num_heads: The number of heads. - maximum_length: Optional size of the returned time dimension. Otherwise - it is the maximum of :obj:`sequence_length`. - dtype: The type of the mask tensor. - - Returns: - A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape - ``[batch_size, 1, max_length, max_length]``. 
- """ - sequence_mask = tf.sequence_mask( - sequence_length, maxlen=maximum_length, dtype=dtype) - mask = _lower_triangle_mask( - sequence_length, maximum_length=maximum_length, dtype=dtype, band=band) - mask *= tf.expand_dims(sequence_mask, axis=1) - if num_heads is not None: - mask = tf.expand_dims(mask, axis=1) - return mask - - -def build_history_mask(sequence_length, - num_heads=None, - maximum_length=None, - dtype=tf.float32, - band=-1): - """Builds the dot product mask for future positions. - - Args: - sequence_length: The sequence length. - num_heads: The number of heads. - maximum_length: Optional size of the returned time dimension. Otherwise - it is the maximum of :obj:`sequence_length`. - dtype: The type of the mask tensor. - - Returns: - A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape - ``[batch_size, 1, max_length, max_length]``. - """ - sequence_mask = tf.sequence_mask( - sequence_length, maxlen=maximum_length, dtype=dtype) - mask = _higher_triangle_mask( - sequence_length, maximum_length=maximum_length, dtype=dtype, band=band) - mask *= tf.expand_dims(sequence_mask, axis=1) - if num_heads is not None: - mask = tf.expand_dims(mask, axis=1) - return mask - - -def cumulative_average_mask(sequence_length, - maximum_length=None, - dtype=tf.float32): - """Builds the mask to compute the cumulative average as described in - https://arxiv.org/abs/1805.00631. - - Args: - sequence_length: The sequence length. - maximum_length: Optional size of the returned time dimension. Otherwise - it is the maximum of :obj:`sequence_length`. - dtype: The type of the mask tensor. - - Returns: - A ``tf.Tensor`` of type :obj:`dtype` and shape - ``[batch_size, max_length, max_length]``. - """ - sequence_mask = tf.sequence_mask( - sequence_length, maxlen=maximum_length, dtype=dtype) - mask = _lower_triangle_mask( - sequence_length, maximum_length=maximum_length, dtype=dtype) - mask *= tf.expand_dims(sequence_mask, axis=2) - weight = tf.range(1, tf.cast(tf.shape(mask)[1] + 1, dtype), dtype=dtype) - mask /= tf.expand_dims(weight, 1) - return mask - - -def cumulative_average(inputs, mask_or_step, cache=None): - """Computes the cumulative average as described in - https://arxiv.org/abs/1805.00631. - - Args: - inputs: The sequence to average. A tensor of shape :math:`[B, T, D]`. - mask_or_step: If :obj:`cache` is set, this is assumed to be the current step - of the dynamic decoding. Otherwise, it is the mask matrix used to compute - the cumulative average. - cache: A dictionnary containing the cumulative average of the previous step. - - Returns: - The cumulative average, a tensor of the same shape and type as :obj:`inputs`. - """ - if cache is not None: - step = tf.cast(mask_or_step, inputs.dtype) - aa = (inputs + step * cache['prev_g']) / (step + 1.0) - cache['prev_g'] = aa - return aa - else: - mask = mask_or_step - return tf.matmul(mask, inputs) - - -def fused_projection(inputs, num_units, num_outputs=1): - """Projects the same input into multiple output spaces. - - Args: - inputs: The inputs to project. - num_units: The number of output units of each space. - num_outputs: The number of output spaces. - - Returns: - :obj:`num_outputs` ``tf.Tensor`` of depth :obj:`num_units`. - """ - return tf.split( - tf.layers.conv1d(inputs, num_units * num_outputs, 1), - num_outputs, - axis=2) - - -def split_heads(inputs, num_heads): - """Splits a tensor in depth. - - Args: - inputs: A ``tf.Tensor`` of shape :math:`[B, T, D]`. - num_heads: The number of heads :math:`H`. 
- - Returns: - A ``tf.Tensor`` of shape :math:`[B, H, T, D / H]`. - """ - static_shape = inputs.get_shape().as_list() - depth = static_shape[-1] - outputs = tf.reshape(inputs, [ - tf.shape(inputs)[0], - tf.shape(inputs)[1], num_heads, depth // num_heads - ]) - outputs = tf.transpose(outputs, perm=[0, 2, 1, 3]) - return outputs - - -def combine_heads(inputs): - """Concatenates heads. - - Args: - inputs: A ``tf.Tensor`` of shape :math:`[B, H, T, D]`. - - Returns: - A ``tf.Tensor`` of shape :math:`[B, T, D * H]`. - """ - static_shape = inputs.get_shape().as_list() - depth = static_shape[-1] - num_heads = static_shape[1] - outputs = tf.transpose(inputs, perm=[0, 2, 1, 3]) - outputs = tf.reshape( - outputs, - [tf.shape(outputs)[0], - tf.shape(outputs)[1], depth * num_heads]) - return outputs - - -def dot_product_attention(queries, keys, values, mode, mask=None, dropout=0.0): - """Computes the dot product attention. - - Args: - queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. - keys: The sequence use to calculate attention scores. A tensor of shape - :math:`[B, T_2, ...]`. - values: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. - mode: A ``tf.estimator.ModeKeys`` mode. - mask: A ``tf.Tensor`` applied to the dot product. - dropout: The probability to drop units from the inputs. - - Returns: - A tuple ``(context vector, attention vector)``. - """ - dot = tf.matmul(queries, keys, transpose_b=True) - - if mask is not None: - dot = tf.cast( - tf.cast(dot, tf.float32) * mask + ((1.0 - mask) * tf.float32.min), - dot.dtype) - - softmax = tf.nn.softmax(tf.cast(dot, tf.float32)) - attn = tf.cast(softmax, dot.dtype) - drop_attn = tf.layers.dropout(attn, rate=dropout, training=mode) - - context = tf.matmul(drop_attn, values) - - return context, attn - - -def dot_product_attention_wpa(num_heads, - queries, - keys, - values, - mode, - attention_left_window=-1, - attention_right_window=0, - mask=None, - max_id_cache=None, - mono=False, - peak_delay=-1, - dropout=0.0): - """ - Computes the dot product attention. - Args: - queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. - keys: The sequence use to calculate attention scores. A tensor of shape - :math:`[B, T_2, ...]`. - values: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. - mode: A ``tf.estimator.ModeKeys`` mode. - mask: A ``tf.Tensor`` applied to the dot product. - dropout: The probability to drop units from the inputs. - - Returns: - A tuple ``(context vector, attention vector)``. - """ - # Dot product between queries and keys. 
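# Illustrative sketch (not part of the deleted file): dot_product_attention above
# is plain softmax(Q K^T) V, with masked positions pushed to float32.min before
# the softmax. NumPy equivalent without dropout; numpy and the toy shapes are
# assumptions.
import numpy as np

def np_dot_product_attention(q, k, v, mask=None):
    dot = q @ k.transpose(0, 2, 1)                            # [B, T_q, T_k]
    if mask is not None:
        dot = dot * mask + (1.0 - mask) * np.finfo(np.float32).min
    attn = np.exp(dot - dot.max(axis=-1, keepdims=True))      # stable softmax
    attn /= attn.sum(axis=-1, keepdims=True)
    return attn @ v, attn

q = np.random.rand(1, 2, 4)
k = np.random.rand(1, 3, 4)
v = np.random.rand(1, 3, 4)
ctx, attn = np_dot_product_attention(q, k, v)
assert ctx.shape == (1, 2, 4) and attn.shape == (1, 2, 3)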
- dot = tf.matmul(queries, keys, transpose_b=True) - depth = tf.shape(dot)[-1] - if mask is not None: - dot = tf.cast( - tf.cast(dot, tf.float32) * mask + ((1.0 - mask) * tf.float32.min), - dot.dtype) - # wpa - max_id = tf.math.argmax(input=dot, axis=-1) - # peak delay - if peak_delay > 0: - if max_id_cache is not None: - M = tf.cast(max_id_cache['pre_max_id'], dtype=max_id.dtype) - inputs_len = tf.math.minimum( - M + peak_delay, tf.cast(depth - 1, dtype=max_id.dtype)) - delay_mask = tf.sequence_mask( - inputs_len, maxlen=depth, dtype=tf.float32) - dot = tf.cast( - tf.cast(dot, tf.float32) * delay_mask - + ((1.0 - delay_mask) * tf.float32.min), dot.dtype) # yapf:disable - max_id = tf.math.argmax(input=dot, axis=-1) - # mono - if mono: - if max_id_cache is None: - d = tf.shape(max_id)[-1] - tmp_max_id = tf.reshape(max_id, [-1, num_heads, d]) - tmp_max_id = tf.slice( - tmp_max_id, [0, 0, 0], - [tf.shape(tmp_max_id)[0], - tf.shape(tmp_max_id)[1], d - 1]) - zeros = tf.zeros( - shape=(tf.shape(tmp_max_id)[0], tf.shape(tmp_max_id)[1], 1), - dtype=max_id.dtype) - tmp_max_id = tf.concat([zeros, tmp_max_id], axis=-1) - mask1 = tf.sequence_mask( - tmp_max_id, maxlen=depth, dtype=tf.float32) - dot = tf.cast( - tf.cast(dot, tf.float32) - * (1.0 - mask1) + mask1 * tf.float32.min, dot.dtype) # yapf:disable - max_id = tf.math.argmax(input=dot, axis=-1) - else: - # eval - tmp_max_id = tf.reshape(max_id, [-1, num_heads, 1]) - max_id_cache['pre_max_id'] = tmp_max_id - # right_mask - right_offset = tf.constant(attention_right_window, dtype=max_id.dtype) - right_len = tf.math.minimum(max_id + right_offset, - tf.cast(depth - 1, dtype=max_id.dtype)) - right_mask = tf.sequence_mask(right_len, maxlen=depth, dtype=tf.float32) - dot = tf.cast( - tf.cast(dot, tf.float32) * right_mask - + ((1.0 - right_mask) * tf.float32.min), dot.dtype) # yapf:disable - # left_mask - if attention_left_window > 0: - left_offset = tf.constant(attention_left_window, dtype=max_id.dtype) - left_len = tf.math.maximum(max_id - left_offset, - tf.cast(0, dtype=max_id.dtype)) - left_mask = tf.sequence_mask(left_len, maxlen=depth, dtype=tf.float32) - dot = tf.cast( - tf.cast(dot, tf.float32) * (1.0 - left_mask) - + (left_mask * tf.float32.min), dot.dtype) # yapf:disable - # Compute attention weights. - attn = tf.cast(tf.nn.softmax(tf.cast(dot, tf.float32)), dot.dtype) - drop_attn = tf.layers.dropout(attn, rate=dropout, training=mode) - - # Compute attention context. - context = tf.matmul(drop_attn, values) - - return context, attn - - -def multi_head_attention(num_heads, - queries, - memory, - mode, - num_units=None, - mask=None, - cache=None, - dropout=0.0, - return_attention=False): - """Computes the multi-head attention as described in - https://arxiv.org/abs/1706.03762. - - Args: - num_heads: The number of attention heads. - queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. - memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. - If ``None``, computes self-attention. - mode: A ``tf.estimator.ModeKeys`` mode. - num_units: The number of hidden units. If not set, it is set to the input - dimension. - mask: A ``tf.Tensor`` applied to the dot product. - cache: A dictionary containing pre-projected keys and values. - dropout: The probability to drop units from the inputs. - return_attention: Return the attention head probabilities in addition to the - context. - - Returns: - The concatenated attention context of each head and the attention - probabilities (if :obj:`return_attention` is set). 
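# Illustrative sketch (not part of the deleted file): multi_head_attention relies
# on the split_heads/combine_heads helpers above, which are a reshape + transpose
# round trip between [B, T, D] and [B, H, T, D/H]. NumPy check; numpy and the
# toy sizes are assumptions.
import numpy as np

B, T, D, H = 2, 5, 8, 4
x = np.random.rand(B, T, D)
heads = np.transpose(x.reshape(B, T, H, D // H), (0, 2, 1, 3))   # [B, H, T, D/H]
back = np.transpose(heads, (0, 2, 1, 3)).reshape(B, T, D)        # [B, T, D]
assert np.allclose(x, back)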
- """ - num_units = num_units or queries.get_shape().as_list()[-1] - - if num_units % num_heads != 0: - raise ValueError('Multi head attention requires that num_units is a' - ' multiple of {}'.format(num_heads)) - - if memory is None: - queries, keys, values = fused_projection( - queries, num_units, num_outputs=3) - - keys = split_heads(keys, num_heads) - values = split_heads(values, num_heads) - - if cache is not None: - keys = tf.concat([cache['self_keys'], keys], axis=2) - values = tf.concat([cache['self_values'], values], axis=2) - cache['self_keys'] = keys - cache['self_values'] = values - else: - queries = tf.layers.conv1d(queries, num_units, 1) - - if cache is not None: - - def _project_and_split(): - k, v = fused_projection(memory, num_units, num_outputs=2) - return split_heads(k, num_heads), split_heads(v, num_heads) - - keys, values = tf.cond( - tf.equal(tf.shape(cache['memory_keys'])[2], 0), - true_fn=_project_and_split, - false_fn=lambda: - (cache['memory_keys'], cache['memory_values'])) - cache['memory_keys'] = keys - cache['memory_values'] = values - else: - keys, values = fused_projection(memory, num_units, num_outputs=2) - keys = split_heads(keys, num_heads) - values = split_heads(values, num_heads) - - queries = split_heads(queries, num_heads) - queries *= (num_units // num_heads)**-0.5 - - heads, attn = dot_product_attention( - queries, keys, values, mode, mask=mask, dropout=dropout) - - # Concatenate all heads output. - combined = combine_heads(heads) - outputs = tf.layers.conv1d(combined, num_units, 1) - - if not return_attention: - return outputs - return outputs, attn - - -def multi_head_attention_PNCA(num_heads, - queries, - memory, - mode, - num_units=None, - mask=None, - mask_h=None, - cache=None, - cache_h=None, - dropout=0.0, - return_attention=False, - X_band_width=None, - layer_name='multi_head'): - """Computes the multi-head attention as described in - https://arxiv.org/abs/1706.03762. - - Args: - num_heads: The number of attention heads. - queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. - memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. - If ``None``, computes self-attention. - mode: A ``tf.estimator.ModeKeys`` mode. - num_units: The number of hidden units. If not set, it is set to the input - dimension. - mask: A ``tf.Tensor`` applied to the dot product. - cache: A dictionary containing pre-projected keys and values. - dropout: The probability to drop units from the inputs. - return_attention: Return the attention head probabilities in addition to the - context. - - Returns: - The concatenated attention context of each head and the attention - probabilities (if :obj:`return_attention` is set). 
- """ - num_units = num_units or queries.get_shape().as_list()[-1] - - if num_units % num_heads != 0: - raise ValueError('Multi head attention requires that num_units is a' - ' multiple of {}'.format(num_heads)) - - # X - queries, keys, values = fused_projection(queries, num_units, num_outputs=3) - - keys = split_heads(keys, num_heads) - values = split_heads(values, num_heads) - - if cache is not None: - keys = tf.concat([cache['self_keys'], keys], axis=2) - values = tf.concat([cache['self_values'], values], axis=2) - if X_band_width is not None: - keys_band = tf.cond( - tf.less(X_band_width, 0), lambda: keys, lambda: tf.cond( - tf.less(tf.shape(keys)[2], X_band_width), lambda: keys, - lambda: keys[:, :, -X_band_width:, :]) - ) # not support X_band_width == 0 - values_band = tf.cond( - tf.less(X_band_width, 0), lambda: values, lambda: tf.cond( - tf.less(tf.shape(values)[2], X_band_width), lambda: values, - lambda: values[:, :, -X_band_width:, :])) - cache['self_keys'] = keys_band - cache['self_values'] = values_band - else: - cache['self_keys'] = keys - cache['self_values'] = values - - queries = split_heads(queries, num_heads) - queries *= (num_units // num_heads)**-0.5 - - heads, attn = dot_product_attention( - queries, keys, values, mode, mask=mask, dropout=dropout) - - # Concatenate all heads output. - combined = combine_heads(heads) - outputs = tf.layers.conv1d(combined, num_units, 1) - - # H - if cache_h is not None: - - def _project_and_split(): - k, v = fused_projection(memory, num_units, num_outputs=2) - return split_heads(k, num_heads), split_heads(v, num_heads) - - keys_h, values_h = tf.cond( - tf.equal(tf.shape(cache_h['memory_keys'])[2], 0), - true_fn=_project_and_split, - false_fn=lambda: - (cache_h['memory_keys'], cache_h['memory_values'])) - cache_h['memory_keys'] = keys_h - cache_h['memory_values'] = values_h - else: - keys_h, values_h = fused_projection(memory, num_units, num_outputs=2) - keys_h = split_heads(keys_h, num_heads) - values_h = split_heads(values_h, num_heads) - - heads_h, attn_h = dot_product_attention( - queries, keys_h, values_h, mode, mask=mask_h, dropout=dropout) - - # Concatenate all heads output. - combined_h = combine_heads(heads_h) - outputs_h = tf.layers.conv1d(combined_h, num_units, 1) - - # ADD - outputs = outputs + outputs_h - - # RETURN - return outputs, attn, attn_h - - -def multi_head_attention_memory(num_heads, - queries, - memory, - mode, - num_memory=None, - num_units=None, - mask=None, - cache=None, - dropout=0.0, - return_attention=False): - """Computes the multi-head attention as described in - https://arxiv.org/abs/1706.03762. - - Args: - num_heads: The number of attention heads. - queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. - memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. - If ``None``, computes self-attention. - mode: A ``tf.estimator.ModeKeys`` mode. - num_units: The number of hidden units. If not set, it is set to the input - dimension. - mask: A ``tf.Tensor`` applied to the dot product. - cache: A dictionary containing pre-projected keys and values. - dropout: The probability to drop units from the inputs. - return_attention: Return the attention head probabilities in addition to the - context. - - Returns: - The concatenated attention context of each head and the attention - probabilities (if :obj:`return_attention` is set). 
- """ - num_units = num_units or queries.get_shape().as_list()[-1] - - if num_units % num_heads != 0: - raise ValueError('Multi head attention requires that num_units is a' - ' multiple of {}'.format(num_heads)) - - # PERSISTENT MEMORY - # key memory - if num_memory is not None: - key_m = tf.get_variable( - 'key_m', - shape=[num_memory, num_units], - initializer=tf.glorot_uniform_initializer(), - dtype=tf.float32) - # value memory - value_m = tf.get_variable( - 'value_m', - shape=[num_memory, num_units], - initializer=tf.glorot_uniform_initializer(), - dtype=tf.float32) - if memory is None: - queries, keys, values = fused_projection( - queries, num_units, num_outputs=3) - - # concat memory - if num_memory is not None: - key_m_expand = tf.tile( - tf.expand_dims(key_m, 0), [tf.shape(keys)[0], 1, 1]) - value_m_expand = tf.tile( - tf.expand_dims(value_m, 0), [tf.shape(values)[0], 1, 1]) - keys = tf.concat([key_m_expand, keys], axis=1) - values = tf.concat([value_m_expand, values], axis=1) - - keys = split_heads(keys, num_heads) - values = split_heads(values, num_heads) - - if cache is not None: - keys = tf.concat([cache['self_keys'], keys], axis=2) - values = tf.concat([cache['self_values'], values], axis=2) - cache['self_keys'] = keys - cache['self_values'] = values - else: - queries = tf.layers.conv1d(queries, num_units, 1) - - if cache is not None: - - def _project_and_split(): - k, v = fused_projection(memory, num_units, num_outputs=2) - return split_heads(k, num_heads), split_heads(v, num_heads) - - keys, values = tf.cond( - tf.equal(tf.shape(cache['memory_keys'])[2], 0), - true_fn=_project_and_split, - false_fn=lambda: - (cache['memory_keys'], cache['memory_values'])) - cache['memory_keys'] = keys - cache['memory_values'] = values - else: - keys, values = fused_projection(memory, num_units, num_outputs=2) - keys = split_heads(keys, num_heads) - values = split_heads(values, num_heads) - - queries = split_heads(queries, num_heads) - queries *= (num_units // num_heads)**-0.5 - - heads, attn = dot_product_attention( - queries, keys, values, mode, mask=mask, dropout=dropout) - - # Concatenate all heads output. - combined = combine_heads(heads) - outputs = tf.layers.conv1d(combined, num_units, 1) - - if not return_attention: - return outputs - return outputs, attn - - -def Ci_Cd_Memory(num_heads, - queries, - mode, - filter_size=None, - num_memory=None, - num_units=None, - fsmn_mask=None, - san_mask=None, - cache=None, - shift=None, - dropout=0.0, - return_attention=False): - """ - Args: - num_heads: The number of attention heads. - queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. - memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. - If ``None``, computes self-attention. - mode: A ``tf.estimator.ModeKeys`` mode. - num_units: The number of hidden units. If not set, it is set to the input - dimension. - mask: A ``tf.Tensor`` applied to the dot product. - cache: A dictionary containing pre-projected keys and values. - dropout: The probability to drop units from the inputs. - return_attention: Return the attention head probabilities in addition to the - context. - - Returns: - The concatenated attention context of each head and the attention - probabilities (if :obj:`return_attention` is set). 
- """ - num_units = num_units or queries.get_shape().as_list()[-1] - - if num_units % num_heads != 0: - raise ValueError('Multi head attention requires that num_units is a' - ' multiple of {}'.format(num_heads)) - # PERSISTENT MEMORY - if num_memory is not None: - key_m = tf.get_variable( - 'key_m', - shape=[num_memory, num_units], - initializer=tf.glorot_uniform_initializer(), - dtype=tf.float32) - value_m = tf.get_variable( - 'value_m', - shape=[num_memory, num_units], - initializer=tf.glorot_uniform_initializer(), - dtype=tf.float32) - - queries, keys, values = fused_projection(queries, num_units, num_outputs=3) - # fsmn memory block - if shift is not None: - # encoder - fsmn_memory = fsmn.MemoryBlockV2( - values, - filter_size, - mode, - shift=shift, - mask=fsmn_mask, - dropout=dropout) - else: - # decoder - fsmn_memory = fsmn.UniMemoryBlock( - values, - filter_size, - mode, - cache=cache, - mask=fsmn_mask, - dropout=dropout) - - # concat persistent memory - if num_memory is not None: - key_m_expand = tf.tile( - tf.expand_dims(key_m, 0), [tf.shape(keys)[0], 1, 1]) - value_m_expand = tf.tile( - tf.expand_dims(value_m, 0), [tf.shape(values)[0], 1, 1]) - keys = tf.concat([key_m_expand, keys], axis=1) - values = tf.concat([value_m_expand, values], axis=1) - - keys = split_heads(keys, num_heads) - values = split_heads(values, num_heads) - - if cache is not None: - keys = tf.concat([cache['self_keys'], keys], axis=2) - values = tf.concat([cache['self_values'], values], axis=2) - cache['self_keys'] = keys - cache['self_values'] = values - - queries = split_heads(queries, num_heads) - queries *= (num_units // num_heads)**-0.5 - - heads, attn = dot_product_attention( - queries, keys, values, mode, mask=san_mask, dropout=dropout) - - # Concatenate all heads output. - combined = combine_heads(heads) - outputs = tf.layers.conv1d(combined, num_units, 1) - outputs = outputs + fsmn_memory - - if not return_attention: - return outputs - return outputs, attn - - -def multi_head_attention_wpa(num_heads, - queries, - memory, - mode, - attention_left_window=-1, - attention_right_window=0, - num_units=None, - mask=None, - cache=None, - max_id_cache=None, - dropout=0.0, - mono=False, - peak_delay=-1, - return_attention=False): - """Computes the multi-head attention as described in - https://arxiv.org/abs/1706.03762. - - Args: - num_heads: The number of attention heads. - queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. - memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. - If ``None``, computes self-attention. - mode: A ``tf.estimator.ModeKeys`` mode. - num_units: The number of hidden units. If not set, it is set to the input - dimension. - mask: A ``tf.Tensor`` applied to the dot product. - cache: A dictionary containing pre-projected keys and values. - dropout: The probability to drop units from the inputs. - return_attention: Return the attention head probabilities in addition to the - context. - - Returns: - The concatenated attention context of each head and the attention - probabilities (if :obj:`return_attention` is set). 
- """ - num_units = num_units or queries.get_shape().as_list()[-1] - - if num_units % num_heads != 0: - raise ValueError('Multi head attention requires that num_units is a' - ' multiple of {}'.format(num_heads)) - - if memory is None: - queries, keys, values = fused_projection( - queries, num_units, num_outputs=3) - - keys = split_heads(keys, num_heads) - values = split_heads(values, num_heads) - - if cache is not None: - keys = tf.concat([cache['self_keys'], keys], axis=2) - values = tf.concat([cache['self_values'], values], axis=2) - cache['self_keys'] = keys - cache['self_values'] = values - else: - queries = tf.layers.conv1d(queries, num_units, 1) - - if cache is not None: - - def _project_and_split(): - k, v = fused_projection(memory, num_units, num_outputs=2) - return split_heads(k, num_heads), split_heads(v, num_heads) - - keys, values = tf.cond( - tf.equal(tf.shape(cache['memory_keys'])[2], 0), - true_fn=_project_and_split, - false_fn=lambda: - (cache['memory_keys'], cache['memory_values'])) - cache['memory_keys'] = keys - cache['memory_values'] = values - else: - keys, values = fused_projection(memory, num_units, num_outputs=2) - keys = split_heads(keys, num_heads) - values = split_heads(values, num_heads) - - queries = split_heads(queries, num_heads) - queries *= (num_units // num_heads)**-0.5 - - heads, attn = dot_product_attention_wpa( - num_heads, - queries, - keys, - values, - mode, - attention_left_window=attention_left_window, - attention_right_window=attention_right_window, - mask=mask, - max_id_cache=max_id_cache, - mono=mono, - peak_delay=peak_delay, - dropout=dropout) - - # Concatenate all heads output. - combined = combine_heads(heads) - outputs = tf.layers.conv1d(combined, num_units, 1) - - if not return_attention: - return outputs - return outputs, attn - - -def feed_forward(x, inner_dim, mode, dropout=0.0, mask=None): - """Implements the Transformer's "Feed Forward" layer. - - .. math:: - - ffn(x) = max(0, x*W_1 + b_1)*W_2 + b_2 - - Args: - x: The input. - inner_dim: The number of units of the inner linear transformation. - mode: A ``tf.estimator.ModeKeys`` mode. - dropout: The probability to drop units from the inner transformation. - - Returns: - The transformed input. - """ - input_dim = x.get_shape().as_list()[-1] - - if mask is not None: - x = x * tf.expand_dims(mask, -1) - - inner = tf.layers.conv1d( - x, inner_dim, 3, padding='same', activation=tf.nn.relu) - - if mask is not None: - inner = inner * tf.expand_dims(mask, -1) - inner = tf.layers.dropout(inner, rate=dropout, training=mode) - outer = tf.layers.conv1d(inner, input_dim, 1) - - return outer - - -def feed_forward_ori(x, inner_dim, mode, dropout=0.0): - """Implements the Transformer's "Feed Forward" layer. - - .. math:: - - ffn(x) = max(0, x*W_1 + b_1)*W_2 + b_2 - - Args: - x: The input. - inner_dim: The number of units of the inner linear transformation. - mode: A ``tf.estimator.ModeKeys`` mode. - dropout: The probability to drop units from the inner transformation. - - Returns: - The transformed input. - """ - input_dim = x.get_shape().as_list()[-1] - - inner = tf.layers.conv1d(x, inner_dim, 1, activation=tf.nn.relu) - inner = tf.layers.dropout(inner, rate=dropout, training=mode) - outer = tf.layers.conv1d(inner, input_dim, 1) - - return outer - - -def norm(inputs): - """Layer normalizes :obj:`inputs`.""" - return tf.contrib.layers.layer_norm(inputs, begin_norm_axis=-1) - - -def drop_and_add(inputs, outputs, mode, dropout=0.1): - """Drops units in the outputs and adds the previous values. 
- - Args: - inputs: The input of the previous layer. - outputs: The output of the previous layer. - mode: A ``tf.estimator.ModeKeys`` mode. - dropout: The probability to drop units in :obj:`outputs`. - - Returns: - The residual and normalized output. - """ - outputs = tf.layers.dropout(outputs, rate=dropout, training=mode) - - input_dim = inputs.get_shape().as_list()[-1] - output_dim = outputs.get_shape().as_list()[-1] - - if input_dim == output_dim: - outputs += inputs - return outputs - - -class FeedForwardNetwork(tf.keras.layers.Layer): - """Implements the Transformer's "Feed Forward" layer. - - .. math:: - - ffn(x) = max(0, x*W_1 + b_1)*W_2 + b_2 - - Note: - Object-oriented implementation for TensorFlow 2.0. - """ - - def __init__(self, - inner_dim, - output_dim, - dropout=0.1, - activation=tf.nn.relu, - **kwargs): - """Initializes this layer. - - Args: - inner_dim: The number of units of the inner linear transformation. - output_dim: The number of units of the ouput linear transformation. - dropout: The probability to drop units from the activation output. - activation: The activation function to apply between the two linear - transformations. - kwargs: Additional layer arguments. - """ - super(FeedForwardNetwork, self).__init__(**kwargs) - self.inner = tf.keras.layers.Dense( - inner_dim, activation=activation, name='inner') - self.outer = tf.keras.layers.Dense(output_dim, name='outer') - self.dropout = dropout - - def call(self, inputs, training=None): # pylint: disable=arguments-differ - """Runs the layer.""" - inner = self.inner(inputs) - inner = tf.layers.dropout(inner, self.dropout, training=training) - return self.outer(inner) - - -class MultiHeadAttention(tf.keras.layers.Layer): - """Computes the multi-head attention as described in - https://arxiv.org/abs/1706.03762. - - Note: - Object-oriented implementation for TensorFlow 2.0. - """ - - def __init__(self, - num_heads, - num_units, - dropout=0.1, - return_attention=False, - **kwargs): - """Initializes this layers. - - Args: - num_heads: The number of attention heads. - num_units: The number of hidden units. - dropout: The probability to drop units from the inputs. - return_attention: If ``True``, also return the attention weights of the - first head. - kwargs: Additional layer arguments. - """ - super(MultiHeadAttention, self).__init__(**kwargs) - if num_units % num_heads != 0: - raise ValueError( - 'Multi head attention requires that num_units is a' - ' multiple of %s' % num_heads) - self.num_heads = num_heads - self.num_units = num_units - self.linear_queries = tf.keras.layers.Dense( - num_units, name='linear_queries') - self.linear_keys = tf.keras.layers.Dense(num_units, name='linear_keys') - self.linear_values = tf.keras.layers.Dense( - num_units, name='linear_values') - self.linear_output = tf.keras.layers.Dense( - num_units, name='linear_output') - self.dropout = dropout - self.return_attention = return_attention - - def call(self, inputs, memory=None, mask=None, cache=None, training=None): # pylint: disable=arguments-differ - """Runs the layer. - - Args: - inputs: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. - memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. - If ``None``, computes self-attention. - mask: A ``tf.Tensor`` applied to the dot product. - cache: A dictionary containing pre-projected keys and values. - training: Run in training mode. 
- - Returns: - A tuple with the attention context, the updated cache and the attention - probabilities of the first head (if :obj:`return_attention` is ``True``). - """ - - def _compute_kv(x): - keys = self.linear_keys(x) - keys = split_heads(keys, self.num_heads) - values = self.linear_values(x) - values = split_heads(values, self.num_heads) - return keys, values - - # Compute queries. - queries = self.linear_queries(inputs) - queries = split_heads(queries, self.num_heads) - queries *= (self.num_units // self.num_heads)**-0.5 - - # Compute keys and values. - if memory is None: - keys, values = _compute_kv(inputs) - if cache: - keys = tf.concat([cache[0], keys], axis=2) - values = tf.concat([cache[1], values], axis=2) - else: - if cache: - if not self.linear_keys.built: - # Ensure that the variable names are not impacted by the tf.cond name - # scope if the layers have not already been built. - with tf.name_scope(self.linear_keys.name): - self.linear_keys.build(memory.shape) - with tf.name_scope(self.linear_values.name): - self.linear_values.build(memory.shape) - keys, values = tf.cond( - tf.equal(tf.shape(cache[0])[2], 0), - true_fn=lambda: _compute_kv(memory), - false_fn=lambda: cache) - else: - keys, values = _compute_kv(memory) - - cache = (keys, values) - - # Dot product attention. - dot = tf.matmul(queries, keys, transpose_b=True) - if mask is not None: - mask = tf.expand_dims(tf.cast(mask, tf.float32), - 1) # Broadcast on heads dimension. - dot = tf.cast( - tf.cast(dot, tf.float32) * mask - + ((1.0 - mask) * tf.float32.min), dot.dtype) # yapf:disable - attn = tf.cast(tf.nn.softmax(tf.cast(dot, tf.float32)), dot.dtype) - drop_attn = tf.layers.dropout(attn, self.dropout, training=training) - heads = tf.matmul(drop_attn, values) - - # Concatenate all heads output. 
- combined = combine_heads(heads) - outputs = self.linear_output(combined) - if self.return_attention: - return outputs, cache, attn - return outputs, cache diff --git a/modelscope/models/audio/tts/models/utils.py b/modelscope/models/audio/tts/models/utils.py deleted file mode 100755 index 03e1ef8c..00000000 --- a/modelscope/models/audio/tts/models/utils.py +++ /dev/null @@ -1,59 +0,0 @@ -import glob -import os - -import matplotlib -import matplotlib.pylab as plt -import torch -from torch.nn.utils import weight_norm - -matplotlib.use('Agg') - - -def plot_spectrogram(spectrogram): - fig, ax = plt.subplots(figsize=(10, 2)) - im = ax.imshow( - spectrogram, aspect='auto', origin='lower', interpolation='none') - plt.colorbar(im, ax=ax) - - fig.canvas.draw() - plt.close() - - return fig - - -def init_weights(m, mean=0.0, std=0.01): - classname = m.__class__.__name__ - if classname.find('Conv') != -1: - m.weight.data.normal_(mean, std) - - -def apply_weight_norm(m): - classname = m.__class__.__name__ - if classname.find('Conv') != -1: - weight_norm(m) - - -def get_padding(kernel_size, dilation=1): - return int((kernel_size * dilation - dilation) / 2) - - -def load_checkpoint(filepath, device): - assert os.path.isfile(filepath) - print("Loading '{}'".format(filepath)) - checkpoint_dict = torch.load(filepath, map_location=device) - print('Complete.') - return checkpoint_dict - - -def save_checkpoint(filepath, obj): - print('Saving checkpoint to {}'.format(filepath)) - torch.save(obj, filepath) - print('Complete.') - - -def scan_checkpoint(cp_dir, prefix): - pattern = os.path.join(cp_dir, prefix + '????????') - cp_list = glob.glob(pattern) - if len(cp_list) == 0: - return None - return sorted(cp_list)[-1] diff --git a/modelscope/models/audio/tts/models/utils/__init__.py b/modelscope/models/audio/tts/models/utils/__init__.py new file mode 100644 index 00000000..e07f08ea --- /dev/null +++ b/modelscope/models/audio/tts/models/utils/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from .utils import * # noqa F403 diff --git a/modelscope/models/audio/tts/models/utils/utils.py b/modelscope/models/audio/tts/models/utils/utils.py new file mode 100755 index 00000000..17ac8aee --- /dev/null +++ b/modelscope/models/audio/tts/models/utils/utils.py @@ -0,0 +1,136 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
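All of the attention variants removed above share the same head bookkeeping: project to [B, T, D], split into [B, H, T, D / H] for the per-head dot products, then merge back after attention. A minimal PyTorch sketch of that round trip, using illustrative local functions rather than anything shipped by this patch, for readers following the deleted TF1 helpers:

import torch

def split_heads(x, num_heads):
    # [B, T, D] -> [B, H, T, D // H], mirroring the deleted split_heads helper
    b, t, d = x.shape
    return x.view(b, t, num_heads, d // num_heads).permute(0, 2, 1, 3)

def combine_heads(x):
    # [B, H, T, D // H] -> [B, T, D], mirroring the deleted combine_heads helper
    b, h, t, dh = x.shape
    return x.permute(0, 2, 1, 3).reshape(b, t, h * dh)

x = torch.randn(2, 5, 8)
assert torch.equal(combine_heads(split_heads(x, num_heads=4)), x)

The queries are additionally scaled by (num_units // num_heads) ** -0.5 before the dot product, which is the standard scaled dot-product attention from the cited paper.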
+ +import glob +import os +import shutil + +import matplotlib +import matplotlib.pylab as plt +import torch + +matplotlib.use('Agg') + + +class AttrDict(dict): + + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +def build_env(config, config_name, path): + t_path = os.path.join(path, config_name) + if config != t_path: + os.makedirs(path, exist_ok=True) + shutil.copyfile(config, os.path.join(path, config_name)) + + +def plot_spectrogram(spectrogram): + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow( + spectrogram, aspect='auto', origin='lower', interpolation='none') + plt.colorbar(im, ax=ax) + + fig.canvas.draw() + plt.close() + + return fig + + +def plot_alignment(alignment, info=None): + fig, ax = plt.subplots() + im = ax.imshow( + alignment, aspect='auto', origin='lower', interpolation='none') + fig.colorbar(im, ax=ax) + xlabel = 'Input timestep' + if info is not None: + xlabel += '\t' + info + plt.xlabel(xlabel) + plt.ylabel('Output timestep') + fig.canvas.draw() + plt.close() + + return fig + + +def load_checkpoint(filepath, device): + assert os.path.isfile(filepath) + checkpoint_dict = torch.load(filepath, map_location=device) + return checkpoint_dict + + +def save_checkpoint(filepath, obj): + torch.save(obj, filepath) + + +def scan_checkpoint(cp_dir, prefix): + pattern = os.path.join(cp_dir, prefix + '????????.pkl') + cp_list = glob.glob(pattern) + if len(cp_list) == 0: + return None + return sorted(cp_list)[-1] + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size * dilation - dilation) / 2) + + +class ValueWindow(): + + def __init__(self, window_size=100): + self._window_size = window_size + self._values = [] + + def append(self, x): + self._values = self._values[-(self._window_size - 1):] + [x] + + @property + def sum(self): + return sum(self._values) + + @property + def count(self): + return len(self._values) + + @property + def average(self): + return self.sum / max(1, self.count) + + def reset(self): + self._values = [] + + +def get_model_size(model): + param_num = sum([p.numel() for p in model.parameters() if p.requires_grad]) + param_size = param_num * 4 / 1024 / 1024 + return param_size + + +def get_grad_norm(model): + total_norm = 0 + params = [ + p for p in model.parameters() if p.grad is not None and p.requires_grad + ] + for p in params: + param_norm = p.grad.detach().data.norm(2) + total_norm += param_norm.item()**2 + total_norm = total_norm**0.5 + return total_norm + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find('Conv') != -1: + m.weight.data.normal_(mean, std) + + +def get_mask_from_lengths(lengths, max_len=None): + batch_size = lengths.shape[0] + if max_len is None: + max_len = torch.max(lengths).item() + + ids = torch.arange(0, max_len).unsqueeze(0).expand(batch_size, + -1).to(lengths.device) + mask = ids >= lengths.unsqueeze(1).expand(-1, max_len) + + return mask diff --git a/modelscope/models/audio/tts/models/vocoder_models.py b/modelscope/models/audio/tts/models/vocoder_models.py deleted file mode 100755 index c46a9204..00000000 --- a/modelscope/models/audio/tts/models/vocoder_models.py +++ /dev/null @@ -1,516 +0,0 @@ -from distutils.version import LooseVersion - -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d -from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm - -from .utils import get_padding, init_weights 
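Among the helpers consolidated into the new utils module above, get_mask_from_lengths builds a boolean padding mask: positions at or beyond each sequence length come back True. A small worked example that inlines the same arithmetic (stand-alone here rather than importing the module, since its package path is introduced by this patch):

import torch

lengths = torch.tensor([3, 1])   # two sequences of length 3 and 1
max_len = 4
ids = torch.arange(0, max_len).unsqueeze(0).expand(lengths.shape[0], -1)
mask = ids >= lengths.unsqueeze(1).expand(-1, max_len)
# mask -> [[False, False, False, True],
#          [False, True,  True,  True]]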
- -is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion('1.7') - - -def stft(x, fft_size, hop_size, win_length, window): - """Perform STFT and convert to magnitude spectrogram. - - Args: - x (Tensor): Input signal tensor (B, T). - fft_size (int): FFT size. - hop_size (int): Hop size. - win_length (int): Window length. - window (str): Window function type. - - Returns: - Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). - - """ - if is_pytorch_17plus: - x_stft = torch.stft( - x, fft_size, hop_size, win_length, window, return_complex=False) - else: - x_stft = torch.stft(x, fft_size, hop_size, win_length, window) - real = x_stft[..., 0] - imag = x_stft[..., 1] - - # NOTE(kan-bayashi): clamp is needed to avoid nan or inf - return torch.sqrt(torch.clamp(real**2 + imag**2, min=1e-7)).transpose(2, 1) - - -LRELU_SLOPE = 0.1 - - -def get_padding_casual(kernel_size, dilation=1): - return int(kernel_size * dilation - dilation) - - -class Conv1dCasual(torch.nn.Module): - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True, - padding_mode='zeros'): - super(Conv1dCasual, self).__init__() - self.pad = padding - self.conv1d = weight_norm( - Conv1d( - in_channels, - out_channels, - kernel_size, - stride, - padding=0, - dilation=dilation, - groups=groups, - bias=bias, - padding_mode=padding_mode)) - self.conv1d.apply(init_weights) - - def forward(self, x): # bdt - # described starting from the last dimension and moving forward. - x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), 'constant') - x = self.conv1d(x) - return x - - def remove_weight_norm(self): - remove_weight_norm(self.conv1d) - - -class ConvTranspose1dCausal(torch.nn.Module): - """CausalConvTranspose1d module with customized initialization.""" - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride, - padding=0): - """Initialize CausalConvTranspose1d module.""" - super(ConvTranspose1dCausal, self).__init__() - self.deconv = weight_norm( - ConvTranspose1d(in_channels, out_channels, kernel_size, stride)) - self.stride = stride - self.deconv.apply(init_weights) - self.pad = kernel_size - stride - - def forward(self, x): - """Calculate forward propagation. - Args: - x (Tensor): Input tensor (B, in_channels, T_in). - Returns: - Tensor: Output tensor (B, out_channels, T_out). 
- """ - # x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), "constant") - return self.deconv(x)[:, :, :-self.pad] - - def remove_weight_norm(self): - remove_weight_norm(self.deconv) - - -class ResBlock1(torch.nn.Module): - - def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): - super(ResBlock1, self).__init__() - self.h = h - self.convs1 = nn.ModuleList([ - Conv1dCasual( - channels, - channels, - kernel_size, - 1, - dilation=dilation[i], - padding=get_padding_casual(kernel_size, dilation[i])) - for i in range(len(dilation)) - ]) - - self.convs2 = nn.ModuleList([ - Conv1dCasual( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding_casual(kernel_size, 1)) - for i in range(len(dilation)) - ]) - - def forward(self, x): - for c1, c2 in zip(self.convs1, self.convs2): - xt = F.leaky_relu(x, LRELU_SLOPE) - xt = c1(xt) - xt = F.leaky_relu(xt, LRELU_SLOPE) - xt = c2(xt) - x = xt + x - return x - - def remove_weight_norm(self): - for layer in self.convs1: - layer.remove_weight_norm() - for layer in self.convs2: - layer.remove_weight_norm() - - -class Generator(torch.nn.Module): - - def __init__(self, h): - super(Generator, self).__init__() - self.h = h - self.num_kernels = len(h.resblock_kernel_sizes) - self.num_upsamples = len(h.upsample_rates) - print('num_kernels={}, num_upsamples={}'.format( - self.num_kernels, self.num_upsamples)) - self.conv_pre = Conv1dCasual( - 80, h.upsample_initial_channel, 7, 1, padding=7 - 1) - resblock = ResBlock1 if h.resblock == '1' else ResBlock2 - - self.ups = nn.ModuleList() - self.repeat_ups = nn.ModuleList() - for i, (u, k) in enumerate( - zip(h.upsample_rates, h.upsample_kernel_sizes)): - upsample = nn.Sequential( - nn.Upsample(mode='nearest', scale_factor=u), - nn.LeakyReLU(LRELU_SLOPE), - Conv1dCasual( - h.upsample_initial_channel // (2**i), - h.upsample_initial_channel // (2**(i + 1)), - kernel_size=7, - stride=1, - padding=7 - 1)) - self.repeat_ups.append(upsample) - self.ups.append( - ConvTranspose1dCausal( - h.upsample_initial_channel // (2**i), - h.upsample_initial_channel // (2**(i + 1)), - k, - u, - padding=(k - u) // 2)) - - self.resblocks = nn.ModuleList() - for i in range(len(self.ups)): - ch = h.upsample_initial_channel // (2**(i + 1)) - for j, (k, d) in enumerate( - zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): - self.resblocks.append(resblock(h, ch, k, d)) - - self.conv_post = Conv1dCasual(ch, 1, 7, 1, padding=7 - 1) - - def forward(self, x): - x = self.conv_pre(x) - for i in range(self.num_upsamples): - x = torch.sin(x) + x - # transconv - x1 = F.leaky_relu(x, LRELU_SLOPE) - x1 = self.ups[i](x1) - # repeat - x2 = self.repeat_ups[i](x) - x = x1 + x2 - xs = None - for j in range(self.num_kernels): - if xs is None: - xs = self.resblocks[i * self.num_kernels + j](x) - else: - xs += self.resblocks[i * self.num_kernels + j](x) - x = xs / self.num_kernels - x = F.leaky_relu(x) - x = self.conv_post(x) - x = torch.tanh(x) - return x - - def remove_weight_norm(self): - print('Removing weight norm...') - for layer in self.ups: - layer.remove_weight_norm() - for layer in self.repeat_ups: - layer[-1].remove_weight_norm() - for layer in self.resblocks: - layer.remove_weight_norm() - self.conv_pre.remove_weight_norm() - self.conv_post.remove_weight_norm() - - -class DiscriminatorP(torch.nn.Module): - - def __init__(self, - period, - kernel_size=5, - stride=3, - use_spectral_norm=False): - super(DiscriminatorP, self).__init__() - self.period = period - norm_f = weight_norm if use_spectral_norm is False else 
spectral_norm - self.convs = nn.ModuleList([ - norm_f( - Conv2d( - 1, - 32, (kernel_size, 1), (stride, 1), - padding=(get_padding(5, 1), 0))), - norm_f( - Conv2d( - 32, - 128, (kernel_size, 1), (stride, 1), - padding=(get_padding(5, 1), 0))), - norm_f( - Conv2d( - 128, - 512, (kernel_size, 1), (stride, 1), - padding=(get_padding(5, 1), 0))), - norm_f( - Conv2d( - 512, - 1024, (kernel_size, 1), (stride, 1), - padding=(get_padding(5, 1), 0))), - norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), - ]) - self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) - - def forward(self, x): - fmap = [] - - # 1d to 2d - b, c, t = x.shape - if t % self.period != 0: # pad first - n_pad = self.period - (t % self.period) - x = F.pad(x, (0, n_pad), 'reflect') - t = t + n_pad - x = x.view(b, c, t // self.period, self.period) - - for layer in self.convs: - x = layer(x) - x = F.leaky_relu(x, LRELU_SLOPE) - fmap.append(x) - x = self.conv_post(x) - fmap.append(x) - x = torch.flatten(x, 1, -1) - - return x, fmap - - -class MultiPeriodDiscriminator(torch.nn.Module): - - def __init__(self): - super(MultiPeriodDiscriminator, self).__init__() - self.discriminators = nn.ModuleList([ - DiscriminatorP(2), - DiscriminatorP(3), - DiscriminatorP(5), - DiscriminatorP(7), - DiscriminatorP(11), - ]) - - def forward(self, y, y_hat): - y_d_rs = [] - y_d_gs = [] - fmap_rs = [] - fmap_gs = [] - for i, d in enumerate(self.discriminators): - y_d_r, fmap_r = d(y) - y_d_g, fmap_g = d(y_hat) - y_d_rs.append(y_d_r) - fmap_rs.append(fmap_r) - y_d_gs.append(y_d_g) - fmap_gs.append(fmap_g) - - return y_d_rs, y_d_gs, fmap_rs, fmap_gs - - -class DiscriminatorS(torch.nn.Module): - - def __init__(self, use_spectral_norm=False): - super(DiscriminatorS, self).__init__() - norm_f = weight_norm if use_spectral_norm is False else spectral_norm - self.convs = nn.ModuleList([ - norm_f(Conv1d(1, 128, 15, 1, padding=7)), - norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), - norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)), - norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)), - norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), - norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), - norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), - ]) - self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) - - def forward(self, x): - fmap = [] - for layer in self.convs: - x = layer(x) - x = F.leaky_relu(x, LRELU_SLOPE) - fmap.append(x) - x = self.conv_post(x) - fmap.append(x) - x = torch.flatten(x, 1, -1) - - return x, fmap - - -class MultiScaleDiscriminator(torch.nn.Module): - - def __init__(self): - super(MultiScaleDiscriminator, self).__init__() - self.discriminators = nn.ModuleList([ - DiscriminatorS(use_spectral_norm=True), - DiscriminatorS(), - DiscriminatorS(), - ]) - from pytorch_wavelets import DWT1DForward - self.meanpools = nn.ModuleList( - [DWT1DForward(wave='db3', J=1), - DWT1DForward(wave='db3', J=1)]) - self.convs = nn.ModuleList([ - weight_norm(Conv1d(2, 1, 15, 1, padding=7)), - weight_norm(Conv1d(2, 1, 15, 1, padding=7)) - ]) - - def forward(self, y, y_hat): - y_d_rs = [] - y_d_gs = [] - fmap_rs = [] - fmap_gs = [] - for i, d in enumerate(self.discriminators): - if i != 0: - yl, yh = self.meanpools[i - 1](y) - y = torch.cat([yl, yh[0]], dim=1) - y = self.convs[i - 1](y) - y = F.leaky_relu(y, LRELU_SLOPE) - - yl_hat, yh_hat = self.meanpools[i - 1](y_hat) - y_hat = torch.cat([yl_hat, yh_hat[0]], dim=1) - y_hat = self.convs[i - 1](y_hat) - y_hat = F.leaky_relu(y_hat, LRELU_SLOPE) - - y_d_r, fmap_r = 
d(y) - y_d_g, fmap_g = d(y_hat) - y_d_rs.append(y_d_r) - fmap_rs.append(fmap_r) - y_d_gs.append(y_d_g) - fmap_gs.append(fmap_g) - - return y_d_rs, y_d_gs, fmap_rs, fmap_gs - - -class DiscriminatorSTFT(torch.nn.Module): - - def __init__(self, - kernel_size=11, - stride=2, - use_spectral_norm=False, - fft_size=1024, - shift_size=120, - win_length=600, - window='hann_window'): - super(DiscriminatorSTFT, self).__init__() - self.fft_size = fft_size - self.shift_size = shift_size - self.win_length = win_length - norm_f = weight_norm if use_spectral_norm is False else spectral_norm - self.convs = nn.ModuleList([ - norm_f( - Conv2d( - fft_size // 2 + 1, - 32, (15, 1), (1, 1), - padding=(get_padding(15, 1), 0))), - norm_f( - Conv2d( - 32, - 32, (kernel_size, 1), (stride, 1), - padding=(get_padding(9, 1), 0))), - norm_f( - Conv2d( - 32, - 32, (kernel_size, 1), (stride, 1), - padding=(get_padding(9, 1), 0))), - norm_f( - Conv2d( - 32, - 32, (kernel_size, 1), (stride, 1), - padding=(get_padding(9, 1), 0))), - norm_f(Conv2d(32, 32, (5, 1), (1, 1), padding=(2, 0))), - ]) - self.conv_post = norm_f(Conv2d(32, 1, (3, 1), (1, 1), padding=(1, 0))) - self.register_buffer('window', getattr(torch, window)(win_length)) - - def forward(self, wav): - wav = torch.squeeze(wav, 1) - x_mag = stft(wav, self.fft_size, self.shift_size, self.win_length, - self.window) - x = torch.transpose(x_mag, 2, 1).unsqueeze(-1) - fmap = [] - for layer in self.convs: - x = layer(x) - x = F.leaky_relu(x, LRELU_SLOPE) - fmap.append(x) - x = self.conv_post(x) - fmap.append(x) - x = x.squeeze(-1) - - return x, fmap - - -class MultiSTFTDiscriminator(torch.nn.Module): - - def __init__( - self, - fft_sizes=[1024, 2048, 512], - hop_sizes=[120, 240, 50], - win_lengths=[600, 1200, 240], - window='hann_window', - ): - super(MultiSTFTDiscriminator, self).__init__() - self.discriminators = nn.ModuleList() - for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): - self.discriminators += [ - DiscriminatorSTFT(fft_size=fs, shift_size=ss, win_length=wl) - ] - - def forward(self, y, y_hat): - y_d_rs = [] - y_d_gs = [] - fmap_rs = [] - fmap_gs = [] - for i, d in enumerate(self.discriminators): - y_d_r, fmap_r = d(y) - y_d_g, fmap_g = d(y_hat) - y_d_rs.append(y_d_r) - fmap_rs.append(fmap_r) - y_d_gs.append(y_d_g) - fmap_gs.append(fmap_g) - - return y_d_rs, y_d_gs, fmap_rs, fmap_gs - - -def feature_loss(fmap_r, fmap_g): - loss = 0 - for dr, dg in zip(fmap_r, fmap_g): - for rl, gl in zip(dr, dg): - loss += torch.mean(torch.abs(rl - gl)) - - return loss * 2 - - -def discriminator_loss(disc_real_outputs, disc_generated_outputs): - loss = 0 - r_losses = [] - g_losses = [] - for dr, dg in zip(disc_real_outputs, disc_generated_outputs): - r_loss = torch.mean((1 - dr)**2) - g_loss = torch.mean(dg**2) - loss += (r_loss + g_loss) - r_losses.append(r_loss.item()) - g_losses.append(g_loss.item()) - - return loss, r_losses, g_losses - - -def generator_loss(disc_outputs): - loss = 0 - gen_losses = [] - for dg in disc_outputs: - temp_loss = torch.mean((1 - dg)**2) - gen_losses.append(temp_loss) - loss += temp_loss - - return loss, gen_losses diff --git a/modelscope/models/audio/tts/sambert_hifi.py b/modelscope/models/audio/tts/sambert_hifi.py index 79f8068e..a9b55795 100644 --- a/modelscope/models/audio/tts/sambert_hifi.py +++ b/modelscope/models/audio/tts/sambert_hifi.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
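The training losses dropped with vocoder_models.py above follow the least-squares GAN formulation: the discriminator pushes real scores toward 1 and fake scores toward 0, while the generator pushes its fake scores toward 1. A stand-alone restatement for a single discriminator output, included only to make the removed code easier to follow (not an API of this repository):

import torch

def d_loss(d_real, d_fake):
    # mean((1 - D(real))^2) + mean(D(fake)^2), as in the deleted discriminator_loss
    return torch.mean((1 - d_real) ** 2) + torch.mean(d_fake ** 2)

def g_loss(d_fake):
    # mean((1 - D(fake))^2), as in the deleted generator_loss
    return torch.mean((1 - d_fake) ** 2)

d_real = torch.full((4,), 0.9)   # discriminator scores on real audio
d_fake = torch.full((4,), 0.1)   # discriminator scores on generated audio
print(d_loss(d_real, d_fake))    # ~0.02
print(g_loss(d_fake))            # ~0.81

The deleted feature_loss adds an L1 term over the discriminators' intermediate feature maps on top of these adversarial terms.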
+ from __future__ import (absolute_import, division, print_function, unicode_literals) import os @@ -11,13 +13,11 @@ from modelscope.models.base import Model from modelscope.models.builder import MODELS from modelscope.utils.audio.tts_exceptions import ( TtsFrontendInitializeFailedException, - TtsFrontendLanguageTypeInvalidException, TtsModelConfigurationExcetion, + TtsFrontendLanguageTypeInvalidException, TtsModelConfigurationException, TtsVoiceNotExistsException) from modelscope.utils.constant import Tasks from .voice import Voice -import tensorflow as tf # isort:skip - __all__ = ['SambertHifigan'] @@ -28,14 +28,15 @@ class SambertHifigan(Model): def __init__(self, model_dir, *args, **kwargs): super().__init__(model_dir, *args, **kwargs) if 'am' not in kwargs: - raise TtsModelConfigurationExcetion( - 'configuration model field missing am!') + raise TtsModelConfigurationException( + 'modelscope error: configuration model field missing am!') if 'vocoder' not in kwargs: - raise TtsModelConfigurationExcetion( - 'configuration model field missing vocoder!') + raise TtsModelConfigurationException( + 'modelscope error: configuration model field missing vocoder!') if 'lang_type' not in kwargs: - raise TtsModelConfigurationExcetion( - 'configuration model field missing lang_type!') + raise TtsModelConfigurationException( + 'modelscope error: configuration model field missing lang_type!' + ) am_cfg = kwargs['am'] voc_cfg = kwargs['vocoder'] # initialize frontend @@ -47,10 +48,12 @@ class SambertHifigan(Model): zip_ref.extractall(model_dir) if not frontend.initialize(self.__res_path): raise TtsFrontendInitializeFailedException( - 'resource invalid: {}'.format(self.__res_path)) + 'modelscope error: resource invalid: {}'.format( + self.__res_path)) if not frontend.set_lang_type(kwargs['lang_type']): raise TtsFrontendLanguageTypeInvalidException( - 'language type invalid: {}'.format(kwargs['lang_type'])) + 'modelscope error: language type invalid: {}'.format( + kwargs['lang_type'])) self.__frontend = frontend zip_file = os.path.join(model_dir, 'voices.zip') self.__voice_path = os.path.join(model_dir, 'voices') @@ -60,7 +63,8 @@ class SambertHifigan(Model): with open(voice_cfg_path, 'r') as f: voice_cfg = json.load(f) if 'voices' not in voice_cfg: - raise TtsModelConfigurationExcetion('voices invalid') + raise TtsModelConfigurationException( + 'modelscope error: voices invalid') self.__voice = {} for name in voice_cfg['voices']: voice_path = os.path.join(self.__voice_path, name) @@ -70,11 +74,13 @@ class SambertHifigan(Model): if voice_cfg['voices']: self.__default_voice_name = voice_cfg['voices'][0] else: - raise TtsVoiceNotExistsException('voices is empty in voices.json') + raise TtsVoiceNotExistsException( + 'modelscope error: voices is empty in voices.json') def __synthesis_one_sentences(self, voice_name, text): if voice_name not in self.__voice: - raise TtsVoiceNotExistsException(f'Voice {voice_name} not exists') + raise TtsVoiceNotExistsException( + f'modelscope error: Voice {voice_name} not exists') return self.__voice[voice_name].forward(text) def forward(self, text: str, voice_name: str = None): diff --git a/modelscope/models/audio/tts/text/cleaners.py b/modelscope/models/audio/tts/text/cleaners.py deleted file mode 100755 index 19d838d1..00000000 --- a/modelscope/models/audio/tts/text/cleaners.py +++ /dev/null @@ -1,89 +0,0 @@ -''' -Cleaners are transformations that run over the input text at both training and eval time. 
- -Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" -hyperparameter. Some cleaners are English-specific. You'll typically want to use: - 1. "english_cleaners" for English text - 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using - the Unidecode library (https://pypi.python.org/pypi/Unidecode) - 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update - the symbols in symbols.py to match your data). -''' - -import re - -from unidecode import unidecode - -from .numbers import normalize_numbers - -# Regular expression matching whitespace: -_whitespace_re = re.compile(r'\s+') - -# List of (regular expression, replacement) pairs for abbreviations: -_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) - for x in [ - ('mrs', 'misess'), - ('mr', 'mister'), - ('dr', 'doctor'), - ('st', 'saint'), - ('co', 'company'), - ('jr', 'junior'), - ('maj', 'major'), - ('gen', 'general'), - ('drs', 'doctors'), - ('rev', 'reverend'), - ('lt', 'lieutenant'), - ('hon', 'honorable'), - ('sgt', 'sergeant'), - ('capt', 'captain'), - ('esq', 'esquire'), - ('ltd', 'limited'), - ('col', 'colonel'), - ('ft', 'fort'), ]] # yapf:disable - - -def expand_abbreviations(text): - for regex, replacement in _abbreviations: - text = re.sub(regex, replacement, text) - return text - - -def expand_numbers(text): - return normalize_numbers(text) - - -def lowercase(text): - return text.lower() - - -def collapse_whitespace(text): - return re.sub(_whitespace_re, ' ', text) - - -def convert_to_ascii(text): - return unidecode(text) - - -def basic_cleaners(text): - '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' - text = lowercase(text) - text = collapse_whitespace(text) - return text - - -def transliteration_cleaners(text): - '''Pipeline for non-English text that transliterates to ASCII.''' - text = convert_to_ascii(text) - text = lowercase(text) - text = collapse_whitespace(text) - return text - - -def english_cleaners(text): - '''Pipeline for English text, including number and abbreviation expansion.''' - text = convert_to_ascii(text) - text = lowercase(text) - text = expand_numbers(text) - text = expand_abbreviations(text) - text = collapse_whitespace(text) - return text diff --git a/modelscope/models/audio/tts/text/cmudict.py b/modelscope/models/audio/tts/text/cmudict.py deleted file mode 100755 index b4da4be9..00000000 --- a/modelscope/models/audio/tts/text/cmudict.py +++ /dev/null @@ -1,64 +0,0 @@ -import re - -valid_symbols = [ - 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', - 'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', - 'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', - 'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', - 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', - 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', - 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', - 'Y', 'Z', 'ZH' -] - -_valid_symbol_set = set(valid_symbols) - - -class CMUDict: - '''Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' - - def __init__(self, file_or_path, keep_ambiguous=True): - if isinstance(file_or_path, str): - with open(file_or_path, encoding='latin-1') as f: - entries = _parse_cmudict(f) - else: - entries = _parse_cmudict(file_or_path) - if not keep_ambiguous: - entries = { - word: pron - for word, pron in entries.items() if len(pron) == 1 - } - self._entries = entries - - def __len__(self): - return len(self._entries) - - def lookup(self, word): - '''Returns list of ARPAbet pronunciations of the given word.''' - return self._entries.get(word.upper()) - - -_alt_re = re.compile(r'\([0-9]+\)') - - -def _parse_cmudict(file): - cmudict = {} - for line in file: - if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): - parts = line.split(' ') - word = re.sub(_alt_re, '', parts[0]) - pronunciation = _get_pronunciation(parts[1]) - if pronunciation: - if word in cmudict: - cmudict[word].append(pronunciation) - else: - cmudict[word] = [pronunciation] - return cmudict - - -def _get_pronunciation(s): - parts = s.strip().split(' ') - for part in parts: - if part not in _valid_symbol_set: - return None - return ' '.join(parts) diff --git a/modelscope/models/audio/tts/text/symbols.py b/modelscope/models/audio/tts/text/symbols.py deleted file mode 100644 index 63975abb..00000000 --- a/modelscope/models/audio/tts/text/symbols.py +++ /dev/null @@ -1,105 +0,0 @@ -''' -Defines the set of symbols used in text input to the model. - -The default is a set of ASCII characters that works well for English or text that has been run -through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. -''' -import codecs -import os - -_pad = '_' -_eos = '~' -_mask = '@[MASK]' - - -def load_symbols(dict_path, has_mask=True): - _characters = '' - _ch_symbols = [] - sy_dict_name = 'sy_dict.txt' - sy_dict_path = os.path.join(dict_path, sy_dict_name) - f = codecs.open(sy_dict_path, 'r') - for line in f: - line = line.strip('\r\n') - _ch_symbols.append(line) - - _arpabet = ['@' + s for s in _ch_symbols] - - # Export all symbols: - sy = list(_characters) + _arpabet + [_pad, _eos] - if has_mask: - sy.append(_mask) - - _characters = '' - - _ch_tones = [] - tone_dict_name = 'tone_dict.txt' - tone_dict_path = os.path.join(dict_path, tone_dict_name) - f = codecs.open(tone_dict_path, 'r') - for line in f: - line = line.strip('\r\n') - _ch_tones.append(line) - - # Export all tones: - tone = list(_characters) + _ch_tones + [_pad, _eos] - if has_mask: - tone.append(_mask) - - _characters = '' - - _ch_syllable_flags = [] - syllable_flag_name = 'syllable_flag_dict.txt' - syllable_flag_path = os.path.join(dict_path, syllable_flag_name) - f = codecs.open(syllable_flag_path, 'r') - for line in f: - line = line.strip('\r\n') - _ch_syllable_flags.append(line) - - # Export all syllable_flags: - syllable_flag = list(_characters) + _ch_syllable_flags + [_pad, _eos] - if has_mask: - syllable_flag.append(_mask) - - _characters = '' - - _ch_word_segments = [] - word_segment_name = 'word_segment_dict.txt' - word_segment_path = os.path.join(dict_path, word_segment_name) - f = codecs.open(word_segment_path, 'r') - for line in f: - line = line.strip('\r\n') - _ch_word_segments.append(line) - - # Export all syllable_flags: - word_segment = list(_characters) + _ch_word_segments + [_pad, _eos] - if has_mask: - word_segment.append(_mask) - - _characters = '' - - _ch_emo_types = [] - emo_category_name = 'emo_category_dict.txt' - emo_category_path = os.path.join(dict_path, 
emo_category_name) - f = codecs.open(emo_category_path, 'r') - for line in f: - line = line.strip('\r\n') - _ch_emo_types.append(line) - - emo_category = list(_characters) + _ch_emo_types + [_pad, _eos] - if has_mask: - emo_category.append(_mask) - - _characters = '' - - _ch_speakers = [] - speaker_name = 'speaker_dict.txt' - speaker_path = os.path.join(dict_path, speaker_name) - f = codecs.open(speaker_path, 'r') - for line in f: - line = line.strip('\r\n') - _ch_speakers.append(line) - - # Export all syllable_flags: - speaker = list(_characters) + _ch_speakers + [_pad, _eos] - if has_mask: - speaker.append(_mask) - return sy, tone, syllable_flag, word_segment, emo_category, speaker diff --git a/modelscope/models/audio/tts/text/symbols_dict.py b/modelscope/models/audio/tts/text/symbols_dict.py deleted file mode 100644 index e8f7ed19..00000000 --- a/modelscope/models/audio/tts/text/symbols_dict.py +++ /dev/null @@ -1,200 +0,0 @@ -import re -import sys - -from .cleaners import (basic_cleaners, english_cleaners, - transliteration_cleaners) - - -class SymbolsDict: - - def __init__(self, sy, tone, syllable_flag, word_segment, emo_category, - speaker, inputs_dim, lfeat_type_list): - self._inputs_dim = inputs_dim - self._lfeat_type_list = lfeat_type_list - self._sy_to_id = {s: i for i, s in enumerate(sy)} - self._id_to_sy = {i: s for i, s in enumerate(sy)} - self._tone_to_id = {s: i for i, s in enumerate(tone)} - self._id_to_tone = {i: s for i, s in enumerate(tone)} - self._syllable_flag_to_id = {s: i for i, s in enumerate(syllable_flag)} - self._id_to_syllable_flag = {i: s for i, s in enumerate(syllable_flag)} - self._word_segment_to_id = {s: i for i, s in enumerate(word_segment)} - self._id_to_word_segment = {i: s for i, s in enumerate(word_segment)} - self._emo_category_to_id = {s: i for i, s in enumerate(emo_category)} - self._id_to_emo_category = {i: s for i, s in enumerate(emo_category)} - self._speaker_to_id = {s: i for i, s in enumerate(speaker)} - self._id_to_speaker = {i: s for i, s in enumerate(speaker)} - print('_sy_to_id: ') - print(self._sy_to_id) - print('_tone_to_id: ') - print(self._tone_to_id) - print('_syllable_flag_to_id: ') - print(self._syllable_flag_to_id) - print('_word_segment_to_id: ') - print(self._word_segment_to_id) - print('_emo_category_to_id: ') - print(self._emo_category_to_id) - print('_speaker_to_id: ') - print(self._speaker_to_id) - self._curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') - self._cleaners = { - basic_cleaners.__name__: basic_cleaners, - transliteration_cleaners.__name__: transliteration_cleaners, - english_cleaners.__name__: english_cleaners - } - - def _clean_text(self, text, cleaner_names): - for name in cleaner_names: - cleaner = self._cleaners.get(name) - if not cleaner: - raise Exception('Unknown cleaner: %s' % name) - text = cleaner(text) - return text - - def _sy_to_sequence(self, sy): - return [self._sy_to_id[s] for s in sy if self._should_keep_sy(s)] - - def _arpabet_to_sequence(self, text): - return self._sy_to_sequence(['@' + s for s in text.split()]) - - def _should_keep_sy(self, s): - return s in self._sy_to_id and s != '_' and s != '~' - - def symbol_to_sequence(self, this_lfeat_symbol, lfeat_type, cleaner_names): - sequence = [] - if lfeat_type == 'sy': - this_lfeat_symbol = this_lfeat_symbol.strip().split(' ') - this_lfeat_symbol_format = '' - index = 0 - while index < len(this_lfeat_symbol): - this_lfeat_symbol_format = this_lfeat_symbol_format + '{' + this_lfeat_symbol[ - index] + '}' + ' ' - index = index + 1 - sequence = 
self.text_to_sequence(this_lfeat_symbol_format, - cleaner_names) - elif lfeat_type == 'tone': - sequence = self.tone_to_sequence(this_lfeat_symbol) - elif lfeat_type == 'syllable_flag': - sequence = self.syllable_flag_to_sequence(this_lfeat_symbol) - elif lfeat_type == 'word_segment': - sequence = self.word_segment_to_sequence(this_lfeat_symbol) - elif lfeat_type == 'emo_category': - sequence = self.emo_category_to_sequence(this_lfeat_symbol) - elif lfeat_type == 'speaker': - sequence = self.speaker_to_sequence(this_lfeat_symbol) - else: - raise Exception('Unknown lfeat type: %s' % lfeat_type) - - return sequence - - def text_to_sequence(self, text, cleaner_names): - '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. - - The text can optionally have ARPAbet sequences enclosed in curly braces embedded - in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." - - Args: - text: string to convert to a sequence - cleaner_names: names of the cleaner functions to run the text through - - Returns: - List of integers corresponding to the symbols in the text - ''' - sequence = [] - - # Check for curly braces and treat their contents as ARPAbet: - while len(text): - m = self._curly_re.match(text) - if not m: - sequence += self._sy_to_sequence( - self._clean_text(text, cleaner_names)) - break - sequence += self._sy_to_sequence( - self._clean_text(m.group(1), cleaner_names)) - sequence += self._arpabet_to_sequence(m.group(2)) - text = m.group(3) - - # Append EOS token - sequence.append(self._sy_to_id['~']) - return sequence - - def tone_to_sequence(self, tone): - tones = tone.strip().split(' ') - sequence = [] - for this_tone in tones: - sequence.append(self._tone_to_id[this_tone]) - sequence.append(self._tone_to_id['~']) - return sequence - - def syllable_flag_to_sequence(self, syllable_flag): - syllable_flags = syllable_flag.strip().split(' ') - sequence = [] - for this_syllable_flag in syllable_flags: - sequence.append(self._syllable_flag_to_id[this_syllable_flag]) - sequence.append(self._syllable_flag_to_id['~']) - return sequence - - def word_segment_to_sequence(self, word_segment): - word_segments = word_segment.strip().split(' ') - sequence = [] - for this_word_segment in word_segments: - sequence.append(self._word_segment_to_id[this_word_segment]) - sequence.append(self._word_segment_to_id['~']) - return sequence - - def emo_category_to_sequence(self, emo_type): - emo_categories = emo_type.strip().split(' ') - sequence = [] - for this_category in emo_categories: - sequence.append(self._emo_category_to_id[this_category]) - sequence.append(self._emo_category_to_id['~']) - return sequence - - def speaker_to_sequence(self, speaker): - speakers = speaker.strip().split(' ') - sequence = [] - for this_speaker in speakers: - sequence.append(self._speaker_to_id[this_speaker]) - sequence.append(self._speaker_to_id['~']) - return sequence - - def sequence_to_symbol(self, sequence): - result = '' - pre_lfeat_dim = 0 - for lfeat_type in self._lfeat_type_list: - current_one_hot_sequence = sequence[:, pre_lfeat_dim:pre_lfeat_dim - + self._inputs_dim[lfeat_type]] - current_sequence = current_one_hot_sequence.argmax(1) - length = current_sequence.shape[0] - - index = 0 - while index < length: - this_sequence = current_sequence[index] - s = '' - if lfeat_type == 'sy': - s = self._id_to_sy[this_sequence] - if len(s) > 1 and s[0] == '@': - s = s[1:] - elif lfeat_type == 'tone': - s = self._id_to_tone[this_sequence] - elif lfeat_type == 'syllable_flag': - s = 
self._id_to_syllable_flag[this_sequence] - elif lfeat_type == 'word_segment': - s = self._id_to_word_segment[this_sequence] - elif lfeat_type == 'emo_category': - s = self._id_to_emo_category[this_sequence] - elif lfeat_type == 'speaker': - s = self._id_to_speaker[this_sequence] - else: - raise Exception('Unknown lfeat type: %s' % lfeat_type) - - if index == 0: - result = result + lfeat_type + ': ' - - result = result + '{' + s + '}' - - if index == length - 1: - result = result + '; ' - - index = index + 1 - pre_lfeat_dim = pre_lfeat_dim + self._inputs_dim[lfeat_type] - return result diff --git a/modelscope/models/audio/tts/voice.py b/modelscope/models/audio/tts/voice.py index deaebf11..dc830db5 100644 --- a/modelscope/models/audio/tts/voice.py +++ b/modelscope/models/audio/tts/voice.py @@ -1,286 +1,111 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os +import pickle as pkl import json import numpy as np import torch -from sklearn.preprocessing import MultiLabelBinarizer +from modelscope.utils.audio.tts_exceptions import \ + TtsModelConfigurationException from modelscope.utils.constant import ModelFile, Tasks -from .models import Generator, create_am_model -from .text.symbols import load_symbols -from .text.symbols_dict import SymbolsDict - -import tensorflow as tf # isort:skip +from .models.datasets.units import KanTtsLinguisticUnit +from .models.models.hifigan import Generator +from .models.models.sambert import KanTtsSAMBERT +from .models.utils import (AttrDict, build_env, init_weights, load_checkpoint, + plot_spectrogram, save_checkpoint, scan_checkpoint) MAX_WAV_VALUE = 32768.0 -def multi_label_symbol_to_sequence(my_classes, my_symbol): - one_hot = MultiLabelBinarizer(classes=my_classes) - tokens = my_symbol.strip().split(' ') - sequences = [] - for token in tokens: - sequences.append(tuple(token.split('&'))) - return one_hot.fit_transform(sequences) - - -def load_checkpoint(filepath, device): - assert os.path.isfile(filepath) - checkpoint_dict = torch.load(filepath, map_location=device) - return checkpoint_dict - - -class AttrDict(dict): - - def __init__(self, *args, **kwargs): - super(AttrDict, self).__init__(*args, **kwargs) - self.__dict__ = self - - class Voice: - def __init__(self, voice_name, voice_path, am_hparams, voc_config): + def __init__(self, voice_name, voice_path, am_config, voc_config): self.__voice_name = voice_name self.__voice_path = voice_path - self.__am_hparams = tf.contrib.training.HParams(**am_hparams) + self.__am_config = AttrDict(**am_config) self.__voc_config = AttrDict(**voc_config) self.__model_loaded = False + if 'am' not in self.__am_config: + raise TtsModelConfigurationException( + 'modelscope error: am configuration invalid') + if 'linguistic_unit' not in self.__am_config: + raise TtsModelConfigurationException( + 'modelscope error: am configuration invalid') + self.__am_lingustic_unit_config = self.__am_config['linguistic_unit'] def __load_am(self): - local_am_ckpt_path = os.path.join(self.__voice_path, - ModelFile.TF_CHECKPOINT_FOLDER) - self.__am_ckpt_path = os.path.join(local_am_ckpt_path, 'ckpt') - self.__dict_path = os.path.join(self.__voice_path, 'dicts') + local_am_ckpt_path = os.path.join(self.__voice_path, 'am') + self.__am_ckpt_path = os.path.join(local_am_ckpt_path, + ModelFile.TORCH_MODEL_BIN_FILE) has_mask = True - if self.__am_hparams.get('has_mask') is not None: - has_mask = self.__am_hparams.has_mask - model_name = 'robutrans' - self.__lfeat_type_list = self.__am_hparams.lfeat_type_list.strip( - ).split(',') - sy, 
tone, syllable_flag, word_segment, emo_category, speaker = load_symbols( - self.__dict_path, has_mask) - self.__sy = sy - self.__tone = tone - self.__syllable_flag = syllable_flag - self.__word_segment = word_segment - self.__emo_category = emo_category - self.__speaker = speaker - self.__inputs_dim = dict() - for lfeat_type in self.__lfeat_type_list: - if lfeat_type == 'sy': - self.__inputs_dim[lfeat_type] = len(sy) - elif lfeat_type == 'tone': - self.__inputs_dim[lfeat_type] = len(tone) - elif lfeat_type == 'syllable_flag': - self.__inputs_dim[lfeat_type] = len(syllable_flag) - elif lfeat_type == 'word_segment': - self.__inputs_dim[lfeat_type] = len(word_segment) - elif lfeat_type == 'emo_category': - self.__inputs_dim[lfeat_type] = len(emo_category) - elif lfeat_type == 'speaker': - self.__inputs_dim[lfeat_type] = len(speaker) - - self.__symbols_dict = SymbolsDict(sy, tone, syllable_flag, - word_segment, emo_category, speaker, - self.__inputs_dim, - self.__lfeat_type_list) - dim_inputs = sum(self.__inputs_dim.values( - )) - self.__inputs_dim['speaker'] - self.__inputs_dim['emo_category'] - self.__graph = tf.Graph() - with self.__graph.as_default(): - inputs = tf.placeholder(tf.float32, [1, None, dim_inputs], - 'inputs') - inputs_emotion = tf.placeholder( - tf.float32, [1, None, self.__inputs_dim['emo_category']], - 'inputs_emotion') - inputs_speaker = tf.placeholder( - tf.float32, [1, None, self.__inputs_dim['speaker']], - 'inputs_speaker') - input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths') - pitch_contours_scale = tf.placeholder(tf.float32, [1, None], - 'pitch_contours_scale') - energy_contours_scale = tf.placeholder(tf.float32, [1, None], - 'energy_contours_scale') - duration_scale = tf.placeholder(tf.float32, [1, None], - 'duration_scale') - with tf.variable_scope('model') as _: - self.__model = create_am_model(model_name, self.__am_hparams) - self.__model.initialize( - inputs, - inputs_emotion, - inputs_speaker, - input_lengths, - duration_scales=duration_scale, - pitch_scales=pitch_contours_scale, - energy_scales=energy_contours_scale) - self.__mel_spec = self.__model.mel_outputs[0] - self.__duration_outputs = self.__model.duration_outputs[0] - self.__duration_outputs_ = self.__model.duration_outputs_[0] - self.__pitch_contour_outputs = self.__model.pitch_contour_outputs[ - 0] - self.__energy_contour_outputs = self.__model.energy_contour_outputs[ - 0] - self.__embedded_inputs_emotion = self.__model.embedded_inputs_emotion[ - 0] - self.__embedding_fsmn_outputs = self.__model.embedding_fsmn_outputs[ - 0] - self.__encoder_outputs = self.__model.encoder_outputs[0] - self.__pitch_embeddings = self.__model.pitch_embeddings[0] - self.__energy_embeddings = self.__model.energy_embeddings[0] - self.__LR_outputs = self.__model.LR_outputs[0] - self.__postnet_fsmn_outputs = self.__model.postnet_fsmn_outputs[ - 0] - self.__attention_h = self.__model.attention_h - self.__attention_x = self.__model.attention_x - - config = tf.ConfigProto() - config.gpu_options.allow_growth = True - self.__session = tf.Session(config=config) - self.__session.run(tf.global_variables_initializer()) - - saver = tf.train.Saver() - saver.restore(self.__session, self.__am_ckpt_path) + if 'has_mask' in self.__am_lingustic_unit_config: + has_mask = self.__am_lingustic_unit_config.has_mask + self.__ling_unit = KanTtsLinguisticUnit( + self.__am_lingustic_unit_config, self.__voice_path, has_mask) + self.__am_net = KanTtsSAMBERT(self.__am_config, + self.__ling_unit.get_unit_size()).to( + self.__device) + 
state_dict_g = {} + try: + state_dict_g = load_checkpoint(self.__am_ckpt_path, self.__device) + except RuntimeError: + with open(self.__am_ckpt_path, 'rb') as f: + pth_var_dict = pkl.load(f) + state_dict_g['fsnet'] = { + k: torch.FloatTensor(v) + for k, v in pth_var_dict['fsnet'].items() + } + self.__am_net.load_state_dict(state_dict_g['fsnet'], strict=False) + self.__am_net.eval() def __load_vocoder(self): - self.__voc_ckpt_path = os.path.join(self.__voice_path, + local_voc_ckpy_path = os.path.join(self.__voice_path, 'vocoder') + self.__voc_ckpt_path = os.path.join(local_voc_ckpy_path, ModelFile.TORCH_MODEL_BIN_FILE) - if torch.cuda.is_available(): - torch.manual_seed(self.__voc_config.seed) - self.__device = torch.device('cuda') - else: - self.__device = torch.device('cpu') self.__generator = Generator(self.__voc_config).to(self.__device) state_dict_g = load_checkpoint(self.__voc_ckpt_path, self.__device) self.__generator.load_state_dict(state_dict_g['generator']) self.__generator.eval() self.__generator.remove_weight_norm() - def __am_forward(self, - text, - pitch_control_str='', - duration_control_str='', - energy_control_str=''): - duration_cfg_lst = [] - if len(duration_control_str) != 0: - for item in duration_control_str.strip().split('|'): - percent, scale = item.lstrip('(').rstrip(')').split(',') - duration_cfg_lst.append((float(percent), float(scale))) - pitch_contours_cfg_lst = [] - if len(pitch_control_str) != 0: - for item in pitch_control_str.strip().split('|'): - percent, scale = item.lstrip('(').rstrip(')').split(',') - pitch_contours_cfg_lst.append((float(percent), float(scale))) - energy_contours_cfg_lst = [] - if len(energy_control_str) != 0: - for item in energy_control_str.strip().split('|'): - percent, scale = item.lstrip('(').rstrip(')').split(',') - energy_contours_cfg_lst.append((float(percent), float(scale))) - cleaner_names = [ - x.strip() for x in self.__am_hparams.cleaners.split(',') - ] - - lfeat_symbol = text.strip().split(' ') - lfeat_symbol_separate = [''] * int(len(self.__lfeat_type_list)) - for this_lfeat_symbol in lfeat_symbol: - this_lfeat_symbol = this_lfeat_symbol.strip('{').strip('}').split( - '$') - if len(this_lfeat_symbol) != len(self.__lfeat_type_list): - raise Exception( - 'Length of this_lfeat_symbol in training data' - + ' is not equal to the length of lfeat_type_list, ' - + str(len(this_lfeat_symbol)) + ' VS. 
' - + str(len(self.__lfeat_type_list))) - index = 0 - while index < len(lfeat_symbol_separate): - lfeat_symbol_separate[index] = lfeat_symbol_separate[ - index] + this_lfeat_symbol[index] + ' ' - index = index + 1 - - index = 0 - lfeat_type = self.__lfeat_type_list[index] - sequence = self.__symbols_dict.symbol_to_sequence( - lfeat_symbol_separate[index].strip(), lfeat_type, cleaner_names) - sequence_array = np.asarray( - sequence[:-1], - dtype=np.int32) # sequence length minus 1 to ignore EOS ~ - inputs = np.eye( - self.__inputs_dim[lfeat_type], dtype=np.float32)[sequence_array] - index = index + 1 - while index < len(self.__lfeat_type_list) - 2: - lfeat_type = self.__lfeat_type_list[index] - sequence = self.__symbols_dict.symbol_to_sequence( - lfeat_symbol_separate[index].strip(), lfeat_type, - cleaner_names) - sequence_array = np.asarray( - sequence[:-1], - dtype=np.int32) # sequence length minus 1 to ignore EOS ~ - inputs_temp = np.eye( - self.__inputs_dim[lfeat_type], - dtype=np.float32)[sequence_array] - inputs = np.concatenate((inputs, inputs_temp), axis=1) - index = index + 1 - seq = inputs - - lfeat_type = 'emo_category' - inputs_emotion = multi_label_symbol_to_sequence( - self.__emo_category, lfeat_symbol_separate[index].strip()) - # inputs_emotion = inputs_emotion * 1.5 - index = index + 1 - - lfeat_type = 'speaker' - inputs_speaker = multi_label_symbol_to_sequence( - self.__speaker, lfeat_symbol_separate[index].strip()) - - duration_scale = np.ones((len(seq), ), dtype=np.float32) - start_idx = 0 - for (percent, scale) in duration_cfg_lst: - duration_scale[start_idx:start_idx - + int(percent * len(seq))] = scale - start_idx += int(percent * len(seq)) - - pitch_contours_scale = np.ones((len(seq), ), dtype=np.float32) - start_idx = 0 - for (percent, scale) in pitch_contours_cfg_lst: - pitch_contours_scale[start_idx:start_idx - + int(percent * len(seq))] = scale - start_idx += int(percent * len(seq)) - - energy_contours_scale = np.ones((len(seq), ), dtype=np.float32) - start_idx = 0 - for (percent, scale) in energy_contours_cfg_lst: - energy_contours_scale[start_idx:start_idx - + int(percent * len(seq))] = scale - start_idx += int(percent * len(seq)) - - feed_dict = { - self.__model.inputs: [np.asarray(seq, dtype=np.float32)], - self.__model.inputs_emotion: - [np.asarray(inputs_emotion, dtype=np.float32)], - self.__model.inputs_speaker: - [np.asarray(inputs_speaker, dtype=np.float32)], - self.__model.input_lengths: - np.asarray([len(seq)], dtype=np.int32), - self.__model.duration_scales: [duration_scale], - self.__model.pitch_scales: [pitch_contours_scale], - self.__model.energy_scales: [energy_contours_scale] - } - - result = self.__session.run([ - self.__mel_spec, self.__duration_outputs, self.__duration_outputs_, - self.__pitch_contour_outputs, self.__embedded_inputs_emotion, - self.__embedding_fsmn_outputs, self.__encoder_outputs, - self.__pitch_embeddings, self.__LR_outputs, - self.__postnet_fsmn_outputs, self.__energy_contour_outputs, - self.__energy_embeddings, self.__attention_x, self.__attention_h - ], feed_dict=feed_dict) # yapf:disable - return result[0] + def __am_forward(self, symbol_seq): + with torch.no_grad(): + inputs_feat_lst = self.__ling_unit.encode_symbol_sequence( + symbol_seq) + inputs_sy = torch.from_numpy(inputs_feat_lst[0]).long().to( + self.__device) + inputs_tone = torch.from_numpy(inputs_feat_lst[1]).long().to( + self.__device) + inputs_syllable = torch.from_numpy(inputs_feat_lst[2]).long().to( + self.__device) + inputs_ws = 
torch.from_numpy(inputs_feat_lst[3]).long().to( + self.__device) + inputs_ling = torch.stack( + [inputs_sy, inputs_tone, inputs_syllable, inputs_ws], + dim=-1).unsqueeze(0) + inputs_emo = torch.from_numpy(inputs_feat_lst[4]).long().to( + self.__device).unsqueeze(0) + inputs_spk = torch.from_numpy(inputs_feat_lst[5]).long().to( + self.__device).unsqueeze(0) + inputs_len = torch.zeros(1).to(self.__device).long( + ) + inputs_emo.size(1) - 1 # minus 1 for "~" + res = self.__am_net(inputs_ling[:, :-1, :], inputs_emo[:, :-1], + inputs_spk[:, :-1], inputs_len) + postnet_outputs = res['postnet_outputs'] + LR_length_rounded = res['LR_length_rounded'] + valid_length = int(LR_length_rounded[0].item()) + postnet_outputs = postnet_outputs[ + 0, :valid_length, :].cpu().numpy() + return postnet_outputs def __vocoder_forward(self, melspec): dim0 = list(melspec.shape)[-1] if dim0 != self.__voc_config.num_mels: raise TtsVocoderMelspecShapeMismatchException( - 'input melspec mismatch require {} but {}'.format( - self.__voc_config.num_mels, dim0)) + 'modelscope error: input melspec mismatch require {} but {}'. + format(self.__voc_config.num_mels, dim0)) with torch.no_grad(): x = melspec.T x = torch.FloatTensor(x).to(self.__device) @@ -292,9 +117,15 @@ class Voice: audio = audio.cpu().numpy().astype('int16') return audio - def forward(self, text): + def forward(self, symbol_seq): if not self.__model_loaded: + torch.manual_seed(self.__am_config.seed) + if torch.cuda.is_available(): + torch.manual_seed(self.__am_config.seed) + self.__device = torch.device('cuda') + else: + self.__device = torch.device('cpu') self.__load_am() self.__load_vocoder() self.__model_loaded = True - return self.__vocoder_forward(self.__am_forward(text)) + return self.__vocoder_forward(self.__am_forward(symbol_seq)) diff --git a/modelscope/pipelines/audio/text_to_speech_pipeline.py b/modelscope/pipelines/audio/text_to_speech_pipeline.py index f9e7d80a..2063da68 100644 --- a/modelscope/pipelines/audio/text_to_speech_pipeline.py +++ b/modelscope/pipelines/audio/text_to_speech_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import Any, Dict, List import numpy as np @@ -42,3 +44,6 @@ class TextToSpeechSambertHifiganPipeline(Pipeline): def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]: return inputs + + def _sanitize_parameters(self, **pipeline_parameters): + return {}, pipeline_parameters, {} diff --git a/modelscope/utils/audio/tts_exceptions.py b/modelscope/utils/audio/tts_exceptions.py index 8c73b603..43ec994b 100644 --- a/modelscope/utils/audio/tts_exceptions.py +++ b/modelscope/utils/audio/tts_exceptions.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. """ Define TTS exceptions """ @@ -10,7 +11,7 @@ class TtsException(Exception): pass -class TtsModelConfigurationExcetion(TtsException): +class TtsModelConfigurationException(TtsException): """ TTS model configuration exceptions. """ diff --git a/requirements/audio.txt b/requirements/audio.txt index 5e4bc104..d22ad8f1 100644 --- a/requirements/audio.txt +++ b/requirements/audio.txt @@ -1,6 +1,5 @@ easyasr>=0.0.2 espnet>=202204 -#tts h5py inflect keras @@ -15,11 +14,7 @@ nltk numpy<=1.18 # protobuf version beyond 3.20.0 is not compatible with TensorFlow 1.x, therefore is discouraged. 
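For context on the refactored synthesis path above: the acoustic model now returns a [T, num_mels] mel spectrogram (postnet_outputs), and Voice.__vocoder_forward transposes it, runs the HiFi-GAN Generator, and rescales the waveform to 16-bit PCM. The following is a minimal sketch of that mel-to-PCM step, not part of the patch; it assumes a loaded generator that follows the usual HiFi-GAN inference convention, and the helper name mel_to_pcm is invented here.

import torch

MAX_WAV_VALUE = 32768.0  # 16-bit full scale, as defined in voice.py

def mel_to_pcm(generator, melspec, device):
    # melspec: numpy array of shape [T, num_mels] produced by the acoustic model
    with torch.no_grad():
        x = torch.FloatTensor(melspec.T).unsqueeze(0).to(device)  # [1, num_mels, T]
        wav = generator(x).squeeze()       # assumed generator output shape [1, 1, samples]
        pcm = wav * MAX_WAV_VALUE          # rescale from roughly [-1, 1] to int16 range
        return pcm.cpu().numpy().astype('int16')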
protobuf>3,<3.21.0 -ptflops py_sound_connect -pytorch_wavelets -PyWavelets>=1.0.0 -scikit-learn SoundFile>0.10 sox torchaudio diff --git a/tests/pipelines/test_text_to_speech.py b/tests/pipelines/test_text_to_speech.py index e82cf43e..f659e59b 100644 --- a/tests/pipelines/test_text_to_speech.py +++ b/tests/pipelines/test_text_to_speech.py @@ -9,6 +9,7 @@ import unittest import torch from scipy.io.wavfile import write +from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks @@ -33,7 +34,9 @@ class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase, text = '今天北京天气怎么样?' voice = 'zhitian_emo' - sambert_hifigan_tts = pipeline(task=self.task, model=self.model_id) + model = Model.from_pretrained( + model_name_or_path=self.model_id, revision='pytorch_am') + sambert_hifigan_tts = pipeline(task=self.task, model=model) self.assertTrue(sambert_hifigan_tts is not None) output = sambert_hifigan_tts(input=text, voice=voice) self.assertIsNotNone(output[OutputKeys.OUTPUT_PCM]) From b98114367bb8f3e383cb101d329cc85481264ee3 Mon Sep 17 00:00:00 2001 From: "shuying.shu" Date: Tue, 27 Sep 2022 22:15:24 +0800 Subject: [PATCH 12/23] [to #42322933]add timestamp for movie scene segmentation output Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10269467 * add timestamp for movie scene segmentation output --- .../models/audio/tts/models/datasets/__init__.py | 0 .../cv/movie_scene_segmentation/utils/save_op.py | 12 ++++++++---- modelscope/outputs.py | 3 ++- 3 files changed, 10 insertions(+), 5 deletions(-) mode change 100644 => 100755 modelscope/models/audio/tts/models/datasets/__init__.py diff --git a/modelscope/models/audio/tts/models/datasets/__init__.py b/modelscope/models/audio/tts/models/datasets/__init__.py old mode 100644 new mode 100755 diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py index 6361c056..b350ff13 100644 --- a/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py +++ b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py @@ -26,7 +26,8 @@ def pred2scene(shot2keyf, anno_dict): for scene_ind, scene_item in enumerate(scene_list): scene_dict_lst.append({ 'shot': pair_list[scene_ind], - 'frame': scene_item + 'frame': scene_item[0], + 'timestamp': scene_item[1] }) return scene_dict_lst, scene_list @@ -42,8 +43,8 @@ def scene2video(source_movie_fn, scene_list, thres): for scene_ind, scene_item in tqdm(enumerate(scene_list)): scene = str(scene_ind).zfill(4) - start_frame = int(scene_item[0]) - end_frame = int(scene_item[1]) + start_frame = int(scene_item[0][0]) + end_frame = int(scene_item[0][1]) start_time, end_time = start_frame / fps, end_frame / fps duration_time = end_time - start_time out_video_fn = os.path.join(out_video_dir_fn, @@ -71,7 +72,10 @@ def get_demo_scene_list(shot2keyf, anno_dict): start_shot, end_shot = int(pair[0]), int(pair[-1]) start_frame = shot2keyf[start_shot].split(' ')[0] end_frame = shot2keyf[end_shot].split(' ')[1] - scene_list.append((start_frame, end_frame)) + start_timestamp = shot2keyf[start_shot].split(' ')[-2] + end_timestamp = shot2keyf[end_shot].split(' ')[-1] + scene_list.append([[start_frame, end_frame], + [start_timestamp, end_timestamp]]) return scene_list, pair_list diff --git a/modelscope/outputs.py b/modelscope/outputs.py index b19f7e43..d80ba9c5 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -303,7 
+303,8 @@ TASK_OUTPUTS = { # [ # { # "shot": [0,1,2], - # "frame": [start_frame, end_frame] + # "frame": [start_frame, end_frame], + # "timestamp": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245'] # } # ] # From 939a9f232242684dc86f463ac294c14beaa99f3e Mon Sep 17 00:00:00 2001 From: "wendi.hwd" Date: Tue, 27 Sep 2022 22:17:41 +0800 Subject: [PATCH 13/23] [to #42322933]fix commits Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10272768 --- modelscope/outputs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/outputs.py b/modelscope/outputs.py index d80ba9c5..92e3410b 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -153,7 +153,7 @@ TASK_OUTPUTS = { # semantic segmentation result for single sample # { - # "masks": [np.array # 2D array containing only 0, 255] + # "masks": [np.array # 2D array with shape [height, width]] # } Tasks.semantic_segmentation: [OutputKeys.MASKS], From 744c84c89302728d0d6bfaca411d00abdee5b310 Mon Sep 17 00:00:00 2001 From: "lanjinpeng.ljp" Date: Tue, 27 Sep 2022 22:19:14 +0800 Subject: [PATCH 14/23] output timestamps for video-single-object-tracking demo service MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 830 release requires the video-single-object-tracking demo to output timestamps information. Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10278969 --- .../cv/video_single_object_tracking/utils/utils.py | 7 +++++++ modelscope/outputs.py | 6 ++++-- .../cv/video_single_object_tracking_pipeline.py | 11 +++++++++-- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/modelscope/models/cv/video_single_object_tracking/utils/utils.py b/modelscope/models/cv/video_single_object_tracking/utils/utils.py index 752ec272..90513a2a 100644 --- a/modelscope/models/cv/video_single_object_tracking/utils/utils.py +++ b/modelscope/models/cv/video_single_object_tracking/utils/utils.py @@ -238,3 +238,10 @@ def check_box(box: list, image_height, image_width) -> bool: if box[3] < 0 or box[3] >= image_height: return False return True + + +def timestamp_format(seconds): + m, s = divmod(seconds, 60) + h, m = divmod(m, 60) + time = '%02d:%02d:%06.3f' % (h, m, s) + return time diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 92e3410b..b96f38d3 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -247,9 +247,11 @@ TASK_OUTPUTS = { # [x1, y1, x2, y2], # [x1, y1, x2, y2], # [x1, y1, x2, y2], - # ] + # ], + # "timestamps": ["hh:mm:ss", "hh:mm:ss", "hh:mm:ss"] # } - Tasks.video_single_object_tracking: [OutputKeys.BOXES], + Tasks.video_single_object_tracking: + [OutputKeys.BOXES, OutputKeys.TIMESTAMPS], # live category recognition result for single video # { diff --git a/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py b/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py index c47fc15f..4169def7 100644 --- a/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py +++ b/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py @@ -9,8 +9,8 @@ from modelscope.models.cv.video_single_object_tracking.config.ostrack import \ cfg from modelscope.models.cv.video_single_object_tracking.tracker.ostrack import \ OSTrack -from modelscope.models.cv.video_single_object_tracking.utils.utils import \ - check_box +from modelscope.models.cv.video_single_object_tracking.utils.utils import ( + check_box, timestamp_format) from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Pipeline from
modelscope.pipelines.builder import PIPELINES @@ -45,7 +45,10 @@ class VideoSingleObjectTrackingPipeline(Pipeline): def forward(self, input: Input) -> Dict[str, Any]: output_boxes = [] + output_timestamps = [] cap = cv2.VideoCapture(self.video_path) + fps = cap.get(cv2.CAP_PROP_FPS) + frame_idx = 0 success, frame = cap.read() if success is False: raise Exception( @@ -58,6 +61,7 @@ class VideoSingleObjectTrackingPipeline(Pipeline): raise Exception('modelscope error: init_box out of image range ', init_box) output_boxes.append(init_box.copy()) + output_timestamps.append(timestamp_format(seconds=frame_idx / fps)) init_box[2] = init_box[2] - init_box[0] init_box[3] = init_box[3] - init_box[1] self.tracker.initialize(frame, {'init_bbox': init_box}) @@ -67,14 +71,17 @@ class VideoSingleObjectTrackingPipeline(Pipeline): ret, frame = cap.read() if frame is None: break + frame_idx += 1 out = self.tracker.track(frame) state = [int(s) for s in out['target_bbox']] output_boxes.append(state) + output_timestamps.append(timestamp_format(seconds=frame_idx / fps)) cap.release() logger.info('tracking process done') return { OutputKeys.BOXES: output_boxes, + OutputKeys.TIMESTAMPS: output_timestamps } def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: From 357a233ee32bbaec7eaef58f383d86219b3f9cd3 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 27 Sep 2022 23:03:00 +0800 Subject: [PATCH 15/23] [to #42322933] fix bug: checkpoint hook and bestckpthook exists at the same time Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10227608 --- modelscope/trainers/default_config.py | 19 +++++++++++++++++++ modelscope/trainers/trainer.py | 7 ++----- tests/trainers/hooks/test_checkpoint_hook.py | 3 --- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/modelscope/trainers/default_config.py b/modelscope/trainers/default_config.py index 69fdd400..c8f0c7b0 100644 --- a/modelscope/trainers/default_config.py +++ b/modelscope/trainers/default_config.py @@ -1,4 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. + +from modelscope.utils.config import Config + DEFAULT_CONFIG = { 'train': { 'hooks': [{ @@ -12,3 +15,19 @@ DEFAULT_CONFIG = { }] } } + + +def merge_cfg(cfg: Config): + """Merge the default config into the input cfg. + + This function will pop the default CheckpointHook when the BestCkptSaverHook exists in the input cfg. + + @param cfg: The input cfg to be merged into. 
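To make the behaviour of merge_cfg above concrete, here is a hedged illustration, not part of the patch; the hook entries below are invented and much smaller than a real trainer configuration.

from modelscope.trainers.default_config import merge_cfg
from modelscope.utils.config import Config

cfg = Config(
    dict(train=dict(hooks=[{'type': 'CheckpointHook', 'interval': 1},
                           {'type': 'BestCkptSaverHook', 'metric_key': 'accuracy'}])))
merge_cfg(cfg)
# Because a BestCkptSaverHook is present, every plain CheckpointHook entry is filtered
# out of cfg.train.hooks, so checkpoints are no longer saved by two hooks at once.
assert all(hook['type'] != 'CheckpointHook' for hook in cfg.train.hooks)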
+ """ + cfg.merge_from_dict(DEFAULT_CONFIG, force=False) + # pop duplicate hook + + if any(['BestCkptSaverHook' == hook['type'] for hook in cfg.train.hooks]): + cfg.train.hooks = list( + filter(lambda hook: hook['type'] != 'CheckpointHook', + cfg.train.hooks)) diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index d3675720..a01d9b59 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -41,7 +41,7 @@ from modelscope.utils.torch_utils import (get_dist_info, get_local_rank, init_dist, set_random_seed) from .base import BaseTrainer from .builder import TRAINERS -from .default_config import DEFAULT_CONFIG +from .default_config import merge_cfg from .hooks.hook import Hook from .parallel.builder import build_parallel from .parallel.utils import is_parallel @@ -114,7 +114,7 @@ class EpochBasedTrainer(BaseTrainer): super().__init__(cfg_file, arg_parse_fn) # add default config - self.cfg.merge_from_dict(self._get_default_config(), force=False) + merge_cfg(self.cfg) self.cfg = self.rebuild_config(self.cfg) if 'cfg_options' in kwargs: @@ -951,9 +951,6 @@ class EpochBasedTrainer(BaseTrainer): stage_hook_infos.append(info) return '\n'.join(stage_hook_infos) - def _get_default_config(self): - return DEFAULT_CONFIG - def worker_init_fn(worker_id, num_workers, rank, seed): # The seed of each worker equals to diff --git a/tests/trainers/hooks/test_checkpoint_hook.py b/tests/trainers/hooks/test_checkpoint_hook.py index c694ece6..e7f2d33c 100644 --- a/tests/trainers/hooks/test_checkpoint_hook.py +++ b/tests/trainers/hooks/test_checkpoint_hook.py @@ -204,9 +204,6 @@ class BestCkptSaverHookTest(unittest.TestCase): trainer = build_trainer(trainer_name, kwargs) trainer.train() results_files = os.listdir(self.tmp_dir) - self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files) - self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files) - self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files) self.assertIn(f'best_{LogKeys.EPOCH}1_{MetricKeys.ACCURACY}0.1.pth', results_files) From 372adb3936939c0079924cd8a761e525b4fbd77f Mon Sep 17 00:00:00 2001 From: "tingwei.gtw" Date: Tue, 27 Sep 2022 23:04:38 +0800 Subject: [PATCH 16/23] [to #42322933] support hand-static model Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10244616 --- data/test/images/hand_static.jpg | 3 + modelscope/metainfo.py | 2 + modelscope/models/cv/hand_static/__init__.py | 20 + .../models/cv/hand_static/hand_model.py | 93 +++++ modelscope/models/cv/hand_static/networks.py | 358 ++++++++++++++++++ modelscope/outputs.py | 6 +- modelscope/pipelines/builder.py | 2 + modelscope/pipelines/cv/__init__.py | 4 +- .../pipelines/cv/hand_static_pipeline.py | 37 ++ modelscope/utils/constant.py | 1 + tests/pipelines/test_hand_static.py | 32 ++ 11 files changed, 556 insertions(+), 2 deletions(-) create mode 100644 data/test/images/hand_static.jpg create mode 100644 modelscope/models/cv/hand_static/__init__.py create mode 100644 modelscope/models/cv/hand_static/hand_model.py create mode 100644 modelscope/models/cv/hand_static/networks.py create mode 100644 modelscope/pipelines/cv/hand_static_pipeline.py create mode 100644 tests/pipelines/test_hand_static.py diff --git a/data/test/images/hand_static.jpg b/data/test/images/hand_static.jpg new file mode 100644 index 00000000..43ae28b1 --- /dev/null +++ b/data/test/images/hand_static.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94b8e281d77ee6d3ea2a8a0c9408ecdbd29fe75f33ea5399b6ea00070ba77bd6 +size 13090 diff --git 
a/modelscope/metainfo.py b/modelscope/metainfo.py index 29a35fbe..5870ebe3 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -39,6 +39,7 @@ class Models(object): mtcnn = 'mtcnn' ulfd = 'ulfd' video_inpainting = 'video-inpainting' + hand_static = 'hand-static' # EasyCV models yolox = 'YOLOX' @@ -173,6 +174,7 @@ class Pipelines(object): movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation' shop_segmentation = 'shop-segmentation' video_inpainting = 'video-inpainting' + hand_static = 'hand-static' # nlp tasks sentence_similarity = 'sentence-similarity' diff --git a/modelscope/models/cv/hand_static/__init__.py b/modelscope/models/cv/hand_static/__init__.py new file mode 100644 index 00000000..654d2acb --- /dev/null +++ b/modelscope/models/cv/hand_static/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .hand_model import HandStatic + +else: + _import_structure = {'hand_model': ['HandStatic']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/hand_static/hand_model.py b/modelscope/models/cv/hand_static/hand_model.py new file mode 100644 index 00000000..38517307 --- /dev/null +++ b/modelscope/models/cv/hand_static/hand_model.py @@ -0,0 +1,93 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. +import os +import sys + +import cv2 +import numpy as np +import torch +import torch.nn.functional as F +from PIL import Image +from torch import nn +from torchvision.transforms import transforms + +from modelscope.metainfo import Models +from modelscope.models.base import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger +from .networks import StaticGestureNet + +logger = get_logger() + +map_idx = { + 0: 'unrecog', + 1: 'one', + 2: 'two', + 3: 'bixin', + 4: 'yaogun', + 5: 'zan', + 6: 'fist', + 7: 'ok', + 8: 'tuoju', + 9: 'd_bixin', + 10: 'd_fist_left', + 11: 'd_fist_right', + 12: 'd_hand', + 13: 'fashe', + 14: 'five', + 15: 'nohand' +} + +img_size = [112, 112] + +spatial_transform = transforms.Compose([ + transforms.Resize(img_size), + transforms.ToTensor(), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) +]) + + +@MODELS.register_module(Tasks.hand_static, module_name=Models.hand_static) +class HandStatic(TorchModel): + + def __init__(self, model_dir, device_id=0, *args, **kwargs): + + super().__init__( + model_dir=model_dir, device_id=device_id, *args, **kwargs) + + self.model = StaticGestureNet() + if torch.cuda.is_available(): + self.device = 'cuda' + else: + self.device = 'cpu' + self.params = torch.load( + '{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE), + map_location=self.device) + + self.model.load_state_dict(self.params) + self.model.to(self.device) + self.model.eval() + self.device_id = device_id + if self.device_id >= 0 and self.device == 'cuda': + self.model.to('cuda:{}'.format(self.device_id)) + logger.info('Use GPU: {}'.format(self.device_id)) + else: + self.device_id = -1 + logger.info('Use CPU for inference') + + def forward(self, x): + pred_result = self.model(x) + return pred_result + + +def infer(img_path, model, device): + + img = 
Image.open(img_path) + clip = spatial_transform(img) + clip = clip.unsqueeze(0).to(device).float() + outputs = model(clip) + predicted = int(outputs.max(1)[1]) + pred_result = map_idx.get(predicted) + logger.info('pred result: {}'.format(pred_result)) + + return pred_result diff --git a/modelscope/models/cv/hand_static/networks.py b/modelscope/models/cv/hand_static/networks.py new file mode 100644 index 00000000..6cf46f5d --- /dev/null +++ b/modelscope/models/cv/hand_static/networks.py @@ -0,0 +1,358 @@ +""" HandStatic +The implementation here is modified based on MobileFaceNet, +originally Apache 2.0 License and publicly avaialbe at https://github.com/xuexingyu24/MobileFaceNet_Tutorial_Pytorch +""" + +import os + +import torch +import torch.nn as nn +import torchvision +import torchvision.models as models +from torch.nn import (AdaptiveAvgPool2d, BatchNorm1d, BatchNorm2d, Conv2d, + Dropout, Linear, MaxPool2d, Module, PReLU, ReLU, + Sequential, Sigmoid) + + +class StaticGestureNet(torch.nn.Module): + + def __init__(self, train=True): + super().__init__() + + model = MobileFaceNet(512) + self.feature_extractor = model + self.fc_layer = torch.nn.Sequential( + nn.Linear(512, 128), nn.Softplus(), nn.Linear(128, 15)) + self.sigmoid = nn.Sigmoid() + + def forward(self, inputs): + out = self.feature_extractor(inputs) + out = self.fc_layer(out) + out = self.sigmoid(out) + return out + + +class Flatten(Module): + + def forward(self, input): + return input.view(input.size(0), -1) + + +def l2_norm(input, axis=1): + norm = torch.norm(input, 2, axis, True) + output = torch.div(input, norm) + return output + + +class SEModule(Module): + + def __init__(self, channels, reduction): + super(SEModule, self).__init__() + self.avg_pool = AdaptiveAvgPool2d(1) + self.fc1 = Conv2d( + channels, + channels // reduction, + kernel_size=1, + padding=0, + bias=False) + self.relu = ReLU(inplace=True) + self.fc2 = Conv2d( + channels // reduction, + channels, + kernel_size=1, + padding=0, + bias=False) + self.sigmoid = Sigmoid() + + def forward(self, x): + module_input = x + x = self.avg_pool(x) + x = self.fc1(x) + x = self.relu(x) + x = self.fc2(x) + x = self.sigmoid(x) + return module_input * x + + +class BottleneckIR(Module): + + def __init__(self, in_channel, depth, stride): + super(BottleneckIR, self).__init__() + if in_channel == depth: + self.shortcut_layer = MaxPool2d(1, stride) + else: + self.shortcut_layer = Sequential( + Conv2d(in_channel, depth, (1, 1), stride, bias=False), + BatchNorm2d(depth)) + self.res_layer = Sequential( + BatchNorm2d(in_channel), + Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False), + PReLU(depth), Conv2d(depth, depth, (3, 3), stride, 1, bias=False), + BatchNorm2d(depth)) + + def forward(self, x): + shortcut = self.shortcut_layer(x) + res = self.res_layer(x) + return res + shortcut + + +class BottleneckIRSE(Module): + + def __init__(self, in_channel, depth, stride): + super(BottleneckIRSE, self).__init__() + if in_channel == depth: + self.shortcut_layer = MaxPool2d(1, stride) + else: + self.shortcut_layer = Sequential( + Conv2d(in_channel, depth, (1, 1), stride, bias=False), + BatchNorm2d(depth)) + self.res_layer = Sequential( + BatchNorm2d(in_channel), + Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False), + PReLU(depth), Conv2d(depth, depth, (3, 3), stride, 1, bias=False), + BatchNorm2d(depth), SEModule(depth, 16)) + + def forward(self, x): + shortcut = self.shortcut_layer(x) + res = self.res_layer(x) + return res + shortcut + + +def get_block(in_channel, depth, num_units, 
stride=2): + return [Bottleneck(in_channel, depth, stride) + ] + [Bottleneck(depth, depth, 1) for i in range(num_units - 1)] + + +def get_blocks(num_layers): + if num_layers == 50: + blocks = [ + get_block(in_channel=64, depth=64, num_units=3), + get_block(in_channel=64, depth=128, num_units=4), + get_block(in_channel=128, depth=256, num_units=14), + get_block(in_channel=256, depth=512, num_units=3) + ] + elif num_layers == 100: + blocks = [ + get_block(in_channel=64, depth=64, num_units=3), + get_block(in_channel=64, depth=128, num_units=13), + get_block(in_channel=128, depth=256, num_units=30), + get_block(in_channel=256, depth=512, num_units=3) + ] + elif num_layers == 152: + blocks = [ + get_block(in_channel=64, depth=64, num_units=3), + get_block(in_channel=64, depth=128, num_units=8), + get_block(in_channel=128, depth=256, num_units=36), + get_block(in_channel=256, depth=512, num_units=3) + ] + return blocks + + +class Backbone(Module): + + def __init__(self, num_layers, drop_ratio, mode='ir'): + super(Backbone, self).__init__() + assert num_layers in [50, 100, + 152], 'num_layers should be 50,100, or 152' + assert mode in ['ir', 'ir_se'], 'mode should be ir or ir_se' + blocks = get_blocks(num_layers) + if mode == 'ir': + unit_module = BottleneckIR + elif mode == 'ir_se': + unit_module = BottleneckIRSE + self.input_layer = Sequential( + Conv2d(3, 64, (3, 3), 1, 1, bias=False), BatchNorm2d(64), + PReLU(64)) + self.output_layer = Sequential( + BatchNorm2d(512), Dropout(drop_ratio), Flatten(), + Linear(512 * 7 * 7, 512), BatchNorm1d(512)) + modules = [] + for block in blocks: + for bottleneck in block: + modules.append( + unit_module(bottleneck.in_channel, bottleneck.depth, + bottleneck.stride)) + self.body = Sequential(*modules) + + def forward(self, x): + x = self.input_layer(x) + x = self.body(x) + x = self.output_layer(x) + return l2_norm(x) + + +class ConvBlock(Module): + + def __init__(self, + in_c, + out_c, + kernel=(1, 1), + stride=(1, 1), + padding=(0, 0), + groups=1): + super(ConvBlock, self).__init__() + self.conv = Conv2d( + in_c, + out_channels=out_c, + kernel_size=kernel, + groups=groups, + stride=stride, + padding=padding, + bias=False) + self.bn = BatchNorm2d(out_c) + self.prelu = PReLU(out_c) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.prelu(x) + return x + + +class LinearBlock(Module): + + def __init__(self, + in_c, + out_c, + kernel=(1, 1), + stride=(1, 1), + padding=(0, 0), + groups=1): + super(LinearBlock, self).__init__() + self.conv = Conv2d( + in_c, + out_channels=out_c, + kernel_size=kernel, + groups=groups, + stride=stride, + padding=padding, + bias=False) + self.bn = BatchNorm2d(out_c) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class DepthWise(Module): + + def __init__(self, + in_c, + out_c, + residual=False, + kernel=(3, 3), + stride=(2, 2), + padding=(1, 1), + groups=1): + super(DepthWise, self).__init__() + self.conv = ConvBlock( + in_c, out_c=groups, kernel=(1, 1), padding=(0, 0), stride=(1, 1)) + self.conv_dw = ConvBlock( + groups, + groups, + groups=groups, + kernel=kernel, + padding=padding, + stride=stride) + self.project = LinearBlock( + groups, out_c, kernel=(1, 1), padding=(0, 0), stride=(1, 1)) + self.residual = residual + + def forward(self, x): + if self.residual: + short_cut = x + x = self.conv(x) + x = self.conv_dw(x) + x = self.project(x) + if self.residual: + output = short_cut + x + else: + output = x + return output + + +class Residual(Module): + + def __init__(self, + c, + 
num_block, + groups, + kernel=(3, 3), + stride=(1, 1), + padding=(1, 1)): + super(Residual, self).__init__() + modules = [] + for _ in range(num_block): + modules.append( + DepthWise( + c, + c, + residual=True, + kernel=kernel, + padding=padding, + stride=stride, + groups=groups)) + self.model = Sequential(*modules) + + def forward(self, x): + return self.model(x) + + +class MobileFaceNet(Module): + + def __init__(self, embedding_size): + super(MobileFaceNet, self).__init__() + self.conv1 = ConvBlock( + 3, 64, kernel=(3, 3), stride=(2, 2), padding=(1, 1)) + self.conv2_dw = ConvBlock( + 64, 64, kernel=(3, 3), stride=(1, 1), padding=(1, 1), groups=64) + self.conv_23 = DepthWise( + 64, 64, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=128) + self.conv_3 = Residual( + 64, + num_block=4, + groups=128, + kernel=(3, 3), + stride=(1, 1), + padding=(1, 1)) + self.conv_34 = DepthWise( + 64, 128, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=256) + self.conv_4 = Residual( + 128, + num_block=6, + groups=256, + kernel=(3, 3), + stride=(1, 1), + padding=(1, 1)) + self.conv_45 = DepthWise( + 128, 128, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=512) + self.conv_5 = Residual( + 128, + num_block=2, + groups=256, + kernel=(3, 3), + stride=(1, 1), + padding=(1, 1)) + self.conv_6_sep = ConvBlock( + 128, 512, kernel=(1, 1), stride=(1, 1), padding=(0, 0)) + self.conv_6_dw = LinearBlock( + 512, 512, groups=512, kernel=(7, 7), stride=(1, 1), padding=(0, 0)) + self.conv_6_flatten = Flatten() + self.linear = Linear(512, embedding_size, bias=False) + self.bn = BatchNorm1d(embedding_size) + + def forward(self, x): + out = self.conv1(x) + out = self.conv2_dw(out) + out = self.conv_23(out) + out = self.conv_3(out) + out = self.conv_34(out) + out = self.conv_4(out) + out = self.conv_45(out) + out = self.conv_5(out) + out = self.conv_6_sep(out) + out = self.conv_6_dw(out) + out = self.conv_6_flatten(out) + out = self.linear(out) + return l2_norm(out) diff --git a/modelscope/outputs.py b/modelscope/outputs.py index b96f38d3..ce9e8d07 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -632,5 +632,9 @@ TASK_OUTPUTS = { # { # 'output': ['Done' / 'Decode_Error'] # } - Tasks.video_inpainting: [OutputKeys.OUTPUT] + Tasks.video_inpainting: [OutputKeys.OUTPUT], + # { + # 'output': ['bixin'] + # } + Tasks.hand_static: [OutputKeys.OUTPUT] } diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 5e244b27..51d50d51 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -178,6 +178,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_vitb16_segmentation_shop-seg'), Tasks.video_inpainting: (Pipelines.video_inpainting, 'damo/cv_video-inpainting'), + Tasks.hand_static: (Pipelines.hand_static, + 'damo/cv_mobileface_hand-static'), } diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index a9dc05f2..55bad09a 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -52,7 +52,8 @@ if TYPE_CHECKING: from .ulfd_face_detection_pipeline import UlfdFaceDetectionPipeline from .retina_face_detection_pipeline import RetinaFaceDetectionPipeline from .facial_expression_recognition_pipeline import FacialExpressionRecognitionPipeline - from .mtcnn_face_detection_pipeline import MtcnnFaceDetectionPipeline + from .mtcnn_face_detection_pipeline import MtcnnFaceDetectionPipelin + from .hand_static_pipeline import HandStaticPipeline else: _import_structure = { @@ -119,6 +120,7 @@ else: 
'facial_expression_recognition_pipelin': ['FacialExpressionRecognitionPipeline'], 'mtcnn_face_detection_pipeline': ['MtcnnFaceDetectionPipeline'], + 'hand_static_pipeline': ['HandStaticPipeline'], } import sys diff --git a/modelscope/pipelines/cv/hand_static_pipeline.py b/modelscope/pipelines/cv/hand_static_pipeline.py new file mode 100644 index 00000000..1219c873 --- /dev/null +++ b/modelscope/pipelines/cv/hand_static_pipeline.py @@ -0,0 +1,37 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. +from typing import Any, Dict + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.hand_static import hand_model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.hand_static, module_name=Pipelines.hand_static) +class HandStaticPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create hand static pipeline for prediction + Args: + model: model id on modelscope hub. + """ + + super().__init__(model=model, **kwargs) + logger.info('load model done') + + def preprocess(self, input: Input) -> Dict[str, Any]: + return input + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + result = hand_model.infer(input['img_path'], self.model, self.device) + return {OutputKeys.OUTPUT: result} + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index de3d933f..75add1d9 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -42,6 +42,7 @@ class CVTasks(object): portrait_matting = 'portrait-matting' text_driven_segmentation = 'text-driven-segmentation' shop_segmentation = 'shop-segmentation' + hand_static = 'hand-static' # image editing skin_retouching = 'skin-retouching' diff --git a/tests/pipelines/test_hand_static.py b/tests/pipelines/test_hand_static.py new file mode 100644 index 00000000..37181899 --- /dev/null +++ b/tests/pipelines/test_hand_static.py @@ -0,0 +1,32 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
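The hand-static classifier introduced in networks.py above wraps a MobileFaceNet feature extractor (512-dimensional embedding) with a small fully connected head and a sigmoid over 15 gesture classes; hand_model.py then maps the argmax index to a label such as 'bixin' via map_idx. A quick shape sanity check, offered as a sketch only and assuming the import path added by this patch:

import torch

from modelscope.models.cv.hand_static.networks import StaticGestureNet

net = StaticGestureNet().eval()
dummy = torch.randn(1, 3, 112, 112)   # matches the img_size used by spatial_transform above
with torch.no_grad():
    scores = net(dummy)               # sigmoid scores over the 15 gesture classes
print(scores.shape)                   # expected: torch.Size([1, 15])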
+import unittest + +from modelscope.pipelines import pipeline +from modelscope.pipelines.base import Pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class HandStaticTest(unittest.TestCase): + + def setUp(self) -> None: + self.model = 'damo/cv_mobileface_hand-static' + self.input = {'img_path': 'data/test/images/hand_static.jpg'} + + def pipeline_inference(self, pipeline: Pipeline, input: str): + result = pipeline(input) + print(result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + hand_static = pipeline(Tasks.hand_static, model=self.model) + self.pipeline_inference(hand_static, self.input) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_modelhub_default_model(self): + hand_static = pipeline(Tasks.hand_static) + self.pipeline_inference(hand_static, self.input) + + +if __name__ == '__main__': + unittest.main() From d721fabb343c9bfe8721464dee5d4dd30d634e26 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Tue, 27 Sep 2022 23:08:33 +0800 Subject: [PATCH 17/23] [to #42322933]bert with sequence classification / token classification/ fill mask refactor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Add support for the original BERT model (not the EasyNLP backbone-prefix variant). 2. Support the backbone + head form of BERT for sequence classification / fill mask / token classification. 3. Unify the pipelines of the sequence classification tasks into a single class. 4. Fill mask now also supports the backbone + head form. 5. Unify the preprocessors of the token classification sub-tasks (NER, word segmentation, part-of-speech) into TokenClassificationPreprocessor. 6. Unify the preprocessors of the sequence classification sub-tasks (single-sentence classification, sentence-pair classification) into SequenceClassificationPreprocessor. 7. Change where the registry assigns a class's group_key; previously, with multiple decorators, the group_key could be overwritten and obj_cls carried incorrect group_key information. 8. Based on the backbone + head form, adjust the cases where the group_key and the module shared the same name. For example, in modelscope/pipelines/nlp/sequence_classification_pipeline.py the original @PIPELINES.register_module( Tasks.sentiment_classification, module_name=Pipelines.sentiment_classification) becomes @PIPELINES.register_module( Tasks.text_classification, module_name=Pipelines.sentiment_classification); the corresponding configuration.json files are changed accordingly, which better reflects the relationship between a task and its pipelines (sub-tasks). 9.
Other related changes to support the features above. Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10041463 --- modelscope/metainfo.py | 11 +- modelscope/models/builder.py | 9 +- modelscope/models/nlp/__init__.py | 22 +- modelscope/models/nlp/backbones/bert.py | 7 + modelscope/models/nlp/bert/__init__.py | 60 + .../models/nlp/bert/configuration_bert.py | 162 ++ modelscope/models/nlp/bert/modeling_bert.py | 2040 +++++++++++++++++ .../nlp/bert_for_sequence_classification.py | 70 - modelscope/models/nlp/deberta_v2/__init__.py | 10 - modelscope/models/nlp/heads/fill_mask_head.py | 101 + .../models/nlp/heads/torch_pretrain_head.py | 2 +- modelscope/models/nlp/masked_language.py | 5 +- .../nlp/nncrf_for_named_entity_recognition.py | 9 +- .../models/nlp/sequence_classification.py | 83 +- modelscope/models/nlp/task_models/__init__.py | 4 + .../nlp/task_models/feature_extraction.py | 43 + .../models/nlp/task_models/fill_mask.py | 47 + .../nlp/task_models/information_extraction.py | 15 +- .../task_models/sequence_classification.py | 49 +- .../models/nlp/task_models/task_model.py | 29 +- .../nlp/task_models/token_classification.py | 15 +- modelscope/models/nlp/token_classification.py | 49 +- modelscope/outputs.py | 16 + modelscope/pipelines/builder.py | 7 +- modelscope/pipelines/nlp/__init__.py | 19 +- .../nlp/feature_extraction_pipeline.py | 82 + .../pipelines/nlp/fill_mask_pipeline.py | 9 +- .../nlp/information_extraction_pipeline.py | 2 +- .../nlp/named_entity_recognition_pipeline.py | 5 +- .../pair_sentence_classification_pipeline.py | 59 - .../nlp/sequence_classification_pipeline.py | 72 +- .../sequence_classification_pipeline_base.py | 62 - ...single_sentence_classification_pipeline.py | 56 - .../nlp/token_classification_pipeline.py | 2 +- modelscope/preprocessors/__init__.py | 48 +- modelscope/preprocessors/nlp/__init__.py | 45 +- modelscope/preprocessors/nlp/nlp_base.py | 575 ++--- modelscope/utils/constant.py | 1 + modelscope/utils/registry.py | 2 +- tests/msdatasets/test_ms_dataset.py | 3 +- tests/pipelines/test_deberta_tasks.py | 8 +- tests/pipelines/test_feature_extraction.py | 67 + tests/pipelines/test_fill_mask.py | 49 +- .../test_named_entity_recognition.py | 10 +- tests/pipelines/test_nli.py | 10 +- tests/pipelines/test_sentence_similarity.py | 10 +- .../test_sentiment_classification.py | 31 +- tests/pipelines/test_text_classification.py | 4 +- tests/preprocessors/test_nlp.py | 76 + tests/utils/test_ast.py | 12 +- 50 files changed, 3347 insertions(+), 837 deletions(-) create mode 100644 modelscope/models/nlp/backbones/bert.py create mode 100644 modelscope/models/nlp/bert/__init__.py create mode 100644 modelscope/models/nlp/bert/configuration_bert.py create mode 100755 modelscope/models/nlp/bert/modeling_bert.py delete mode 100644 modelscope/models/nlp/bert_for_sequence_classification.py create mode 100644 modelscope/models/nlp/heads/fill_mask_head.py create mode 100644 modelscope/models/nlp/task_models/feature_extraction.py create mode 100644 modelscope/models/nlp/task_models/fill_mask.py create mode 100644 modelscope/pipelines/nlp/feature_extraction_pipeline.py delete mode 100644 modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py delete mode 100644 modelscope/pipelines/nlp/sequence_classification_pipeline_base.py delete mode 100644 modelscope/pipelines/nlp/single_sentence_classification_pipeline.py create mode 100644 tests/pipelines/test_feature_extraction.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 5870ebe3..a1cf5e06 100644 --- a/modelscope/metainfo.py +++
b/modelscope/metainfo.py @@ -91,17 +91,22 @@ class TaskModels(object): text_classification = 'text-classification' token_classification = 'token-classification' information_extraction = 'information-extraction' + fill_mask = 'fill-mask' + feature_extraction = 'feature-extraction' class Heads(object): # nlp heads + + # text cls text_classification = 'text-classification' - # mlm + # fill mask + fill_mask = 'fill-mask' bert_mlm = 'bert-mlm' - # roberta mlm roberta_mlm = 'roberta-mlm' # token cls token_classification = 'token-classification' + # extraction information_extraction = 'information-extraction' @@ -203,6 +208,7 @@ class Pipelines(object): passage_ranking = 'passage-ranking' relation_extraction = 'relation-extraction' document_segmentation = 'document-segmentation' + feature_extraction = 'feature-extraction' # audio tasks sambert_hifigan_tts = 'sambert-hifigan-tts' @@ -306,6 +312,7 @@ class Preprocessors(object): table_question_answering_preprocessor = 'table-question-answering-preprocessor' re_tokenizer = 're-tokenizer' document_segmentation = 'document-segmentation' + feature_extraction = 'feature-extraction' # audio preprocessor linear_aec_fbank = 'linear-aec-fbank' diff --git a/modelscope/models/builder.py b/modelscope/models/builder.py index 33f111a8..7a8e28f4 100644 --- a/modelscope/models/builder.py +++ b/modelscope/models/builder.py @@ -37,13 +37,16 @@ def build_backbone(cfg: ConfigDict, cfg, BACKBONES, group_key=field, default_args=default_args) -def build_head(cfg: ConfigDict, default_args: dict = None): +def build_head(cfg: ConfigDict, + group_key: str = None, + default_args: dict = None): """ build head given config dict Args: cfg (:obj:`ConfigDict`): config dict for head object. default_args (dict, optional): Default initialization arguments. 
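The group_key parameter added to build_head above lets a head be looked up under an explicit task group instead of always falling back to cfg['type']. A hedged sketch of the new call style, using a toy head whose class name, group key and fields are all invented for illustration:

import torch.nn as nn

from modelscope.models.builder import HEADS, build_head

# Register a toy head under a hypothetical group key ('toy-task').
@HEADS.register_module('toy-task', module_name='toy-head')
class ToyHead(nn.Module):

    def __init__(self, in_dim=768, num_labels=2):
        super().__init__()
        self.linear = nn.Linear(in_dim, num_labels)

    def forward(self, x):
        return self.linear(x)

# With the new argument, the lookup group no longer has to equal cfg['type'].
head = build_head(dict(type='toy-head'), group_key='toy-task')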
""" - + if group_key is None: + group_key = cfg[TYPE_NAME] return build_from_cfg( - cfg, HEADS, group_key=cfg[TYPE_NAME], default_args=default_args) + cfg, HEADS, group_key=group_key, default_args=default_args) diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index 152a32dc..8ef96365 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -6,7 +6,6 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .backbones import SbertModel from .bart_for_text_error_correction import BartForTextErrorCorrection - from .bert_for_sequence_classification import BertForSequenceClassification from .bert_for_document_segmentation import BertForDocumentSegmentation from .csanmt_for_translation import CsanmtForTranslation from .heads import SequenceClassificationHead @@ -20,12 +19,15 @@ if TYPE_CHECKING: from .palm_v2 import PalmForTextGeneration from .sbert_for_faq_question_answering import SbertForFaqQuestionAnswering from .star_text_to_sql import StarForTextToSql - from .sequence_classification import VecoForSequenceClassification, SbertForSequenceClassification + from .sequence_classification import (VecoForSequenceClassification, + SbertForSequenceClassification, + BertForSequenceClassification) from .space import SpaceForDialogIntent from .space import SpaceForDialogModeling from .space import SpaceForDialogStateTracking from .table_question_answering import TableQuestionAnswering - from .task_models import (InformationExtractionModel, + from .task_models import (FeatureExtractionModel, + InformationExtractionModel, SequenceClassificationModel, SingleBackboneTaskModelBase, TokenClassificationModel) @@ -37,7 +39,6 @@ else: _import_structure = { 'backbones': ['SbertModel'], 'bart_for_text_error_correction': ['BartForTextErrorCorrection'], - 'bert_for_sequence_classification': ['BertForSequenceClassification'], 'bert_for_document_segmentation': ['BertForDocumentSegmentation'], 'csanmt_for_translation': ['CsanmtForTranslation'], 'heads': ['SequenceClassificationHead'], @@ -54,15 +55,20 @@ else: 'palm_v2': ['PalmForTextGeneration'], 'sbert_for_faq_question_answering': ['SbertForFaqQuestionAnswering'], 'star_text_to_sql': ['StarForTextToSql'], - 'sequence_classification': - ['VecoForSequenceClassification', 'SbertForSequenceClassification'], + 'sequence_classification': [ + 'VecoForSequenceClassification', 'SbertForSequenceClassification', + 'BertForSequenceClassification' + ], 'space': [ 'SpaceForDialogIntent', 'SpaceForDialogModeling', 'SpaceForDialogStateTracking' ], 'task_models': [ - 'InformationExtractionModel', 'SequenceClassificationModel', - 'SingleBackboneTaskModelBase', 'TokenClassificationModel' + 'FeatureExtractionModel', + 'InformationExtractionModel', + 'SequenceClassificationModel', + 'SingleBackboneTaskModelBase', + 'TokenClassificationModel', ], 'token_classification': ['SbertForTokenClassification'], 'table_question_answering': ['TableQuestionAnswering'], diff --git a/modelscope/models/nlp/backbones/bert.py b/modelscope/models/nlp/backbones/bert.py new file mode 100644 index 00000000..aa513944 --- /dev/null +++ b/modelscope/models/nlp/backbones/bert.py @@ -0,0 +1,7 @@ +from modelscope.metainfo import Models +from modelscope.models.builder import BACKBONES +from modelscope.models.nlp.bert import BertModel +from modelscope.utils.constant import Fields + +BACKBONES.register_module( + group_key=Fields.nlp, module_name=Models.bert, module_cls=BertModel) diff --git 
a/modelscope/models/nlp/bert/__init__.py b/modelscope/models/nlp/bert/__init__.py new file mode 100644 index 00000000..705d9519 --- /dev/null +++ b/modelscope/models/nlp/bert/__init__.py @@ -0,0 +1,60 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .modeling_bert import ( + BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + BertForMaskedLM, + BertForMultipleChoice, + BertForNextSentencePrediction, + BertForPreTraining, + BertForQuestionAnswering, + BertForSequenceClassification, + BertForTokenClassification, + BertLayer, + BertLMHeadModel, + BertModel, + BertPreTrainedModel, + load_tf_weights_in_bert, + ) + + from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig, BertOnnxConfig + from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer + from .tokenization_bert_fast import BertTokenizerFast + +else: + _import_structure = { + 'configuration_bert': + ['BERT_PRETRAINED_CONFIG_ARCHIVE_MAP', 'BertConfig', 'BertOnnxConfig'], + 'tokenization_bert': + ['BasicTokenizer', 'BertTokenizer', 'WordpieceTokenizer'], + } + _import_structure['tokenization_bert_fast'] = ['BertTokenizerFast'] + + _import_structure['modeling_bert'] = [ + 'BERT_PRETRAINED_MODEL_ARCHIVE_LIST', + 'BertForMaskedLM', + 'BertForMultipleChoice', + 'BertForNextSentencePrediction', + 'BertForPreTraining', + 'BertForQuestionAnswering', + 'BertForSequenceClassification', + 'BertForTokenClassification', + 'BertLayer', + 'BertLMHeadModel', + 'BertModel', + 'BertPreTrainedModel', + 'load_tf_weights_in_bert', + ] + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/bert/configuration_bert.py b/modelscope/models/nlp/bert/configuration_bert.py new file mode 100644 index 00000000..2c9293ec --- /dev/null +++ b/modelscope/models/nlp/bert/configuration_bert.py @@ -0,0 +1,162 @@ +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" BERT model configuration """ +from collections import OrderedDict +from typing import Mapping + +from transformers.configuration_utils import PretrainedConfig +from transformers.onnx import OnnxConfig + +from modelscope.utils.logger import get_logger + +logger = get_logger(__name__) + + +class BertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a + [`BertModel`] or a [`TFBertModel`]. It is used to instantiate a BERT model + according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar + configuration to that of the BERT + [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture. 
+ + Configuration objects inherit from [`PretrainedConfig`] and can be used to + control the model outputs. Read the documentation from [`PretrainedConfig`] + for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the BERT model. Defines the number of different + tokens that can be represented by the `inputs_ids` passed when + calling [`BertModel`] or [`TFBertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the + Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) + layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the + encoder and pooler. If string, `"gelu"`, `"relu"`, `"silu"` and + `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the + embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. + Typically set this to something large just in case (e.g., 512 or + 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling + [`BertModel`] or [`TFBertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for + initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, + `"relative_key"`, `"relative_key_query"`. For positional embeddings + use `"absolute"`. For more information on `"relative_key"`, please + refer to [Self-Attention with Relative Position Representations + (Shaw et al.)](https://arxiv.org/abs/1803.02155). For more + information on `"relative_key_query"`, please refer to *Method 4* in + [Improve Transformer Models with Better Relative Position Embeddings + (Huang et al.)](https://arxiv.org/abs/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values + attentions (not used by all models). Only relevant if + `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. 
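To make the argument list above concrete, here is a small hedged sketch of building a reduced configuration. The values are arbitrary and only show how the documented fields map onto the constructor; the upstream `transformers.BertConfig` is used because the local class keeps the same signature.

```python
# Arbitrary, reduced hyperparameters purely for illustration.
from transformers import BertConfig  # the local BertConfig mirrors this constructor

tiny = BertConfig(
    vocab_size=30522,
    hidden_size=128,
    num_hidden_layers=2,
    num_attention_heads=2,
    intermediate_size=512,
    classifier_dropout=None,   # classification heads then fall back to hidden_dropout_prob
)
print(tiny.hidden_size, tiny.num_attention_heads, tiny.classifier_dropout)
```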
+ + Examples: + + ```python >>> from transformers import BertModel, BertConfig + + >>> # Initializing a BERT bert-base-uncased style configuration + >>> configuration = BertConfig() + + >>> # Initializing a model from the bert-base-uncased style configuration + >>> model = BertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = 'bert' + + def __init__(self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act='gelu', + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type='absolute', + use_cache=True, + classifier_dropout=None, + **kwargs): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + + +class BertOnnxConfig(OnnxConfig): + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict([ + ('input_ids', { + 0: 'batch', + 1: 'sequence' + }), + ('attention_mask', { + 0: 'batch', + 1: 'sequence' + }), + ('token_type_ids', { + 0: 'batch', + 1: 'sequence' + }), + ]) diff --git a/modelscope/models/nlp/bert/modeling_bert.py b/modelscope/models/nlp/bert/modeling_bert.py new file mode 100755 index 00000000..f8fd5994 --- /dev/null +++ b/modelscope/models/nlp/bert/modeling_bert.py @@ -0,0 +1,2040 @@ +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model. 
""" + +import math +import os +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +import torch.utils.checkpoint +from packaging import version +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from transformers.activations import ACT2FN +from transformers.file_utils import (ModelOutput, add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings) +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, MaskedLMOutput, + MultipleChoiceModelOutput, NextSentencePredictorOutput, + QuestionAnsweringModelOutput, SequenceClassifierOutput, + TokenClassifierOutput) +from transformers.modeling_utils import (PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer) + +from modelscope.models.base import TorchModel +from modelscope.utils.logger import get_logger +from .configuration_bert import BertConfig + +logger = get_logger(__name__) + +_CONFIG_FOR_DOC = 'BertConfig' + + +def load_tf_weights_in_bert(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + 'Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see ' + 'https://www.tensorflow.org/install/ for installation instructions.' + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f'Converting TensorFlow checkpoint from {tf_path}') + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info(f'Loading TF weight {name} with shape {shape}') + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split('/') + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any(n in [ + 'adam_v', 'adam_m', 'AdamWeightDecayOptimizer', + 'AdamWeightDecayOptimizer_1', 'global_step' + ] for n in name): + logger.info(f"Skipping {'/'.join(name)}") + continue + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+_\d+', m_name): + scope_names = re.split(r'_(\d+)', m_name) + else: + scope_names = [m_name] + if scope_names[0] == 'kernel' or scope_names[0] == 'gamma': + pointer = getattr(pointer, 'weight') + elif scope_names[0] == 'output_bias' or scope_names[0] == 'beta': + pointer = getattr(pointer, 'bias') + elif scope_names[0] == 'output_weights': + pointer = getattr(pointer, 'weight') + elif scope_names[0] == 'squad': + pointer = getattr(pointer, 'classifier') + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info(f"Skipping {'/'.join(name)}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name[-11:] == '_embeddings': + pointer = getattr(pointer, 'weight') + elif m_name == 'kernel': + array = np.transpose(array) + try: + if pointer.shape != array.shape: + raise ValueError( + f'Pointer shape {pointer.shape} and array shape {array.shape} mismatched' + ) + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info(f'Initialize PyTorch weight {name}') + pointer.data = torch.from_numpy(array) 
+ return model + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, + config.hidden_size, + padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model + # variable name and be able to load any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and + # exported when serialized + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + self.register_buffer( + 'position_ids', + torch.arange(config.max_position_embeddings).expand((1, -1))) + if version.parse(torch.__version__) > version.parse('1.6.0'): + self.register_buffer( + 'token_type_ids', + torch.zeros(self.position_ids.size(), dtype=torch.long), + persistent=False, + ) + + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + past_key_values_length=0): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, + past_key_values_length:seq_length + + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor + # where it is all zeros, which usually occurs when its auto-generated, + # registered buffer helps users when tracing the model without passing + # token_type_ids, solves issue #5664 + if token_type_ids is None: + if hasattr(self, 'token_type_ids'): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, + dtype=torch.long, + device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == 'absolute': + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr( + config, 'embedding_size'): + raise ValueError( + f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention ' + f'heads ({config.num_attention_heads})') + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size + / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, 
self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, 'position_embedding_type', 'absolute') + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, + self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores( + self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores( + self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all + # cross attention key/value_states. Further calls to cross_attention + # layer can then reuse all cross-attention key/value_states (first + # "if" case) if uni-directional self-attention (decoder) save + # Tuple(torch.Tensor, torch.Tensor) of all previous decoder + # key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected + # key/value_states (third "elif" case) if encoder bi-directional + # self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
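`transpose_for_scores` reshapes the flat projection into per-head slices so that the following matmul runs over `(batch, num_heads, seq_len, head_size)`. A quick shape walk-through, assuming the base-size setting of 12 heads of size 64 and an arbitrary batch and sequence length:

```python
# Shape check for transpose_for_scores with assumed dimensions (batch 2, seq 5, 12 heads of 64).
import torch

batch, seq_len, num_heads, head_size = 2, 5, 12, 64
x = torch.randn(batch, seq_len, num_heads * head_size)               # output of a q/k/v projection
x = x.view(batch, seq_len, num_heads, head_size).permute(0, 2, 1, 3)
print(x.shape)  # torch.Size([2, 12, 5, 64])
```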
+ attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == 'relative_key': + relative_position_scores = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == 'relative_key_query': + relative_position_scores_query = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + relative_position_scores_key = torch.einsum( + 'bhrd,lrd->bhlr', key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
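For the `relative_key` / `relative_key_query` branches, the distance matrix is simply the pairwise difference of positions, shifted by `max_position_embeddings - 1` before the embedding lookup so indices stay non-negative. A tiny numerical illustration for an assumed length-4 sequence:

```python
# Pairwise relative distances as computed in the relative position branch above.
import torch

seq_length = 4
position_ids_l = torch.arange(seq_length).view(-1, 1)
position_ids_r = torch.arange(seq_length).view(1, -1)
print(position_ids_l - position_ids_r)
# tensor([[ 0, -1, -2, -3],
#         [ 1,  0, -1, -2],
#         [ 2,  1,  0, -1],
#         [ 3,  2,  1,  0]])
```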
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, + attention_probs) if output_attentions else (context_layer, ) + + if self.is_decoder: + outputs = outputs + (past_key_value, ) + return outputs + + +class BertSelfOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = BertSelfAttention( + config, position_embedding_type=position_embedding_type) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, + self.self.attention_head_size, self.pruned_heads) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len( + heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output, + ) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states 
= self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError( + f'{self} should be used as a decoder model if cross attention is added' + ) + self.crossattention = BertAttention( + config, position_embedding_type='absolute') + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[: + 2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[ + 1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, 'crossattention'): + raise ValueError( + f'If `encoder_hidden_states` are passed, {self} has to be instantiated ' + f'with cross-attention layers by setting `config.add_cross_attention=True`' + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[ + -2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[ + 1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward(self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output) + outputs = (layer_output, ) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value, ) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [BertLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + 
attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = ( + ) if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[ + i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' + ) + use_cache = False + + def create_custom_forward(module): + + def custom_forward(*inputs): + return module(*inputs, past_key_value, + output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1], ) + if output_attentions: + all_self_attentions = all_self_attentions + ( + layer_outputs[1], ) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + ( + layer_outputs[2], ) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + if not return_dict: + return tuple(v for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] if v is not None) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class BertPooler(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear( + config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class BertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface + for downloading and loading pretrained models. 
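`BertLMPredictionHead` creates its decoder without a bias and then assigns a separate `vocab_size` bias to it, so that `resize_token_embeddings` resizes both together. A minimal sketch of that tying and of the resulting logit shape, with base-size dimensions assumed:

```python
# Sketch of the bias tying used by BertLMPredictionHead; dimensions are the base-size defaults.
import torch
from torch import nn

hidden_size, vocab_size = 768, 30522
decoder = nn.Linear(hidden_size, vocab_size, bias=False)
bias = nn.Parameter(torch.zeros(vocab_size))
decoder.bias = bias                          # same Parameter object, so resizing stays in sync
sequence_output = torch.randn(2, 5, hidden_size)
print(decoder(sequence_output).shape)        # torch.Size([2, 5, 30522])
```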
+ """ + + config_class = BertConfig + load_tf_weights = load_tf_weights_in_bert + base_model_prefix = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, BertEncoder): + module.gradient_checkpointing = value + + +@dataclass +class BertForPreTrainingOutput(ModelOutput): + """ + Output type of [`BertForPreTraining`]. + + Args: + loss (*optional*, returned when `labels` is provided, + `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the + next sequence prediction (classification) loss. + prediction_logits (`torch.FloatTensor` of shape `(batch_size, + sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each + vocabulary token before SoftMax). + seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, + 2)`): + Prediction scores of the next sequence prediction (classification) + head (scores of True/False continuation before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + + one for the output of each layer) of shape `(batch_size, + sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the + initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + seq_relationship_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +BERT_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass + documentation for the generic methods the library implements for all its + model (such as downloading or saving, resizing the input embeddings, pruning + heads etc.) + + This model is also a PyTorch + [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch + documentation for all matter related to general usage and behavior. + + Parameters: + config ([`BertConfig`]): Model configuration class with all the + parameters of the model. 
+ Initializing with a config file does not load the weights associated + with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model + weights. +""" + +BERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`BertTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] + for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask + values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the + inputs. Indices are selected in `[0, 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position + embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, + num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask + values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, + *optional*): + Optionally, instead of passing `input_ids` you can choose to + directly pass an embedded representation. This is useful if you want + more control over how to convert `input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention + layers. See `attentions` under returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See + `hidden_states` under returned tensors for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a + plain tuple. +""" + + +@add_start_docstrings( + 'The bare Bert Model transformer outputting raw hidden-states without any specific head on top.', + BERT_START_DOCSTRING, +) +class BertModel(BertPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a + decoder, in which case a layer of cross-attention is added between the + self-attention layers, following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam + Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz + Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the + `is_decoder` argument of the configuration set to `True`. To be used in a + Seq2Seq model, the model needs to initialized with both `is_decoder` + argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` + is then expected as an input to the forward pass. 
+ """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + @classmethod + def _instantiate(cls, model_dir=None, add_pooling_layer=True, **config): + config = BertConfig(**config) + model = cls(config, add_pooling_layer) + return model + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the + encoder. Used in the cross-attention if the model is configured as a + decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, + sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of + the encoder input. This mask is used in the cross-attention if the + model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length + `config.n_layers` with each tuple having 4 tensors of shape + `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention + blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only + the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead + of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned + and can be used to speed up decoding (see `past_key_values`). 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + 'You cannot specify both input_ids and inputs_embeds at the same time' + ) + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError( + 'You have to specify either input_ids or inputs_embeds') + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[ + 2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), + device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, 'token_type_ids'): + buffered_token_type_ids = self.embeddings.token_type_ids[:, : + seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( + ) + encoder_hidden_shape = (encoder_batch_size, + encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, + self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler( + sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + def extract_sequence_outputs(self, outputs): + return outputs['last_hidden_state'] + + def extract_pooled_outputs(self, outputs): + return outputs['pooler_output'] + + +@add_start_docstrings( + """ + Bert Model with two heads on top as done during the pretraining: a `masked + language modeling` head and a `next sentence prediction (classification)` + head. 
+ """, + BERT_START_DOCSTRING, +) +class BertForPreTraining(BertPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + self.cls = BertPreTrainingHeads(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @replace_return_docstrings( + output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + next_sentence_label=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, + *optional*): + Labels for computing the masked language modeling loss. Indices + should be in `[-100, 0, ..., config.vocab_size]` (see + `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with + labels in `[0, ..., config.vocab_size]` + next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, + *optional*): + Labels for computing the next sequence prediction + (classification) loss. Input should be a sequence pair (see + `input_ids` docstring) Indices should be in `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + kwargs (`Dict[str, any]`, optional, defaults to *{}*): + Used to hide legacy arguments that have been deprecated. 
+ + Returns: + + Example: + + ```python >>> from transformers import BertTokenizer, BertForPreTraining + >>> import torch + + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + >>> model = BertForPreTraining.from_pretrained('bert-base-uncased') + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls( + sequence_output, pooled_output) + + total_loss = None + if labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + next_sentence_loss = loss_fct( + seq_relationship_score.view(-1, 2), + next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss, ) + + output) if total_loss is not None else output + + return BertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """Bert Model with a `language modeling` head on top for CLM fine-tuning. """, + BERT_START_DOCSTRING) +class BertLMHeadModel(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning( + 'If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`' + ) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @replace_return_docstrings( + output_type=CausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the + encoder. Used in the cross-attention if the model is configured + as a decoder. 
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, + sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices + of the encoder input. This mask is used in the cross-attention + if the model is configured as a decoder. Mask values selected in + `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, + *optional*): + Labels for computing the left-to-right language modeling loss + (next word prediction). Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with + indices set to `-100` are ignored (masked), the loss is only + computed for the tokens with labels n `[0, ..., + config.vocab_size]` + past_key_values (`tuple(tuple(torch.FloatTensor))` of length + `config.n_layers` with each tuple having 4 tensors of shape + `(batch_size, num_heads, sequence_length - 1, + embed_size_per_head)`): + Contains precomputed key and value hidden states of the + attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input + only the last `decoder_input_ids` (those that don't have their + past key value states given to this model) of shape + `(batch_size, 1)` instead of all `decoder_input_ids` of shape + `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are + returned and can be used to speed up decoding (see + `past_key_values`). + + Returns: + + Example: + + ```python >>> from transformers import BertTokenizer, BertLMHeadModel, + BertConfig >>> import torch + + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + >>> config = BertConfig.from_pretrained("bert-base-cased") + >>> config.is_decoder = True + >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, : + -1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct( + shifted_prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[2:] + return ((lm_loss, ) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def 
prepare_inputs_for_generation(self, + input_ids, + past=None, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'past_key_values': past + } + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple( + past_state.index_select(0, beam_idx) + for past_state in layer_past), ) + return reordered_past + + +@add_start_docstrings( + """Bert Model with a `language modeling` head on top. """, + BERT_START_DOCSTRING) +class BertForMaskedLM(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + 'If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for ' + 'bi-directional self-attention.') + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, + *optional*): + Labels for computing the masked language modeling loss. 
Indices + should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` + docstring) Tokens with indices set to `-100` are ignored (masked), + the loss is only computed for the tokens with labels in `[0, ..., + config.vocab_size]` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[2:] + return ((masked_lm_loss, ) + + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation(self, + input_ids, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + if self.config.pad_token_id is None: + raise ValueError('The PAD token should be defined for generation') + + padding_mask = attention_mask.new_zeros((attention_mask.shape[0], 1)) + attention_mask = torch.cat([attention_mask, padding_mask], dim=-1) + dummy_token = torch.full((effective_batch_size, 1), + self.config.pad_token_id, + dtype=torch.long, + device=input_ids.device) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {'input_ids': input_ids, 'attention_mask': attention_mask} + + +@add_start_docstrings( + """Bert Model with a `next sentence prediction (classification)` head on top. """, + BERT_START_DOCSTRING, +) +class BertForNextSentencePrediction(BertPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + self.cls = BertOnlyNSPHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @replace_return_docstrings( + output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the next sequence prediction (classification) + loss. Input should be a sequence pair (see `input_ids` docstring). + Indices should be in `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. 
+ + Returns: + + Example: + + ```python >>> from transformers import BertTokenizer, + BertForNextSentencePrediction >>> import torch + + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') + + >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." + >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." + >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt') + + >>> outputs = model(**encoding, labels=torch.LongTensor([1])) + >>> logits = outputs.logits + >>> assert logits[0, 0] < logits[0, 1] # next sentence was random + ``` + """ + + if 'next_sentence_label' in kwargs: + warnings.warn( + 'The `next_sentence_label` argument is deprecated, use `labels` instead.', + FutureWarning, + ) + labels = kwargs.pop('next_sentence_label') + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + seq_relationship_scores = self.cls(pooled_output) + + next_sentence_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + next_sentence_loss = loss_fct( + seq_relationship_scores.view(-1, 2), labels.view(-1)) + + if not return_dict: + output = (seq_relationship_scores, ) + outputs[2:] + return ((next_sentence_loss, ) + + output) if next_sentence_loss is not None else output + + return NextSentencePredictorOutput( + loss=next_sentence_loss, + logits=seq_relationship_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model transformer with a sequence classification/regression head on top + (a linear layer on top of the pooled output) e.g. for GLUE tasks. + """, + BERT_START_DOCSTRING, +) +class BertForSequenceClassification(BertPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. + Indices should be in `[0, ..., config.num_labels - 1]`. If + `config.num_labels == 1` a regression loss is computed (Mean-Square + loss), If `config.num_labels > 1` a classification loss is computed + (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = 'regression' + elif self.num_labels > 1 and (labels.dtype == torch.long + or labels.dtype == torch.int): + self.config.problem_type = 'single_label_classification' + else: + self.config.problem_type = 'multi_label_classification' + + if self.config.problem_type == 'regression': + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == 'single_label_classification': + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == 'multi_label_classification': + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits, ) + outputs[2:] + return ((loss, ) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a multiple choice classification head on top (a linear layer + on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. + """, + BERT_START_DOCSTRING, +) +class BertForMultipleChoice(BertPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format( + 'batch_size, num_choices, sequence_length')) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. + Indices should be in `[0, ..., num_choices-1]` where `num_choices` + is the size of the second dimension of the input tensors. 
(See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[ + 1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view( + -1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view( + -1, + attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view( + -1, + token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view( + -1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), + inputs_embeds.size(-1)) + if inputs_embeds is not None else None) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits, ) + outputs[2:] + return ((loss, ) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. + """, + BERT_START_DOCSTRING, +) +class BertForTokenClassification(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, + *optional*): + Labels for computing the token classification loss. Indices should + be in `[0, ..., config.num_labels - 1]`. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels)) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits, ) + outputs[2:] + return ((loss, ) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a span classification head on top for extractive + question-answering tasks like SQuAD (a linear layers on top of the + hidden-states output to compute `span start logits` and `span end logits`). + """, + BERT_START_DOCSTRING, +) +class BertForQuestionAnswering(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, + *optional*): + Labels for position (index) of the start of the labelled span for + computing the token classification loss. Positions are clamped to + the length of the sequence (`sequence_length`). Position outside of + the sequence are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for + computing the token classification loss. Positions are clamped to + the length of the sequence (`sequence_length`). Position outside of + the sequence are not taken into account for computing the loss. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss, ) + + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/modelscope/models/nlp/bert_for_sequence_classification.py b/modelscope/models/nlp/bert_for_sequence_classification.py deleted file mode 100644 index 2b1a3b3b..00000000 --- a/modelscope/models/nlp/bert_for_sequence_classification.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -import os -from typing import Any, Dict - -import json -import numpy as np - -from modelscope.metainfo import Models -from modelscope.models import TorchModel -from modelscope.models.builder import MODELS -from modelscope.utils.constant import Tasks - -__all__ = ['BertForSequenceClassification'] - - -@MODELS.register_module(Tasks.text_classification, module_name=Models.bert) -class BertForSequenceClassification(TorchModel): - - def __init__(self, model_dir: str, *args, **kwargs): - # Model.__init__(self, model_dir, model_cls, first_sequence, *args, **kwargs) - # Predictor.__init__(self, *args, **kwargs) - """initialize the sequence classification model from the `model_dir` path. - - Args: - model_dir (str): the model path. 
- """ - - super().__init__(model_dir, *args, **kwargs) - import torch - from easynlp.appzoo import SequenceClassification - from easynlp.core.predictor import get_model_predictor - self.model = get_model_predictor( - model_dir=self.model_dir, - model_cls=SequenceClassification, - input_keys=[('input_ids', torch.LongTensor), - ('attention_mask', torch.LongTensor), - ('token_type_ids', torch.LongTensor)], - output_keys=['predictions', 'probabilities', 'logits']) - - self.label_path = os.path.join(self.model_dir, 'label_mapping.json') - with open(self.label_path) as f: - self.label_mapping = json.load(f) - self.id2label = {idx: name for name, idx in self.label_mapping.items()} - - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: - """return the result by the model - - Args: - input (Dict[str, Any]): the preprocessed data - - Returns: - Dict[str, np.ndarray]: results - Example: - { - 'predictions': array([1]), # lable 0-negative 1-positive - 'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32), - 'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value - } - """ - return self.model.predict(input) - - def postprocess(self, inputs: Dict[str, np.ndarray], - **kwargs) -> Dict[str, np.ndarray]: - # N x num_classes - probs = inputs['probabilities'] - result = { - 'probs': probs, - } - - return result diff --git a/modelscope/models/nlp/deberta_v2/__init__.py b/modelscope/models/nlp/deberta_v2/__init__.py index 664fc6c6..830210ed 100644 --- a/modelscope/models/nlp/deberta_v2/__init__.py +++ b/modelscope/models/nlp/deberta_v2/__init__.py @@ -21,21 +21,12 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule -_import_structure = { - 'configuration_deberta_v2': [ - 'DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP', 'DebertaV2Config', - 'DebertaV2OnnxConfig' - ], - 'tokenization_deberta_v2': ['DebertaV2Tokenizer'], -} - if TYPE_CHECKING: from .configuration_deberta_v2 import DebertaV2Config from .tokenization_deberta_v2 import DebertaV2Tokenizer from .tokenization_deberta_v2_fast import DebertaV2TokenizerFast from .modeling_deberta_v2 import ( - DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST, DebertaV2ForMaskedLM, DebertaV2ForMultipleChoice, DebertaV2ForQuestionAnswering, @@ -55,7 +46,6 @@ else: 'DebertaV2TokenizerFast' ] _import_structure['modeling_deberta_v2'] = [ - 'DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST', 'DebertaV2ForMaskedLM', 'DebertaV2ForMultipleChoice', 'DebertaV2ForQuestionAnswering', diff --git a/modelscope/models/nlp/heads/fill_mask_head.py b/modelscope/models/nlp/heads/fill_mask_head.py new file mode 100644 index 00000000..6b0c5e05 --- /dev/null +++ b/modelscope/models/nlp/heads/fill_mask_head.py @@ -0,0 +1,101 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict + +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers.activations import ACT2FN + +from modelscope.metainfo import Heads +from modelscope.models.base import TorchHead +from modelscope.models.builder import HEADS +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import Tasks + + +@HEADS.register_module(Tasks.fill_mask, module_name=Heads.bert_mlm) +class BertFillMaskHead(TorchHead): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.cls = BertOnlyMLMHead(self.config) + + def forward(self, sequence_output): + prediction_scores = self.cls(sequence_output) + return {OutputKeys.LOGITS: prediction_scores} + + def compute_loss(self, outputs: Dict[str, torch.Tensor], + labels) -> Dict[str, torch.Tensor]: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + outputs.view(-1, self.config.vocab_size), labels.view(-1)) + return {OutputKeys.LOSS: masked_lm_loss} + + +class BertPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear( + config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output: torch.Tensor) -> torch.Tensor: + prediction_scores = self.predictions(sequence_output) + return prediction_scores diff --git a/modelscope/models/nlp/heads/torch_pretrain_head.py b/modelscope/models/nlp/heads/torch_pretrain_head.py index fb54637b..e477533f 100644 --- a/modelscope/models/nlp/heads/torch_pretrain_head.py +++ b/modelscope/models/nlp/heads/torch_pretrain_head.py @@ -11,7 +11,7 @@ from modelscope.models.builder import HEADS from modelscope.utils.constant import Tasks -@HEADS.register_module(Tasks.fill_mask, module_name=Heads.bert_mlm) +# @HEADS.register_module(Tasks.fill_mask, module_name=Heads.bert_mlm) class BertMLMHead(BertOnlyMLMHead, TorchHead): def compute_loss(self, outputs: Dict[str, torch.Tensor], diff --git a/modelscope/models/nlp/masked_language.py b/modelscope/models/nlp/masked_language.py index 514a04cd..b7a890c1 100644 --- a/modelscope/models/nlp/masked_language.py +++ b/modelscope/models/nlp/masked_language.py @@ -1,10 +1,9 @@ # Copyright (c) Alibaba, Inc. 
and its affiliates. - -from transformers import BertForMaskedLM as BertForMaskedLMTransformer - from modelscope.metainfo import Models from modelscope.models.base import TorchModel from modelscope.models.builder import MODELS +from modelscope.models.nlp.bert import \ + BertForMaskedLM as BertForMaskedLMTransformer from modelscope.models.nlp.deberta_v2 import \ DebertaV2ForMaskedLM as DebertaV2ForMaskedLMTransformer from modelscope.models.nlp.structbert import SbertForMaskedLM diff --git a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py b/modelscope/models/nlp/nncrf_for_named_entity_recognition.py index 62198ed2..8b0c59b2 100644 --- a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py +++ b/modelscope/models/nlp/nncrf_for_named_entity_recognition.py @@ -41,12 +41,9 @@ class SequenceLabelingForNamedEntityRecognition(TorchModel): def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: input_tensor = { - 'input_ids': - torch.tensor(input['input_ids']).unsqueeze(0), - 'attention_mask': - torch.tensor(input['attention_mask']).unsqueeze(0), - 'label_mask': - torch.tensor(input['label_mask'], dtype=torch.bool).unsqueeze(0) + 'input_ids': input['input_ids'], + 'attention_mask': input['attention_mask'], + 'label_mask': input['label_mask'], } output = { 'text': input['text'], diff --git a/modelscope/models/nlp/sequence_classification.py b/modelscope/models/nlp/sequence_classification.py index a8930e68..156c615c 100644 --- a/modelscope/models/nlp/sequence_classification.py +++ b/modelscope/models/nlp/sequence_classification.py @@ -7,6 +7,7 @@ from torch import nn from modelscope.metainfo import Models from modelscope.models.base import TorchModel from modelscope.models.builder import MODELS +from modelscope.models.nlp.bert import BertPreTrainedModel from modelscope.models.nlp.structbert import SbertPreTrainedModel from modelscope.models.nlp.veco import \ VecoForSequenceClassification as VecoForSequenceClassificationTransform @@ -16,7 +17,10 @@ from modelscope.utils.hub import parse_label_mapping from modelscope.utils.tensor_utils import (torch_nested_detach, torch_nested_numpify) -__all__ = ['SbertForSequenceClassification', 'VecoForSequenceClassification'] +__all__ = [ + 'SbertForSequenceClassification', 'VecoForSequenceClassification', + 'BertForSequenceClassification' +] class SequenceClassificationBase(TorchModel): @@ -132,7 +136,7 @@ class SbertForSequenceClassification(SequenceClassificationBase, label2id = parse_label_mapping(model_dir) if label2id is not None and len(label2id) > 0: num_labels = len(label2id) - + cls.id2label = {id: label for label, id in label2id.items()} model_args = {} if num_labels is None else {'num_labels': num_labels} return super(SbertPreTrainedModel, SbertForSequenceClassification).from_pretrained( @@ -206,3 +210,78 @@ class VecoForSequenceClassification(TorchModel, pretrained_model_name_or_path=kwargs.get('model_dir'), model_dir=kwargs.get('model_dir'), **model_args) + + +@MODELS.register_module(Tasks.sentence_similarity, module_name=Models.bert) +@MODELS.register_module( + Tasks.sentiment_classification, module_name=Models.bert) +@MODELS.register_module(Tasks.nli, module_name=Models.bert) +@MODELS.register_module(Tasks.text_classification, module_name=Models.bert) +class BertForSequenceClassification(SequenceClassificationBase, + BertPreTrainedModel): + """Bert sequence classification model. + + Inherited from SequenceClassificationBase. 
+ """ + base_model_prefix: str = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def __init__(self, config, model_dir): + if hasattr(config, 'base_model_prefix'): + BertForSequenceClassification.base_model_prefix = config.base_model_prefix + super().__init__(config, model_dir) + + def build_base_model(self): + from .bert import BertModel + return BertModel(self.config, add_pooling_layer=True) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + labels=labels, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + @param kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels not supplied. + If num_labels is not found, the model will use the default setting (2 classes). + @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + + model_dir = kwargs.get('model_dir') + num_labels = kwargs.get('num_labels') + if num_labels is None: + label2id = parse_label_mapping(model_dir) + if label2id is not None and len(label2id) > 0: + num_labels = len(label2id) + + model_args = {} if num_labels is None else {'num_labels': num_labels} + return super(BertPreTrainedModel, + BertForSequenceClassification).from_pretrained( + pretrained_model_name_or_path=kwargs.get('model_dir'), + model_dir=kwargs.get('model_dir'), + **model_args) diff --git a/modelscope/models/nlp/task_models/__init__.py b/modelscope/models/nlp/task_models/__init__.py index 7493ba74..90f22aa1 100644 --- a/modelscope/models/nlp/task_models/__init__.py +++ b/modelscope/models/nlp/task_models/__init__.py @@ -5,6 +5,8 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .information_extraction import InformationExtractionModel + from .feature_extraction import FeatureExtractionModel + from .fill_mask import FillMaskModel from .sequence_classification import SequenceClassificationModel from .task_model import SingleBackboneTaskModelBase from .token_classification import TokenClassificationModel @@ -12,6 +14,8 @@ if TYPE_CHECKING: else: _import_structure = { 'information_extraction': ['InformationExtractionModel'], + 'feature_extraction': ['FeatureExtractionModel'], + 'fill_mask': ['FillMaskModel'], 'sequence_classification': ['SequenceClassificationModel'], 'task_model': ['SingleBackboneTaskModelBase'], 'token_classification': ['TokenClassificationModel'], diff --git a/modelscope/models/nlp/task_models/feature_extraction.py b/modelscope/models/nlp/task_models/feature_extraction.py new file mode 100644 index 00000000..069c37aa --- /dev/null +++ b/modelscope/models/nlp/task_models/feature_extraction.py @@ -0,0 +1,43 @@ +from typing import Any, Dict + +import numpy as np + +from modelscope.metainfo import TaskModels +from modelscope.models.builder import MODELS +from modelscope.models.nlp.bert import 
BertConfig
+from modelscope.models.nlp.task_models.task_model import \
+    SingleBackboneTaskModelBase
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+from modelscope.utils.hub import parse_label_mapping
+
+__all__ = ['FeatureExtractionModel']
+
+
+@MODELS.register_module(
+    Tasks.feature_extraction, module_name=TaskModels.feature_extraction)
+class FeatureExtractionModel(SingleBackboneTaskModelBase):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """initialize the feature extraction model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+        """
+        super().__init__(model_dir, *args, **kwargs)
+        if 'base_model_prefix' in kwargs:
+            self._base_model_prefix = kwargs['base_model_prefix']
+
+        self.build_backbone(self.backbone_cfg)
+
+    def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]:
+
+        # backbone do not need labels, only head need for loss compute
+        labels = input.pop(OutputKeys.LABELS, None)
+
+        outputs = super().forward(input)
+        sequence_output, pooled_output = self.extract_backbone_outputs(outputs)
+        if labels is not None:
+            input[OutputKeys.LABELS] = labels
+
+        return {OutputKeys.TEXT_EMBEDDING: sequence_output}
diff --git a/modelscope/models/nlp/task_models/fill_mask.py b/modelscope/models/nlp/task_models/fill_mask.py
new file mode 100644
index 00000000..f7ef1cc2
--- /dev/null
+++ b/modelscope/models/nlp/task_models/fill_mask.py
@@ -0,0 +1,47 @@
+from typing import Any, Dict
+
+import numpy as np
+
+from modelscope.metainfo import TaskModels
+from modelscope.models.builder import MODELS
+from modelscope.models.nlp.bert import BertConfig
+from modelscope.models.nlp.task_models.task_model import \
+    SingleBackboneTaskModelBase
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+from modelscope.utils.hub import parse_label_mapping
+
+__all__ = ['FillMaskModel']
+
+
+@MODELS.register_module(Tasks.fill_mask, module_name=TaskModels.fill_mask)
+class FillMaskModel(SingleBackboneTaskModelBase):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """initialize the fill mask model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+ """ + super().__init__(model_dir, *args, **kwargs) + if 'base_model_prefix' in kwargs: + self._base_model_prefix = kwargs['base_model_prefix'] + + self.build_backbone(self.backbone_cfg) + self.build_head(self.head_cfg) + + def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: + + # backbone do not need labels, only head need for loss compute + labels = input.pop(OutputKeys.LABELS, None) + + outputs = super().forward(input) + sequence_output, pooled_output = self.extract_backbone_outputs(outputs) + outputs = self.head.forward(sequence_output) + + if labels is not None: + input[OutputKeys.LABELS] = labels + loss = self.compute_loss(outputs, labels) + outputs.update(loss) + outputs[OutputKeys.INPUT_IDS] = input[OutputKeys.INPUT_IDS] + return outputs diff --git a/modelscope/models/nlp/task_models/information_extraction.py b/modelscope/models/nlp/task_models/information_extraction.py index 4792d07c..0a7d5a47 100644 --- a/modelscope/models/nlp/task_models/information_extraction.py +++ b/modelscope/models/nlp/task_models/information_extraction.py @@ -26,21 +26,12 @@ class InformationExtractionModel(SingleBackboneTaskModelBase): """ super().__init__(model_dir, *args, **kwargs) - backbone_cfg = self.cfg.backbone - head_cfg = self.cfg.head - self.build_backbone(backbone_cfg) - self.build_head(head_cfg) + self.build_backbone(self.backbone_cfg) + self.build_head(self.head_cfg) - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: + def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: outputs = super().forward(input) sequence_output, pooled_output = self.extract_backbone_outputs(outputs) outputs = self.head.forward(sequence_output, input['text'], input['offsets']) return {OutputKeys.SPO_LIST: outputs} - - def extract_backbone_outputs(self, outputs): - sequence_output = None - pooled_output = None - if hasattr(self.backbone, 'extract_sequence_outputs'): - sequence_output = self.backbone.extract_sequence_outputs(outputs) - return sequence_output, pooled_output diff --git a/modelscope/models/nlp/task_models/sequence_classification.py b/modelscope/models/nlp/task_models/sequence_classification.py index 43a96327..1f5e46c3 100644 --- a/modelscope/models/nlp/task_models/sequence_classification.py +++ b/modelscope/models/nlp/task_models/sequence_classification.py @@ -11,10 +11,14 @@ from modelscope.models.nlp.task_models.task_model import \ SingleBackboneTaskModelBase from modelscope.outputs import OutputKeys from modelscope.utils.constant import Tasks +from modelscope.utils.hub import parse_label_mapping __all__ = ['SequenceClassificationModel'] +@MODELS.register_module( + Tasks.sentence_similarity, module_name=TaskModels.text_classification) +@MODELS.register_module(Tasks.nli, module_name=TaskModels.text_classification) @MODELS.register_module( Tasks.sentiment_classification, module_name=TaskModels.text_classification) @MODELS.register_module( @@ -31,49 +35,36 @@ class SequenceClassificationModel(SingleBackboneTaskModelBase): if 'base_model_prefix' in kwargs: self._base_model_prefix = kwargs['base_model_prefix'] - backbone_cfg = self.cfg.backbone - head_cfg = self.cfg.head - # get the num_labels from label_mapping.json self.id2label = {} - self.label_path = os.path.join(model_dir, 'label_mapping.json') - if os.path.exists(self.label_path): - with open(self.label_path) as f: - self.label_mapping = json.load(f) - self.id2label = { - idx: name - for name, idx in self.label_mapping.items() - } - head_cfg['num_labels'] = len(self.label_mapping) + # get the 
num_labels + num_labels = kwargs.get('num_labels') + if num_labels is None: + label2id = parse_label_mapping(model_dir) + if label2id is not None and len(label2id) > 0: + num_labels = len(label2id) + self.id2label = {id: label for label, id in label2id.items()} + self.head_cfg['num_labels'] = num_labels - self.build_backbone(backbone_cfg) - self.build_head(head_cfg) + self.build_backbone(self.backbone_cfg) + self.build_head(self.head_cfg) def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: + # backbone do not need labels, only head need for loss compute + labels = input.pop(OutputKeys.LABELS, None) + outputs = super().forward(input) sequence_output, pooled_output = self.extract_backbone_outputs(outputs) outputs = self.head.forward(pooled_output) - if 'labels' in input: - loss = self.compute_loss(outputs, input['labels']) + if labels is not None: + input[OutputKeys.LABELS] = labels + loss = self.compute_loss(outputs, labels) outputs.update(loss) return outputs def extract_logits(self, outputs): return outputs[OutputKeys.LOGITS].cpu().detach() - def extract_backbone_outputs(self, outputs): - sequence_output = None - pooled_output = None - if hasattr(self.backbone, 'extract_sequence_outputs'): - sequence_output = self.backbone.extract_sequence_outputs(outputs) - if hasattr(self.backbone, 'extract_pooled_outputs'): - pooled_output = self.backbone.extract_pooled_outputs(outputs) - return sequence_output, pooled_output - - def compute_loss(self, outputs, labels): - loss = self.head.compute_loss(outputs, labels) - return loss - def postprocess(self, input, **kwargs): logits = self.extract_logits(input) probs = logits.softmax(-1).numpy() diff --git a/modelscope/models/nlp/task_models/task_model.py b/modelscope/models/nlp/task_models/task_model.py index e93dd5f6..0b43044f 100644 --- a/modelscope/models/nlp/task_models/task_model.py +++ b/modelscope/models/nlp/task_models/task_model.py @@ -74,7 +74,7 @@ class BaseTaskModel(TorchModel, ABC): def __init__(self, model_dir: str, *args, **kwargs): super().__init__(model_dir, *args, **kwargs) - self.cfg = ConfigDict(kwargs) + self.config = ConfigDict(kwargs) def __repr__(self): # only log backbone and head name @@ -397,6 +397,9 @@ class SingleBackboneTaskModelBase(BaseTaskModel): def __init__(self, model_dir: str, *args, **kwargs): super().__init__(model_dir, *args, **kwargs) + self.backbone_cfg = self.config.get('backbone', None) + assert self.backbone_cfg is not None + self.head_cfg = self.config.get('head', None) def build_backbone(self, cfg): if 'prefix' in cfg: @@ -405,9 +408,13 @@ class SingleBackboneTaskModelBase(BaseTaskModel): setattr(self, cfg['prefix'], backbone) def build_head(self, cfg): + if cfg is None: + raise ValueError( + 'Head config is missing, check if this was a backbone-only model' + ) if 'prefix' in cfg: self._head_prefix = cfg['prefix'] - head = build_head(cfg) + head = build_head(cfg, group_key=self.group_key) setattr(self, self._head_prefix, head) return head @@ -431,8 +438,18 @@ class SingleBackboneTaskModelBase(BaseTaskModel): outputs = self.backbone.forward(**input) return outputs - def compute_loss(self, outputs: Dict[str, Any], labels): - raise NotImplementedError() + def compute_loss(self, outputs, labels): + loss = self.head.compute_loss(outputs, labels) + return loss + + def extract_backbone_outputs(self, outputs): + sequence_output = None + pooled_output = None + if hasattr(self.backbone, 'extract_sequence_outputs'): + sequence_output = self.backbone.extract_sequence_outputs(outputs) + if 
hasattr(self.backbone, 'extract_pooled_outputs'): + pooled_output = self.backbone.extract_pooled_outputs(outputs) + return sequence_output, pooled_output class EncoderDecoderTaskModelBase(BaseTaskModel): @@ -453,7 +470,7 @@ class EncoderDecoderTaskModelBase(BaseTaskModel): def build_encoder(self): encoder = build_backbone( - self.cfg, + self.config, type_name=self._encoder_key_in_cfg, task_name=Tasks.backbone) setattr(self, self._encoder_prefix, encoder) @@ -461,7 +478,7 @@ class EncoderDecoderTaskModelBase(BaseTaskModel): def build_decoder(self): decoder = build_backbone( - self.cfg, + self.config, type_name=self._decoder_key_in_cfg, task_name=Tasks.backbone) setattr(self, self._decoder_prefix, decoder) diff --git a/modelscope/models/nlp/task_models/token_classification.py b/modelscope/models/nlp/task_models/token_classification.py index 5c22098f..f3930182 100644 --- a/modelscope/models/nlp/task_models/token_classification.py +++ b/modelscope/models/nlp/task_models/token_classification.py @@ -31,9 +31,6 @@ class TokenClassificationModel(SingleBackboneTaskModelBase): if 'base_model_prefix' in kwargs: self._base_model_prefix = kwargs['base_model_prefix'] - backbone_cfg = self.cfg.backbone - head_cfg = self.cfg.head - # get the num_labels num_labels = kwargs.get('num_labels') if num_labels is None: @@ -41,12 +38,12 @@ class TokenClassificationModel(SingleBackboneTaskModelBase): if label2id is not None and len(label2id) > 0: num_labels = len(label2id) self.id2label = {id: label for label, id in label2id.items()} - head_cfg['num_labels'] = num_labels + self.head_cfg['num_labels'] = num_labels - self.build_backbone(backbone_cfg) - self.build_head(head_cfg) + self.build_backbone(self.backbone_cfg) + self.build_head(self.head_cfg) - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: + def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: labels = None if OutputKeys.LABEL in input: labels = input.pop(OutputKeys.LABEL) @@ -71,10 +68,6 @@ class TokenClassificationModel(SingleBackboneTaskModelBase): sequence_output = self.backbone.extract_sequence_outputs(outputs) return sequence_output, pooled_output - def compute_loss(self, outputs, labels): - loss = self.head.compute_loss(outputs, labels) - return loss - def postprocess(self, input, **kwargs): logits = self.extract_logits(input) pred = torch.argmax(logits[0], dim=-1) diff --git a/modelscope/models/nlp/token_classification.py b/modelscope/models/nlp/token_classification.py index c3723a61..c63e8037 100644 --- a/modelscope/models/nlp/token_classification.py +++ b/modelscope/models/nlp/token_classification.py @@ -10,12 +10,13 @@ from torch import nn from modelscope.metainfo import Models from modelscope.models.base import TorchModel from modelscope.models.builder import MODELS +from modelscope.models.nlp.bert import BertPreTrainedModel +from modelscope.models.nlp.structbert import SbertPreTrainedModel from modelscope.outputs import OutputKeys from modelscope.utils.constant import Tasks from modelscope.utils.hub import parse_label_mapping from modelscope.utils.tensor_utils import (torch_nested_detach, torch_nested_numpify) -from .structbert import SbertPreTrainedModel __all__ = ['SbertForTokenClassification'] @@ -171,3 +172,49 @@ class SbertForTokenClassification(TokenClassification, SbertPreTrainedModel): pretrained_model_name_or_path=kwargs.get('model_dir'), model_dir=kwargs.get('model_dir'), **model_args) + + +@MODELS.register_module(Tasks.word_segmentation, module_name=Models.bert) 
+@MODELS.register_module(Tasks.token_classification, module_name=Models.bert) +class BertForSequenceClassification(TokenClassification, BertPreTrainedModel): + """Bert token classification model. + + Inherited from TokenClassificationBase. + """ + base_model_prefix: str = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def __init__(self, config, model_dir): + if hasattr(config, 'base_model_prefix'): + BertForSequenceClassification.base_model_prefix = config.base_model_prefix + super().__init__(config, model_dir) + + def build_base_model(self): + from .bert import BertModel + return BertModel(self.config, add_pooling_layer=True) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + labels=labels, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs) diff --git a/modelscope/outputs.py b/modelscope/outputs.py index ce9e8d07..357afd07 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -417,6 +417,22 @@ TASK_OUTPUTS = { # } Tasks.fill_mask: [OutputKeys.TEXT], + # feature extraction result for single sample + # { + # "text_embedding": [[ + # [1.08599677e-04, 1.72710388e-05, 2.95618793e-05, 1.93638436e-04], + # [6.45841064e-05, 1.15997791e-04, 5.11605394e-05, 9.87020373e-01], + # [2.66957268e-05, 4.72324500e-05, 9.74208378e-05, 4.18022355e-05] + # ], + # [ + # [2.97343540e-05, 5.81317654e-05, 5.44203431e-05, 6.28319322e-05], + # [8.24327726e-05, 4.66077945e-05, 5.32869453e-05, 4.16190960e-05], + # [3.61441926e-05, 3.38475402e-05, 3.44323053e-05, 5.70138109e-05] + # ] + # ] + # } + Tasks.feature_extraction: [OutputKeys.TEXT_EMBEDDING], + # (Deprecated) dialog intent prediction result for single sample # {'output': {'prediction': array([2.62349960e-03, 4.12110658e-03, 4.12748595e-05, 3.77560973e-05, # 1.08599677e-04, 1.72710388e-05, 2.95618793e-05, 1.93638436e-04, diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 51d50d51..4f6873b0 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -52,8 +52,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_vit_object-detection_coco'), Tasks.image_denoising: (Pipelines.image_denoise, 'damo/cv_nafnet_image-denoise_sidd'), - Tasks.text_classification: (Pipelines.sentiment_analysis, - 'damo/bert-base-sst2'), + Tasks.text_classification: + (Pipelines.sentiment_classification, + 'damo/nlp_structbert_sentiment-classification_chinese-base'), Tasks.text_generation: (Pipelines.text_generation, 'damo/nlp_palm2.0_text-generation_chinese-base'), Tasks.zero_shot_classification: @@ -80,6 +81,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.ocr_detection: (Pipelines.ocr_detection, 'damo/cv_resnet18_ocr-detection-line-level_damo'), Tasks.fill_mask: (Pipelines.fill_mask, 'damo/nlp_veco_fill-mask-large'), + Tasks.feature_extraction: (Pipelines.feature_extraction, + 'damo/pert_feature-extraction_base-test'), Tasks.action_recognition: (Pipelines.action_recognition, 'damo/cv_TAdaConv_action-recognition'), Tasks.action_detection: (Pipelines.action_detection, diff --git 
a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index a8edc21a..5267b5b2 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -11,12 +11,13 @@ if TYPE_CHECKING: from .dialog_state_tracking_pipeline import DialogStateTrackingPipeline from .document_segmentation_pipeline import DocumentSegmentationPipeline from .faq_question_answering_pipeline import FaqQuestionAnsweringPipeline + from .feature_extraction_pipeline import FeatureExtractionPipeline from .fill_mask_pipeline import FillMaskPipeline from .fill_mask_ponet_pipeline import FillMaskPonetPipeline from .information_extraction_pipeline import InformationExtractionPipeline from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline - from .pair_sentence_classification_pipeline import PairSentenceClassificationPipeline - from .single_sentence_classification_pipeline import SingleSentenceClassificationPipeline + from .passage_ranking_pipeline import PassageRankingPipeline + from .sentence_embedding_pipeline import SentenceEmbeddingPipeline from .sequence_classification_pipeline import SequenceClassificationPipeline from .summarization_pipeline import SummarizationPipeline from .text_classification_pipeline import TextClassificationPipeline @@ -27,8 +28,7 @@ if TYPE_CHECKING: from .translation_pipeline import TranslationPipeline from .word_segmentation_pipeline import WordSegmentationPipeline from .zero_shot_classification_pipeline import ZeroShotClassificationPipeline - from .passage_ranking_pipeline import PassageRankingPipeline - from .sentence_embedding_pipeline import SentenceEmbeddingPipeline + else: _import_structure = { 'conversational_text_to_sql_pipeline': @@ -41,16 +41,15 @@ else: 'dialog_state_tracking_pipeline': ['DialogStateTrackingPipeline'], 'document_segmentation_pipeline': ['DocumentSegmentationPipeline'], 'faq_question_answering_pipeline': ['FaqQuestionAnsweringPipeline'], + 'feature_extraction_pipeline': ['FeatureExtractionPipeline'], 'fill_mask_pipeline': ['FillMaskPipeline'], 'fill_mask_ponet_pipeline': ['FillMaskPoNetPipeline'], + 'information_extraction_pipeline': ['InformationExtractionPipeline'], 'named_entity_recognition_pipeline': ['NamedEntityRecognitionPipeline'], - 'information_extraction_pipeline': ['InformationExtractionPipeline'], - 'pair_sentence_classification_pipeline': - ['PairSentenceClassificationPipeline'], + 'passage_ranking_pipeline': ['PassageRankingPipeline'], + 'sentence_embedding_pipeline': ['SentenceEmbeddingPipeline'], 'sequence_classification_pipeline': ['SequenceClassificationPipeline'], - 'single_sentence_classification_pipeline': - ['SingleSentenceClassificationPipeline'], 'summarization_pipeline': ['SummarizationPipeline'], 'text_classification_pipeline': ['TextClassificationPipeline'], 'text_error_correction_pipeline': ['TextErrorCorrectionPipeline'], @@ -61,8 +60,6 @@ else: 'word_segmentation_pipeline': ['WordSegmentationPipeline'], 'zero_shot_classification_pipeline': ['ZeroShotClassificationPipeline'], - 'passage_ranking_pipeline': ['PassageRankingPipeline'], - 'sentence_embedding_pipeline': ['SentenceEmbeddingPipeline'] } import sys diff --git a/modelscope/pipelines/nlp/feature_extraction_pipeline.py b/modelscope/pipelines/nlp/feature_extraction_pipeline.py new file mode 100644 index 00000000..3af0c28d --- /dev/null +++ b/modelscope/pipelines/nlp/feature_extraction_pipeline.py @@ -0,0 +1,82 @@ +import os +from typing import Any, Dict, Optional, Union + +import torch + +from modelscope.metainfo 
import Pipelines +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import NLPPreprocessor, Preprocessor +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks + +__all__ = ['FeatureExtractionPipeline'] + + +@PIPELINES.register_module( + Tasks.feature_extraction, module_name=Pipelines.feature_extraction) +class FeatureExtractionPipeline(Pipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + first_sequence='sentence', + **kwargs): + """Use `model` and `preprocessor` to create a nlp feature extraction pipeline for prediction + + Args: + model (str or Model): Supply either a local model dir which supported feature extraction task, or a + no-head model id from the model hub, or a torch model instance. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. + first_sequence: The key to read the sentence in. + sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. + + NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' + param will have no effect. + + Example: + >>> from modelscope.pipelines import pipeline + >>> pipe_ins = pipeline('feature_extraction', model='damo/nlp_structbert_feature-extraction_english-large') + >>> input = 'Everything you love is treasure' + >>> print(pipe_ins(input)) + + + """ + model = model if isinstance(model, + Model) else Model.from_pretrained(model) + + if preprocessor is None: + preprocessor = NLPPreprocessor( + model.model_dir, + padding=kwargs.pop('padding', False), + sequence_length=kwargs.pop('sequence_length', 128)) + model.eval() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + + self.preprocessor = preprocessor + self.config = Config.from_file( + os.path.join(model.model_dir, ModelFile.CONFIGURATION)) + self.tokenizer = preprocessor.tokenizer + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return self.model(**inputs, **forward_params) + + def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]: + """process the prediction results + + Args: + inputs (Dict[str, Any]): _description_ + + Returns: + Dict[str, str]: the prediction results + """ + + return { + OutputKeys.TEXT_EMBEDDING: + inputs[OutputKeys.TEXT_EMBEDDING].tolist() + } diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py index 12f4b80f..3d515e2d 100644 --- a/modelscope/pipelines/nlp/fill_mask_pipeline.py +++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py @@ -10,7 +10,7 @@ from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import FillMaskPreprocessor, Preprocessor +from modelscope.preprocessors import NLPPreprocessor, Preprocessor from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks @@ -57,7 +57,7 @@ class FillMaskPipeline(Pipeline): model, Model) else Model.from_pretrained(model) if preprocessor is None: - preprocessor = FillMaskPreprocessor( + preprocessor = NLPPreprocessor( fill_mask_model.model_dir, 
first_sequence=first_sequence, second_sequence=None, @@ -118,7 +118,10 @@ class FillMaskPipeline(Pipeline): logits = inputs[OutputKeys.LOGITS].detach().cpu().numpy() input_ids = inputs[OutputKeys.INPUT_IDS].detach().cpu().numpy() pred_ids = np.argmax(logits, axis=-1) - model_type = self.model.config.model_type + if hasattr(self.model.config, 'backbone'): + model_type = self.model.config.backbone.type + else: + model_type = self.model.config.model_type process_type = model_type if model_type in self.mask_id else _type_map[ model_type] rst_ids = np.where(input_ids == self.mask_id[process_type], pred_ids, diff --git a/modelscope/pipelines/nlp/information_extraction_pipeline.py b/modelscope/pipelines/nlp/information_extraction_pipeline.py index 07223d07..763e941c 100644 --- a/modelscope/pipelines/nlp/information_extraction_pipeline.py +++ b/modelscope/pipelines/nlp/information_extraction_pipeline.py @@ -36,7 +36,7 @@ class InformationExtractionPipeline(Pipeline): def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: with torch.no_grad(): - return super().forward(inputs, **forward_params) + return self.model(**inputs, **forward_params) def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]: diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py index 467d7aba..7275feca 100644 --- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py +++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py @@ -9,7 +9,8 @@ from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import NERPreprocessor, Preprocessor +from modelscope.preprocessors import (Preprocessor, + TokenClassificationPreprocessor) from modelscope.utils.constant import Tasks __all__ = ['NamedEntityRecognitionPipeline'] @@ -46,7 +47,7 @@ class NamedEntityRecognitionPipeline(Pipeline): model = model if isinstance(model, Model) else Model.from_pretrained(model) if preprocessor is None: - preprocessor = NERPreprocessor( + preprocessor = TokenClassificationPreprocessor( model.model_dir, sequence_length=kwargs.pop('sequence_length', 512)) model.eval() diff --git a/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py deleted file mode 100644 index bdb75c73..00000000 --- a/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
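A side note on the fill-mask postprocess change above: the mask-token lookup now handles both the new backbone-style task models and plain transformers configs. A rough illustration of that branching, using hypothetical stand-in config objects rather than real ModelScope classes:

>>> class _Backbone:              # stand-in for a task-model backbone config
...     type = 'bert'
>>> class _TaskModelConfig:       # config carrying a nested backbone section
...     backbone = _Backbone()
>>> class _PlainConfig:           # ordinary transformers-style config
...     model_type = 'veco'
>>> def resolve_model_type(config):
...     # mirrors FillMaskPipeline.postprocess: prefer backbone.type when present
...     return config.backbone.type if hasattr(config, 'backbone') else config.model_type
>>> resolve_model_type(_TaskModelConfig())
'bert'
>>> resolve_model_type(_PlainConfig())
'veco'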
- -from typing import Union - -from modelscope.models.base import Model -from ...metainfo import Pipelines -from ...preprocessors import (PairSentenceClassificationPreprocessor, - Preprocessor) -from ...utils.constant import Tasks -from ..builder import PIPELINES -from .sequence_classification_pipeline_base import \ - SequenceClassificationPipelineBase - -__all__ = ['PairSentenceClassificationPipeline'] - - -@PIPELINES.register_module(Tasks.nli, module_name=Pipelines.nli) -@PIPELINES.register_module( - Tasks.sentence_similarity, module_name=Pipelines.sentence_similarity) -class PairSentenceClassificationPipeline(SequenceClassificationPipelineBase): - - def __init__(self, - model: Union[Model, str], - preprocessor: Preprocessor = None, - first_sequence='first_sequence', - second_sequence='second_sequence', - **kwargs): - """Use `model` and `preprocessor` to create a nlp pair sequence classification pipeline for prediction. - - Args: - model (str or Model): Supply either a local model dir which supported the sequence classification task, - or a model id from the model hub, or a torch model instance. - preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for - the model if supplied. - first_sequence: The key to read the first sentence in. - second_sequence: The key to read the second sentence in. - sequence_length: Max sequence length in the user's custom scenario. 512 will be used as a default value. - - NOTE: Inputs of type 'tuple' or 'list' are also supported. In this scenario, the 'first_sequence' and - 'second_sequence' param will have no effect. - - Example: - >>> from modelscope.pipelines import pipeline - >>> pipeline_ins = pipeline(task='nli', model='damo/nlp_structbert_nli_chinese-base') - >>> sentence1 = '四川商务职业学院和四川财经职业学院哪个好?' - >>> sentence2 = '四川商务职业学院商务管理在哪个校区?' - >>> print(pipeline_ins((sentence1, sentence2))) - >>> # Or use the dict input: - >>> print(pipeline_ins({'first_sequence': sentence1, 'second_sequence': sentence2})) - - To view other examples plese check the tests/pipelines/test_nli.py. 
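[Editor's sketch] The pair-sentence usage described in the removed docstring above is unchanged at the call site after this refactor; a minimal sketch, reusing the model id and example sentences from that docstring, of the same call now served by the consolidated SequenceClassificationPipeline:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Both input forms from the removed docstring still work; the nli/sentence-similarity
# registrations now point at SequenceClassificationPipeline instead of the deleted class.
pipeline_ins = pipeline(task=Tasks.nli, model='damo/nlp_structbert_nli_chinese-base')
sentence1 = '四川商务职业学院和四川财经职业学院哪个好？'
sentence2 = '四川商务职业学院商务管理在哪个校区？'
print(pipeline_ins((sentence1, sentence2)))
print(pipeline_ins({'first_sequence': sentence1, 'second_sequence': sentence2}))
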
- """ - if preprocessor is None: - preprocessor = PairSentenceClassificationPreprocessor( - model.model_dir if isinstance(model, Model) else model, - first_sequence=first_sequence, - second_sequence=second_sequence, - sequence_length=kwargs.pop('sequence_length', 512)) - super().__init__(model=model, preprocessor=preprocessor, **kwargs) diff --git a/modelscope/pipelines/nlp/sequence_classification_pipeline.py b/modelscope/pipelines/nlp/sequence_classification_pipeline.py index 7fe8aace..8d0e1dcd 100644 --- a/modelscope/pipelines/nlp/sequence_classification_pipeline.py +++ b/modelscope/pipelines/nlp/sequence_classification_pipeline.py @@ -1,48 +1,64 @@ from typing import Any, Dict, Union import numpy as np +import torch from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.models.nlp import BertForSequenceClassification +from modelscope.models.base import Model from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import SequenceClassificationPreprocessor +from modelscope.preprocessors import (Preprocessor, + SequenceClassificationPreprocessor) from modelscope.utils.constant import Tasks -__all__ = ['SequenceClassificationPipeline'] - @PIPELINES.register_module( Tasks.text_classification, module_name=Pipelines.sentiment_analysis) +@PIPELINES.register_module(Tasks.nli, module_name=Pipelines.nli) +@PIPELINES.register_module( + Tasks.sentence_similarity, module_name=Pipelines.sentence_similarity) +@PIPELINES.register_module( + Tasks.text_classification, module_name=Pipelines.sentiment_classification) class SequenceClassificationPipeline(Pipeline): def __init__(self, - model: Union[BertForSequenceClassification, str], - preprocessor: SequenceClassificationPreprocessor = None, + model: Union[Model, str], + preprocessor: Preprocessor = None, **kwargs): - """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction + """This is the base class for all the sequence classification sub-tasks. Args: - model (BertForSequenceClassification): a model instance - preprocessor (SequenceClassificationPreprocessor): a preprocessor instance + model (str or Model): A model instance or a model local dir or a model id in the model hub. + preprocessor (Preprocessor): a preprocessor instance, must not be None. """ - assert isinstance(model, str) or isinstance(model, BertForSequenceClassification), \ - 'model must be a single str or BertForSequenceClassification' - sc_model = model if isinstance( - model, - BertForSequenceClassification) else Model.from_pretrained(model) + assert isinstance(model, str) or isinstance(model, Model), \ + 'model must be a single str or Model' + model = model if isinstance(model, + Model) else Model.from_pretrained(model) + first_sequence = kwargs.pop('first_sequence', 'first_sequence') + second_sequence = kwargs.pop('second_sequence', None) + if preprocessor is None: preprocessor = SequenceClassificationPreprocessor( - sc_model.model_dir, - first_sequence='sentence', - second_sequence=None, + model.model_dir if isinstance(model, Model) else model, + first_sequence=first_sequence, + second_sequence=second_sequence, sequence_length=kwargs.pop('sequence_length', 512)) - super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs) - assert hasattr(self.model, 'id2label'), \ - 'id2label map should be initalizaed in init function.' 
+ assert preprocessor is not None + model.eval() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.id2label = kwargs.get('id2label') + if self.id2label is None and hasattr(self.preprocessor, 'id2label'): + self.id2label = self.preprocessor.id2label + assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ + 'as a parameter or make sure the preprocessor has the attribute.' + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return self.model(**inputs, **forward_params) def postprocess(self, inputs: Dict[str, Any], @@ -50,20 +66,18 @@ class SequenceClassificationPipeline(Pipeline): """process the prediction results Args: - inputs (Dict[str, Any]): input data dict - topk (int): return topk classification result. - + inputs (Dict[str, Any]): _description_ + topk (int): The topk probs to take Returns: Dict[str, str]: the prediction results """ - # NxC np.ndarray - probs = inputs['probs'][0] + + probs = inputs[OutputKeys.PROBABILITIES][0] num_classes = probs.shape[0] topk = min(topk, num_classes) top_indices = np.argpartition(probs, -topk)[-topk:] cls_ids = top_indices[np.argsort(probs[top_indices])] probs = probs[cls_ids].tolist() - cls_names = [self.model.id2label[cid] for cid in cls_ids] - + cls_names = [self.id2label[cid] for cid in cls_ids] return {OutputKeys.SCORES: probs, OutputKeys.LABELS: cls_names} diff --git a/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py b/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py deleted file mode 100644 index 3d8e8fea..00000000 --- a/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -from typing import Any, Dict, Union - -import numpy as np -import torch - -from modelscope.models.base import Model -from modelscope.outputs import OutputKeys -from ...preprocessors import Preprocessor -from ..base import Pipeline - - -class SequenceClassificationPipelineBase(Pipeline): - - def __init__(self, model: Union[Model, str], preprocessor: Preprocessor, - **kwargs): - """This is the base class for all the sequence classification sub-tasks. - - Args: - model (str or Model): A model instance or a model local dir or a model id in the model hub. - preprocessor (Preprocessor): a preprocessor instance, must not be None. - """ - assert isinstance(model, str) or isinstance(model, Model), \ - 'model must be a single str or Model' - model = model if isinstance(model, - Model) else Model.from_pretrained(model) - assert preprocessor is not None - model.eval() - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - self.id2label = kwargs.get('id2label') - if self.id2label is None and hasattr(self.preprocessor, 'id2label'): - self.id2label = self.preprocessor.id2label - assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ - 'as a parameter or make sure the preprocessor has the attribute.' 
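[Editor's sketch] A standalone illustration of the top-k selection used in the new postprocess above, with made-up probabilities and a hypothetical id2label map; np.argpartition returns the top-k class ids unordered, and the follow-up argsort puts them in ascending probability order:

import numpy as np

probs = np.array([0.05, 0.7, 0.1, 0.15])                                # assumed class probabilities
id2label = {0: 'neutral', 1: 'positive', 2: 'surprise', 3: 'negative'}  # hypothetical mapping
topk = min(3, probs.shape[0])
top_indices = np.argpartition(probs, -topk)[-topk:]    # unordered indices of the k largest probs
cls_ids = top_indices[np.argsort(probs[top_indices])]  # re-sorted, ascending by probability
print(probs[cls_ids].tolist())                         # -> [0.1, 0.15, 0.7]
print([id2label[i] for i in cls_ids])                  # -> ['surprise', 'negative', 'positive']
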
- - def forward(self, inputs: Dict[str, Any], - **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return self.model(**inputs, **forward_params) - - def postprocess(self, - inputs: Dict[str, Any], - topk: int = 5) -> Dict[str, str]: - """process the prediction results - - Args: - inputs (Dict[str, Any]): _description_ - topk (int): The topk probs to take - Returns: - Dict[str, str]: the prediction results - """ - - probs = inputs[OutputKeys.PROBABILITIES][0] - num_classes = probs.shape[0] - topk = min(topk, num_classes) - top_indices = np.argpartition(probs, -topk)[-topk:] - cls_ids = top_indices[np.argsort(probs[top_indices])] - probs = probs[cls_ids].tolist() - - cls_names = [self.id2label[cid] for cid in cls_ids] - return {OutputKeys.SCORES: probs, OutputKeys.LABELS: cls_names} diff --git a/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py deleted file mode 100644 index 0a2f6d25..00000000 --- a/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -from typing import Union - -from ...metainfo import Pipelines -from ...models import Model -from ...preprocessors import (Preprocessor, - SingleSentenceClassificationPreprocessor) -from ...utils.constant import Tasks -from ..builder import PIPELINES -from .sequence_classification_pipeline_base import \ - SequenceClassificationPipelineBase - -__all__ = ['SingleSentenceClassificationPipeline'] - - -@PIPELINES.register_module( - Tasks.sentiment_classification, - module_name=Pipelines.sentiment_classification) -class SingleSentenceClassificationPipeline(SequenceClassificationPipelineBase): - - def __init__(self, - model: Union[Model, str], - preprocessor: Preprocessor = None, - first_sequence='first_sequence', - **kwargs): - """Use `model` and `preprocessor` to create a nlp single sequence classification pipeline for prediction. - - Args: - model (str or Model): Supply either a local model dir which supported the sequence classification task, - or a model id from the model hub, or a torch model instance. - preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for - the model if supplied. - first_sequence: The key to read the first sentence in. - sequence_length: Max sequence length in the user's custom scenario. 512 will be used as a default value. - - NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' - param will have no effect. - - Example: - >>> from modelscope.pipelines import pipeline - >>> pipeline_ins = pipeline(task='sentiment-classification', - >>> model='damo/nlp_structbert_sentiment-classification_chinese-base') - >>> sentence1 = '启动的时候很大声音,然后就会听到1.2秒的卡察的声音,类似齿轮摩擦的声音' - >>> print(pipeline_ins(sentence1)) - >>> # Or use the dict input: - >>> print(pipeline_ins({'first_sequence': sentence1})) - - To view other examples plese check the tests/pipelines/test_sentiment-classification.py. 
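[Editor's sketch] For the single-sentence case removed here, the consolidated call path looks roughly like the following, mirroring the updated sentiment-classification test further below; the 'beta' revision is copied from that test and may not be required for other checkouts:

from modelscope.models import Model
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# The dedicated single-sentence pipeline is gone; the generic text-classification task
# now routes to SequenceClassificationPipeline.
model = Model.from_pretrained(
    'damo/nlp_structbert_sentiment-classification_chinese-base', revision='beta')
pipeline_ins = pipeline(task=Tasks.text_classification, model=model)
print(pipeline_ins('启动的时候很大声音,然后就会听到1.2秒的卡察的声音,类似齿轮摩擦的声音'))
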
- """ - if preprocessor is None: - preprocessor = SingleSentenceClassificationPreprocessor( - model.model_dir if isinstance(model, Model) else model, - first_sequence=first_sequence, - sequence_length=kwargs.pop('sequence_length', 512)) - super().__init__(model=model, preprocessor=preprocessor, **kwargs) diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py index aabf48d8..5367c1a8 100644 --- a/modelscope/pipelines/nlp/token_classification_pipeline.py +++ b/modelscope/pipelines/nlp/token_classification_pipeline.py @@ -49,7 +49,7 @@ class TokenClassificationPipeline(Pipeline): text = inputs.pop(OutputKeys.TEXT) with torch.no_grad(): return { - **self.model(inputs, **forward_params), OutputKeys.TEXT: text + **self.model(**inputs, **forward_params), OutputKeys.TEXT: text } def postprocess(self, inputs: Dict[str, Any], diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index b4be1845..90303b65 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -16,17 +16,23 @@ if TYPE_CHECKING: from .kws import WavToLists from .multi_modal import (OfaPreprocessor, MPlugPreprocessor) from .nlp import ( - Tokenize, SequenceClassificationPreprocessor, - TextGenerationPreprocessor, TokenClassificationPreprocessor, - SingleSentenceClassificationPreprocessor, - PairSentenceClassificationPreprocessor, FillMaskPreprocessor, - ZeroShotClassificationPreprocessor, NERPreprocessor, - TextErrorCorrectionPreprocessor, FaqQuestionAnsweringPreprocessor, - SequenceLabelingPreprocessor, RelationExtractionPreprocessor, - DocumentSegmentationPreprocessor, FillMaskPoNetPreprocessor, - PassageRankingPreprocessor, SentenceEmbeddingPreprocessor, + DocumentSegmentationPreprocessor, + FaqQuestionAnsweringPreprocessor, + FillMaskPoNetPreprocessor, + NLPPreprocessor, + NLPTokenizerPreprocessorBase, + PassageRankingPreprocessor, + RelationExtractionPreprocessor, + SentenceEmbeddingPreprocessor, + SequenceClassificationPreprocessor, + TokenClassificationPreprocessor, + TextErrorCorrectionPreprocessor, + TextGenerationPreprocessor, Text2TextGenerationPreprocessor, - WordSegmentationBlankSetToLabelPreprocessor) + Tokenize, + WordSegmentationBlankSetToLabelPreprocessor, + ZeroShotClassificationPreprocessor, + ) from .space import (DialogIntentPredictionPreprocessor, DialogModelingPreprocessor, DialogStateTrackingPreprocessor) @@ -49,18 +55,22 @@ else: 'kws': ['WavToLists'], 'multi_modal': ['OfaPreprocessor', 'MPlugPreprocessor'], 'nlp': [ - 'Tokenize', 'SequenceClassificationPreprocessor', - 'TextGenerationPreprocessor', 'TokenClassificationPreprocessor', - 'SingleSentenceClassificationPreprocessor', - 'PairSentenceClassificationPreprocessor', 'FillMaskPreprocessor', - 'ZeroShotClassificationPreprocessor', 'NERPreprocessor', - 'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor', - 'TextErrorCorrectionPreprocessor', - 'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor', + 'DocumentSegmentationPreprocessor', + 'FaqQuestionAnsweringPreprocessor', + 'FillMaskPoNetPreprocessor', + 'NLPPreprocessor', + 'NLPTokenizerPreprocessorBase', + 'PassageRankingPreprocessor', 'RelationExtractionPreprocessor', + 'SentenceEmbeddingPreprocessor', + 'SequenceClassificationPreprocessor', + 'TokenClassificationPreprocessor', + 'TextErrorCorrectionPreprocessor', + 'TextGenerationPreprocessor', + 'Tokenize', 'Text2TextGenerationPreprocessor', 
'WordSegmentationBlankSetToLabelPreprocessor', - 'DocumentSegmentationPreprocessor', 'FillMaskPoNetPreprocessor' + 'ZeroShotClassificationPreprocessor', ], 'space': [ 'DialogIntentPredictionPreprocessor', 'DialogModelingPreprocessor', diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index 8e75ae98..dfbb5c81 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -6,32 +6,41 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .text_error_correction import TextErrorCorrectionPreprocessor from .nlp_base import ( - Tokenize, SequenceClassificationPreprocessor, - TextGenerationPreprocessor, TokenClassificationPreprocessor, - SingleSentenceClassificationPreprocessor, - Text2TextGenerationPreprocessor, - PairSentenceClassificationPreprocessor, FillMaskPreprocessor, - ZeroShotClassificationPreprocessor, NERPreprocessor, - FaqQuestionAnsweringPreprocessor, SequenceLabelingPreprocessor, - RelationExtractionPreprocessor, DocumentSegmentationPreprocessor, - FillMaskPoNetPreprocessor, PassageRankingPreprocessor, + DocumentSegmentationPreprocessor, + FaqQuestionAnsweringPreprocessor, + FillMaskPoNetPreprocessor, + NLPPreprocessor, + NLPTokenizerPreprocessorBase, + PassageRankingPreprocessor, + RelationExtractionPreprocessor, SentenceEmbeddingPreprocessor, - WordSegmentationBlankSetToLabelPreprocessor) + SequenceClassificationPreprocessor, + TokenClassificationPreprocessor, + TextGenerationPreprocessor, + Text2TextGenerationPreprocessor, + Tokenize, + WordSegmentationBlankSetToLabelPreprocessor, + ZeroShotClassificationPreprocessor, + ) else: _import_structure = { 'nlp_base': [ - 'Tokenize', 'SequenceClassificationPreprocessor', - 'TextGenerationPreprocessor', 'TokenClassificationPreprocessor', - 'SingleSentenceClassificationPreprocessor', - 'PairSentenceClassificationPreprocessor', 'FillMaskPreprocessor', - 'ZeroShotClassificationPreprocessor', 'NERPreprocessor', - 'SentenceEmbeddingPreprocessor', 'PassageRankingPreprocessor', - 'FaqQuestionAnsweringPreprocessor', 'SequenceLabelingPreprocessor', + 'DocumentSegmentationPreprocessor', + 'FaqQuestionAnsweringPreprocessor', + 'FillMaskPoNetPreprocessor', + 'NLPPreprocessor', + 'NLPTokenizerPreprocessorBase', + 'PassageRankingPreprocessor', 'RelationExtractionPreprocessor', + 'SentenceEmbeddingPreprocessor', + 'SequenceClassificationPreprocessor', + 'TokenClassificationPreprocessor', + 'TextGenerationPreprocessor', + 'Tokenize', 'Text2TextGenerationPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', - 'DocumentSegmentationPreprocessor', 'FillMaskPoNetPreprocessor' + 'ZeroShotClassificationPreprocessor', ], 'text_error_correction': [ 'TextErrorCorrectionPreprocessor', diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index d6325eed..6b559de9 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -2,14 +2,13 @@ import os.path as osp import re -import uuid from typing import Any, Dict, Iterable, Optional, Tuple, Union import numpy as np -from transformers import AutoTokenizer, BertTokenizerFast +import torch +from transformers import AutoTokenizer from modelscope.metainfo import Models, Preprocessors -from modelscope.models.nlp.structbert import SbertTokenizerFast from modelscope.outputs import OutputKeys from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS @@ -23,24 
+22,21 @@ from modelscope.utils.type_assert import type_assert logger = get_logger() __all__ = [ - 'Tokenize', - 'SequenceClassificationPreprocessor', - 'TextGenerationPreprocessor', - 'TokenClassificationPreprocessor', - 'PairSentenceClassificationPreprocessor', - 'Text2TextGenerationPreprocessor', - 'SingleSentenceClassificationPreprocessor', - 'FillMaskPreprocessor', - 'ZeroShotClassificationPreprocessor', - 'NERPreprocessor', - 'SentenceEmbeddingPreprocessor', - 'PassageRankingPreprocessor', - 'FaqQuestionAnsweringPreprocessor', - 'SequenceLabelingPreprocessor', - 'RelationExtractionPreprocessor', 'DocumentSegmentationPreprocessor', + 'FaqQuestionAnsweringPreprocessor', + 'NLPPreprocessor', 'FillMaskPoNetPreprocessor', + 'NLPTokenizerPreprocessorBase', + 'PassageRankingPreprocessor', + 'RelationExtractionPreprocessor', + 'SentenceEmbeddingPreprocessor', + 'SequenceClassificationPreprocessor', + 'TokenClassificationPreprocessor', + 'Text2TextGenerationPreprocessor', + 'TextGenerationPreprocessor', + 'Tokenize', 'WordSegmentationBlankSetToLabelPreprocessor', + 'ZeroShotClassificationPreprocessor', ] @@ -48,85 +44,19 @@ __all__ = [ class Tokenize(Preprocessor): def __init__(self, tokenizer_name) -> None: - self._tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) def __call__(self, data: Union[str, Dict[str, Any]]) -> Dict[str, Any]: if isinstance(data, str): data = {InputFields.text: data} - token_dict = self._tokenizer(data[InputFields.text]) + token_dict = self.tokenizer(data[InputFields.text]) data.update(token_dict) return data -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer) -class SequenceClassificationPreprocessor(Preprocessor): - - def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data - - Args: - model_dir (str): model path - """ - - super().__init__(*args, **kwargs) - - from easynlp.modelzoo import AutoTokenizer - self.model_dir: str = model_dir - self.first_sequence: str = kwargs.pop('first_sequence', - 'first_sequence') - self.second_sequence = kwargs.pop('second_sequence', 'second_sequence') - self.sequence_length = kwargs.pop('sequence_length', 128) - - self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir) - print(f'this is the tokenzier {self.tokenizer}') - self.label2id = parse_label_mapping(self.model_dir) - - @type_assert(object, (str, tuple, Dict)) - def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]: - feature = super().__call__(data) - if isinstance(data, str): - new_data = {self.first_sequence: data} - elif isinstance(data, tuple): - sentence1, sentence2 = data - new_data = { - self.first_sequence: sentence1, - self.second_sequence: sentence2 - } - else: - new_data = data - - # preprocess the data for the model input - - rst = { - 'id': [], - 'input_ids': [], - 'attention_mask': [], - 'token_type_ids': [], - } - - max_seq_length = self.sequence_length - - text_a = new_data[self.first_sequence] - text_b = new_data.get(self.second_sequence, None) - - feature = self.tokenizer( - text_a, - text_b, - padding='max_length', - truncation=True, - max_length=max_seq_length) - - rst['id'].append(new_data.get('id', str(uuid.uuid4()))) - rst['input_ids'].append(feature['input_ids']) - rst['attention_mask'].append(feature['attention_mask']) - rst['token_type_ids'].append(feature['token_type_ids']) - return rst - - class NLPTokenizerPreprocessorBase(Preprocessor): - def __init__(self, model_dir: str, pair: bool, 
mode: str, **kwargs): + def __init__(self, model_dir: str, mode: str, **kwargs): """The NLP tokenizer preprocessor base class. Any nlp preprocessor which uses the hf tokenizer can inherit from this class. @@ -138,7 +68,6 @@ class NLPTokenizerPreprocessorBase(Preprocessor): label: The label key label2id: An optional label2id mapping, the class will try to call utils.parse_label_mapping if this mapping is not supplied. - pair (bool): Pair sentence input or single sentence input. mode: Run this preprocessor in either 'train'/'eval'/'inference' mode kwargs: These kwargs will be directly fed into the tokenizer. """ @@ -148,7 +77,8 @@ class NLPTokenizerPreprocessorBase(Preprocessor): self.first_sequence: str = kwargs.pop('first_sequence', 'first_sequence') self.second_sequence = kwargs.pop('second_sequence', 'second_sequence') - self.pair = pair + self.sequence_length = kwargs.pop('sequence_length', 128) + self._mode = mode self.label = kwargs.pop('label', OutputKeys.LABEL) self.label2id = None @@ -158,6 +88,7 @@ class NLPTokenizerPreprocessorBase(Preprocessor): self.label2id = parse_label_mapping(self.model_dir) self.tokenize_kwargs = kwargs + self.tokenizer = self.build_tokenizer(model_dir) @property @@ -179,20 +110,38 @@ class NLPTokenizerPreprocessorBase(Preprocessor): @param model_dir: The local model dir. @return: The initialized tokenizer. """ - + self.is_transformer_based_model = 'lstm' not in model_dir + # fast version lead to parallel inference failed model_type = get_model_type(model_dir) if model_type in (Models.structbert, Models.gpt3, Models.palm, Models.plug): - from modelscope.models.nlp.structbert import SbertTokenizer - return SbertTokenizer.from_pretrained(model_dir, use_fast=False) + from modelscope.models.nlp.structbert import SbertTokenizer, SbertTokenizerFast + return SbertTokenizer.from_pretrained( + model_dir + ) if self._mode == ModeKeys.INFERENCE else SbertTokenizerFast.from_pretrained( + model_dir) elif model_type == Models.veco: - from modelscope.models.nlp.veco import VecoTokenizer - return VecoTokenizer.from_pretrained(model_dir) + from modelscope.models.nlp.veco import VecoTokenizer, VecoTokenizerFast + return VecoTokenizer.from_pretrained( + model_dir + ) if self._mode == ModeKeys.INFERENCE else VecoTokenizerFast.from_pretrained( + model_dir) elif model_type == Models.deberta_v2: - from modelscope.models.nlp.deberta_v2 import DebertaV2Tokenizer - return DebertaV2Tokenizer.from_pretrained(model_dir) + from modelscope.models.nlp.deberta_v2 import DebertaV2Tokenizer, DebertaV2TokenizerFast + return DebertaV2Tokenizer.from_pretrained( + model_dir + ) if self._mode == ModeKeys.INFERENCE else DebertaV2TokenizerFast.from_pretrained( + model_dir) + elif not self.is_transformer_based_model: + from transformers import BertTokenizer, BertTokenizerFast + return BertTokenizer.from_pretrained( + model_dir + ) if self._mode == ModeKeys.INFERENCE else BertTokenizerFast.from_pretrained( + model_dir) else: - return AutoTokenizer.from_pretrained(model_dir, use_fast=False) + return AutoTokenizer.from_pretrained( + model_dir, + use_fast=False if self._mode == ModeKeys.INFERENCE else True) def __call__(self, data: Union[str, Tuple, Dict]) -> Dict[str, Any]: """process the raw input data @@ -239,7 +188,7 @@ class NLPTokenizerPreprocessorBase(Preprocessor): if len(data) == 3: text_a, text_b, labels = data elif len(data) == 2: - if self.pair: + if self._mode == ModeKeys.INFERENCE: text_a, text_b = data else: text_a, labels = data @@ -277,6 +226,22 @@ class 
NLPTokenizerPreprocessorBase(Preprocessor): output[OutputKeys.LABELS] = labels +@PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.fill_mask) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.feature_extraction) +class NLPPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in MLM task. + """ + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + True) + super().__init__(model_dir, mode=mode, **kwargs) + + @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.passage_ranking) class PassageRankingPreprocessor(NLPTokenizerPreprocessorBase): @@ -337,22 +302,12 @@ class PassageRankingPreprocessor(NLPTokenizerPreprocessorBase): Fields.nlp, module_name=Preprocessors.nli_tokenizer) @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.sen_sim_tokenizer) -class PairSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in pair sentence classification. - """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get( - 'padding', False if mode == ModeKeys.INFERENCE else 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, pair=True, mode=mode, **kwargs) - - +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer) @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer) -class SingleSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in single sentence classification. +class SequenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in sequence classification. 
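[Editor's sketch] With the `pair` flag removed, whether an input is a single sentence or a sentence pair is inferred from the data itself at inference time (a 2-tuple becomes text_a/text_b, a dict is read via first_sequence/second_sequence); a small sketch with the model directory as a placeholder:

from modelscope.preprocessors import SequenceClassificationPreprocessor

# '/path/to/model_dir' is a placeholder for a local StructBERT-style checkout.
preprocessor = SequenceClassificationPreprocessor('/path/to/model_dir')
single = preprocessor('启动的时候很大声音,然后就会听到1.2秒的卡察的声音,类似齿轮摩擦的声音')
pair = preprocessor(('四川商务职业学院和四川财经职业学院哪个好？',
                     '四川商务职业学院商务管理在哪个校区？'))  # a 2-tuple is treated as a sentence pair
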
""" def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): @@ -360,7 +315,7 @@ class SingleSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): kwargs['padding'] = kwargs.get( 'padding', False if mode == ModeKeys.INFERENCE else 'max_length') kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, pair=False, mode=mode, **kwargs) + super().__init__(model_dir, mode=mode, **kwargs) @PREPROCESSORS.register_module( @@ -421,7 +376,7 @@ class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase): model_dir (str): model path """ self.sequence_length = kwargs.pop('sequence_length', 512) - super().__init__(model_dir, pair=False, mode=mode, **kwargs) + super().__init__(model_dir, mode=mode, **kwargs) def __call__(self, data: Union[str, Dict], hypothesis_template: str, candidate_labels: list) -> Dict[str, Any]: @@ -496,14 +451,12 @@ class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): tokenizer=None, mode=ModeKeys.INFERENCE, **kwargs): - self.tokenizer = self.build_tokenizer( - model_dir) if tokenizer is None else tokenizer kwargs['truncation'] = kwargs.get('truncation', True) kwargs['padding'] = kwargs.get('padding', 'max_length') kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', False) kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, pair=False, mode=mode, **kwargs) + super().__init__(model_dir, mode=mode, **kwargs) @staticmethod def get_roberta_tokenizer_dir(model_dir: str) -> Optional[str]: @@ -541,20 +494,6 @@ class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): } -@PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.fill_mask) -class FillMaskPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in MLM task. - """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', - True) - super().__init__(model_dir, pair=False, mode=mode, **kwargs) - - @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.word_segment_text_to_label_preprocessor) @@ -592,21 +531,40 @@ class WordSegmentationBlankSetToLabelPreprocessor(Preprocessor): } +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.ner_tokenizer) @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.token_cls_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sequence_labeling_tokenizer) class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in normal token classification task. + """The tokenizer preprocessor used in normal NER task. 
""" def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + """ kwargs['truncation'] = kwargs.get('truncation', True) kwargs['padding'] = kwargs.get( 'padding', False if mode == ModeKeys.INFERENCE else 'max_length') kwargs['max_length'] = kwargs.pop('sequence_length', 128) self.label_all_tokens = kwargs.pop('label_all_tokens', False) - super().__init__(model_dir, pair=False, mode=mode, **kwargs) + super().__init__(model_dir, mode=mode, **kwargs) - def __call__(self, data: Union[str, Dict]) -> Dict[str, Any]: + if 'is_split_into_words' in kwargs: + self.is_split_into_words = kwargs.pop('is_split_into_words') + else: + self.is_split_into_words = self.tokenizer.init_kwargs.get( + 'is_split_into_words', False) + if 'label2id' in kwargs: + kwargs.pop('label2id') + self.tokenize_kwargs = kwargs + + @type_assert(object, str) + def __call__(self, data: str) -> Dict[str, Any]: """process the raw input data Args: @@ -618,23 +576,84 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): Dict[str, Any]: the preprocessed data """ - text_a = None + # preprocess the data for the model input + text = None labels_list = None if isinstance(data, str): - text_a = data + text = data elif isinstance(data, dict): - text_a = data.get(self.first_sequence) + text = data.get(self.first_sequence) labels_list = data.get(self.label) - if isinstance(text_a, str): - text_a = text_a.replace(' ', '').strip() + input_ids = [] + label_mask = [] + offset_mapping = [] + if self.is_split_into_words: + for offset, token in enumerate(list(data)): + subtoken_ids = self.tokenizer.encode( + token, add_special_tokens=False) + if len(subtoken_ids) == 0: + subtoken_ids = [self.tokenizer.unk_token_id] + input_ids.extend(subtoken_ids) + label_mask.extend([1] + [0] * (len(subtoken_ids) - 1)) + offset_mapping.extend([(offset, offset + 1)]) + else: + if self.tokenizer.is_fast: + encodings = self.tokenizer( + text, + add_special_tokens=False, + return_offsets_mapping=True, + **self.tokenize_kwargs) + input_ids = encodings['input_ids'] + word_ids = encodings.word_ids() + for i in range(len(word_ids)): + if word_ids[i] is None: + label_mask.append(0) + elif word_ids[i] == word_ids[i - 1]: + label_mask.append(0) + offset_mapping[-1] = ( + offset_mapping[-1][0], + encodings['offset_mapping'][i][1]) + else: + label_mask.append(1) + offset_mapping.append(encodings['offset_mapping'][i]) + else: + encodings = self.tokenizer( + text, add_special_tokens=False, **self.tokenize_kwargs) + input_ids = encodings['input_ids'] + label_mask, offset_mapping = self.get_label_mask_and_offset_mapping( + text) - tokenized_inputs = self.tokenizer( - [t for t in text_a], - return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, - is_split_into_words=True, - **self.tokenize_kwargs) + if len(input_ids) >= self.sequence_length - 2: + input_ids = input_ids[:self.sequence_length - 2] + label_mask = label_mask[:self.sequence_length - 2] + input_ids = [self.tokenizer.cls_token_id + ] + input_ids + [self.tokenizer.sep_token_id] + label_mask = [0] + label_mask + [0] + attention_mask = [1] * len(input_ids) + offset_mapping = offset_mapping[:sum(label_mask)] + if not self.is_transformer_based_model: + input_ids = input_ids[1:-1] + attention_mask = attention_mask[1:-1] + label_mask = label_mask[1:-1] + + if self._mode == ModeKeys.INFERENCE: + input_ids = torch.tensor(input_ids).unsqueeze(0) + attention_mask = torch.tensor(attention_mask).unsqueeze(0) + label_mask = 
torch.tensor( + label_mask, dtype=torch.bool).unsqueeze(0) + + # the token classification + output = { + 'text': text, + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'label_mask': label_mask, + 'offset_mapping': offset_mapping + } + + # align the labels with tokenized text if labels_list is not None: assert self.label2id is not None # Map that sends B-Xxx label to its I-Xxx counterpart @@ -653,7 +672,6 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): b_to_i_label.append(idx) label_row = [self.label2id[lb] for lb in labels_list] - word_ids = tokenized_inputs.word_ids() previous_word_idx = None label_ids = [] for word_idx in word_ids: @@ -668,229 +686,66 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): label_ids.append(-100) previous_word_idx = word_idx labels = label_ids - tokenized_inputs['labels'] = labels - # new code end + output['labels'] = labels + return output - if self._mode == ModeKeys.INFERENCE: - tokenized_inputs[OutputKeys.TEXT] = text_a - return tokenized_inputs + def get_tokenizer_class(self): + tokenizer_class = self.tokenizer.__class__.__name__ + if tokenizer_class.endswith( + 'Fast') and tokenizer_class != 'PreTrainedTokenizerFast': + tokenizer_class = tokenizer_class[:-4] + return tokenizer_class - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.ner_tokenizer) -class NERPreprocessor(Preprocessor): - """The tokenizer preprocessor used in normal NER task. - - NOTE: This preprocessor may be merged with the TokenClassificationPreprocessor in the next edition. - """ - - def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data - - Args: - model_dir (str): model path - """ - - super().__init__(*args, **kwargs) - - self.model_dir: str = model_dir - self.sequence_length = kwargs.pop('sequence_length', 512) - self.is_transformer_based_model = 'lstm' not in model_dir - if self.is_transformer_based_model: - self.tokenizer = AutoTokenizer.from_pretrained( - model_dir, use_fast=True) - else: - self.tokenizer = BertTokenizerFast.from_pretrained( - model_dir, use_fast=True) - self.is_split_into_words = self.tokenizer.init_kwargs.get( - 'is_split_into_words', False) - - @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: - """process the raw input data - - Args: - data (str): a sentence - Example: - 'you are so handsome.' 
- - Returns: - Dict[str, Any]: the preprocessed data - """ - - # preprocess the data for the model input - text = data - if self.is_split_into_words: - input_ids = [] - label_mask = [] - offset_mapping = [] - for offset, token in enumerate(list(data)): - subtoken_ids = self.tokenizer.encode( - token, add_special_tokens=False) - if len(subtoken_ids) == 0: - subtoken_ids = [self.tokenizer.unk_token_id] - input_ids.extend(subtoken_ids) - label_mask.extend([1] + [0] * (len(subtoken_ids) - 1)) - offset_mapping.extend([(offset, offset + 1)] - + [(offset + 1, offset + 1)] - * (len(subtoken_ids) - 1)) - if len(input_ids) >= self.sequence_length - 2: - input_ids = input_ids[:self.sequence_length - 2] - label_mask = label_mask[:self.sequence_length - 2] - offset_mapping = offset_mapping[:self.sequence_length - 2] - input_ids = [self.tokenizer.cls_token_id - ] + input_ids + [self.tokenizer.sep_token_id] - label_mask = [0] + label_mask + [0] - attention_mask = [1] * len(input_ids) - else: - encodings = self.tokenizer( - text, - add_special_tokens=True, - padding=True, - truncation=True, - max_length=self.sequence_length, - return_offsets_mapping=True) - input_ids = encodings['input_ids'] - attention_mask = encodings['attention_mask'] - word_ids = encodings.word_ids() - label_mask = [] - offset_mapping = [] - for i in range(len(word_ids)): - if word_ids[i] is None: - label_mask.append(0) - elif word_ids[i] == word_ids[i - 1]: - label_mask.append(0) - offset_mapping[-1] = (offset_mapping[-1][0], - encodings['offset_mapping'][i][1]) + def get_label_mask_and_offset_mapping(self, text): + label_mask = [] + offset_mapping = [] + tokens = self.tokenizer.tokenize(text) + offset = 0 + if self.get_tokenizer_class() == 'BertTokenizer': + for token in tokens: + is_start = (token[:2] != '##') + if is_start: + label_mask.append(True) else: - label_mask.append(1) - offset_mapping.append(encodings['offset_mapping'][i]) - - if not self.is_transformer_based_model: - input_ids = input_ids[1:-1] - attention_mask = attention_mask[1:-1] - label_mask = label_mask[1:-1] - return { - 'text': text, - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'label_mask': label_mask, - 'offset_mapping': offset_mapping - } - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sequence_labeling_tokenizer) -class SequenceLabelingPreprocessor(Preprocessor): - """The tokenizer preprocessor used in normal NER task. - - NOTE: This preprocessor may be merged with the TokenClassificationPreprocessor in the next edition. - """ - - def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path - - Args: - model_dir (str): model path - """ - - super().__init__(*args, **kwargs) - - self.model_dir: str = model_dir - self.sequence_length = kwargs.pop('sequence_length', 512) - - if 'lstm' in model_dir or 'gcnn' in model_dir: - self.tokenizer = BertTokenizerFast.from_pretrained( - model_dir, use_fast=False) - elif 'structbert' in model_dir: - self.tokenizer = SbertTokenizerFast.from_pretrained( - model_dir, use_fast=False) - else: - self.tokenizer = AutoTokenizer.from_pretrained( - model_dir, use_fast=False) - self.is_split_into_words = self.tokenizer.init_kwargs.get( - 'is_split_into_words', False) - - @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: - """process the raw input data - - Args: - data (str): a sentence - Example: - 'you are so handsome.' 
- - Returns: - Dict[str, Any]: the preprocessed data - """ - - # preprocess the data for the model input - text = data - if self.is_split_into_words: - input_ids = [] - label_mask = [] - offset_mapping = [] - for offset, token in enumerate(list(data)): - subtoken_ids = self.tokenizer.encode( - token, add_special_tokens=False) - if len(subtoken_ids) == 0: - subtoken_ids = [self.tokenizer.unk_token_id] - input_ids.extend(subtoken_ids) - label_mask.extend([1] + [0] * (len(subtoken_ids) - 1)) - offset_mapping.extend([(offset, offset + 1)] - + [(offset + 1, offset + 1)] - * (len(subtoken_ids) - 1)) - if len(input_ids) >= self.sequence_length - 2: - input_ids = input_ids[:self.sequence_length - 2] - label_mask = label_mask[:self.sequence_length - 2] - offset_mapping = offset_mapping[:self.sequence_length - 2] - input_ids = [self.tokenizer.cls_token_id - ] + input_ids + [self.tokenizer.sep_token_id] - label_mask = [0] + label_mask + [0] - attention_mask = [1] * len(input_ids) - else: - encodings = self.tokenizer( - text, - add_special_tokens=True, - padding=True, - truncation=True, - max_length=self.sequence_length, - return_offsets_mapping=True) - input_ids = encodings['input_ids'] - attention_mask = encodings['attention_mask'] - word_ids = encodings.word_ids() - label_mask = [] - offset_mapping = [] - for i in range(len(word_ids)): - if word_ids[i] is None: - label_mask.append(0) - elif word_ids[i] == word_ids[i - 1]: - label_mask.append(0) - offset_mapping[-1] = (offset_mapping[-1][0], - encodings['offset_mapping'][i][1]) + token = token[2:] + label_mask.append(False) + start = offset + text[offset:].index(token) + end = start + len(token) + if is_start: + offset_mapping.append((start, end)) else: - label_mask.append(1) - offset_mapping.append(encodings['offset_mapping'][i]) + offset_mapping[-1] = (offset_mapping[-1][0], end) + offset = end + elif self.get_tokenizer_class() == 'XLMRobertaTokenizer': + last_is_blank = False + for token in tokens: + is_start = (token[0] == '▁') + if is_start: + token = token[1:] + label_mask.append(True) + if len(token) == 0: + last_is_blank = True + continue + else: + label_mask.append(False) + start = offset + text[offset:].index(token) + end = start + len(token) + if last_is_blank or is_start: + offset_mapping.append((start, end)) + else: + offset_mapping[-1] = (offset_mapping[-1][0], end) + offset = end + last_is_blank = False + else: + raise NotImplementedError - if not self.is_transformer_based_model: - input_ids = input_ids[1:-1] - attention_mask = attention_mask[1:-1] - label_mask = label_mask[1:-1] - return { - 'text': text, - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'label_mask': label_mask, - 'offset_mapping': offset_mapping - } + return label_mask, offset_mapping @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.re_tokenizer) class RelationExtractionPreprocessor(Preprocessor): - """The tokenizer preprocessor used in normal RE task. - - NOTE: This preprocessor may be merged with the TokenClassificationPreprocessor in the next edition. + """The relation extraction preprocessor used in normal RE task. 
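[Editor's sketch] A worked, dependency-free illustration of the BertTokenizer branch of get_label_mask_and_offset_mapping above; the WordPiece output is hand-written rather than produced by a real tokenizer, and subword pieces ('##') are folded back into the span of their leading token:

text = 'you are so handsome.'
tokens = ['you', 'are', 'so', 'hand', '##some', '.']  # assumed tokenizer output
label_mask, offset_mapping, offset = [], [], 0
for token in tokens:
    is_start = (token[:2] != '##')
    if is_start:
        label_mask.append(True)
    else:
        token = token[2:]
        label_mask.append(False)
    start = offset + text[offset:].index(token)
    end = start + len(token)
    if is_start:
        offset_mapping.append((start, end))
    else:
        # extend the previous span to cover this subword piece
        offset_mapping[-1] = (offset_mapping[-1][0], end)
    offset = end
print(label_mask)      # -> [True, True, True, True, False, True]
print(offset_mapping)  # -> [(0, 3), (4, 7), (8, 10), (11, 19), (19, 20)]
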
""" def __init__(self, model_dir: str, *args, **kwargs): @@ -937,7 +792,7 @@ class FaqQuestionAnsweringPreprocessor(Preprocessor): def __init__(self, model_dir: str, *args, **kwargs): super(FaqQuestionAnsweringPreprocessor, self).__init__( - model_dir, pair=False, mode=ModeKeys.INFERENCE, **kwargs) + model_dir, mode=ModeKeys.INFERENCE, **kwargs) import os from transformers import BertTokenizer @@ -1026,7 +881,7 @@ class DocumentSegmentationPreprocessor(Preprocessor): """ super().__init__(*args, **kwargs) - + from transformers import BertTokenizerFast self.tokenizer = BertTokenizerFast.from_pretrained( model_dir, use_fast=True, diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 75add1d9..b19c0fce 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -115,6 +115,7 @@ class NLPTasks(object): conversational_text_to_sql = 'conversational-text-to-sql' information_extraction = 'information-extraction' document_segmentation = 'document-segmentation' + feature_extraction = 'feature-extraction' class AudioTasks(object): diff --git a/modelscope/utils/registry.py b/modelscope/utils/registry.py index 3cf88114..7a9c79e2 100644 --- a/modelscope/utils/registry.py +++ b/modelscope/utils/registry.py @@ -74,7 +74,6 @@ class Registry(object): raise KeyError(f'{module_name} is already registered in ' f'{self._name}[{group_key}]') self._modules[group_key][module_name] = module_cls - module_cls.group_key = group_key def register_module(self, group_key: str = default_group, @@ -196,6 +195,7 @@ def build_from_cfg(cfg, if obj_cls is None: raise KeyError(f'{obj_type} is not in the {registry.name}' f' registry group {group_key}') + obj_cls.group_key = group_key elif inspect.isclass(obj_type) or inspect.isfunction(obj_type): obj_cls = obj_type else: diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py index 762530f4..91a3b5c5 100644 --- a/tests/msdatasets/test_ms_dataset.py +++ b/tests/msdatasets/test_ms_dataset.py @@ -75,7 +75,8 @@ class MsDatasetTest(unittest.TestCase): preprocessor = SequenceClassificationPreprocessor( nlp_model.model_dir, first_sequence='premise', - second_sequence=None) + second_sequence=None, + padding='max_length') ms_ds_train = MsDataset.load( 'xcopa', subset_name='translation-et', diff --git a/tests/pipelines/test_deberta_tasks.py b/tests/pipelines/test_deberta_tasks.py index 4f3206cd..549d2cb3 100644 --- a/tests/pipelines/test_deberta_tasks.py +++ b/tests/pipelines/test_deberta_tasks.py @@ -6,11 +6,9 @@ import torch from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.models.nlp import DebertaV2ForMaskedLM -from modelscope.models.nlp.deberta_v2 import (DebertaV2Tokenizer, - DebertaV2TokenizerFast) from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import FillMaskPipeline -from modelscope.preprocessors import FillMaskPreprocessor +from modelscope.preprocessors import NLPPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level @@ -24,7 +22,7 @@ class DeBERTaV2TaskTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): model_dir = snapshot_download(self.model_id_deberta) - preprocessor = FillMaskPreprocessor( + preprocessor = NLPPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) model = DebertaV2ForMaskedLM.from_pretrained(model_dir) pipeline1 = 
FillMaskPipeline(model, preprocessor) @@ -40,7 +38,7 @@ class DeBERTaV2TaskTest(unittest.TestCase): # sbert print(self.model_id_deberta) model = Model.from_pretrained(self.model_id_deberta) - preprocessor = FillMaskPreprocessor( + preprocessor = NLPPreprocessor( model.model_dir, first_sequence='sentence', second_sequence=None) pipeline_ins = pipeline( task=Tasks.fill_mask, model=model, preprocessor=preprocessor) diff --git a/tests/pipelines/test_feature_extraction.py b/tests/pipelines/test_feature_extraction.py new file mode 100644 index 00000000..39291e76 --- /dev/null +++ b/tests/pipelines/test_feature_extraction.py @@ -0,0 +1,67 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +import numpy as np + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.nlp import FeatureExtractionModel +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.nlp import FeatureExtractionPipeline +from modelscope.preprocessors import NLPPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class FeatureExtractionTaskModelTest(unittest.TestCase, + DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.feature_extraction + self.model_id = 'damo/pert_feature-extraction_base-test' + + sentence1 = '测试embedding' + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_direct_file_download(self): + cache_path = snapshot_download(self.model_id) + tokenizer = NLPPreprocessor(cache_path, padding=False) + model = FeatureExtractionModel.from_pretrained(self.model_id) + pipeline1 = FeatureExtractionPipeline(model, preprocessor=tokenizer) + pipeline2 = pipeline( + Tasks.feature_extraction, model=model, preprocessor=tokenizer) + result = pipeline1(input=self.sentence1) + + print(f'sentence1: {self.sentence1}\n' + f'pipeline1:{np.shape(result[OutputKeys.TEXT_EMBEDDING])}') + result = pipeline2(input=self.sentence1) + print(f'sentence1: {self.sentence1}\n' + f'pipeline1: {np.shape(result[OutputKeys.TEXT_EMBEDDING])}') + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + tokenizer = NLPPreprocessor(model.model_dir, padding=False) + pipeline_ins = pipeline( + task=Tasks.feature_extraction, model=model, preprocessor=tokenizer) + result = pipeline_ins(input=self.sentence1) + print(np.shape(result[OutputKeys.TEXT_EMBEDDING])) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.feature_extraction, model=self.model_id) + result = pipeline_ins(input=self.sentence1) + print(np.shape(result[OutputKeys.TEXT_EMBEDDING])) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_default_model(self): + pipeline_ins = pipeline(task=Tasks.feature_extraction) + result = pipeline_ins(input=self.sentence1) + print(np.shape(result[OutputKeys.TEXT_EMBEDDING])) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py index cec8966f..0e5e242b 100644 --- a/tests/pipelines/test_fill_mask.py +++ b/tests/pipelines/test_fill_mask.py @@ -1,13 +1,15 @@ # Copyright (c) Alibaba, Inc. 
and its affiliates. import unittest +from regex import R + from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.models.nlp import (BertForMaskedLM, StructBertForMaskedLM, VecoForMaskedLM) from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import FillMaskPipeline -from modelscope.preprocessors import FillMaskPreprocessor +from modelscope.preprocessors import NLPPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.regress_test_utils import MsRegressTool @@ -51,7 +53,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): # sbert for language in ['zh']: model_dir = snapshot_download(self.model_id_sbert[language]) - preprocessor = FillMaskPreprocessor( + preprocessor = NLPPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) model = StructBertForMaskedLM.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) @@ -66,7 +68,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): # veco model_dir = snapshot_download(self.model_id_veco) - preprocessor = FillMaskPreprocessor( + preprocessor = NLPPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) model = VecoForMaskedLM.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) @@ -80,13 +82,28 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n' ) + # bert + language = 'zh' + model_dir = snapshot_download(self.model_id_bert, revision='beta') + preprocessor = NLPPreprocessor( + model_dir, first_sequence='sentence', second_sequence=None) + model = Model.from_pretrained(model_dir) + pipeline1 = FillMaskPipeline(model, preprocessor) + pipeline2 = pipeline( + Tasks.fill_mask, model=model, preprocessor=preprocessor) + ori_text = self.ori_texts[language] + test_input = self.test_inputs[language] + print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline1: ' + f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): + # sbert for language in ['zh']: print(self.model_id_sbert[language]) model = Model.from_pretrained(self.model_id_sbert[language]) - preprocessor = FillMaskPreprocessor( + preprocessor = NLPPreprocessor( model.model_dir, first_sequence='sentence', second_sequence=None) @@ -100,7 +117,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): # veco model = Model.from_pretrained(self.model_id_veco) - preprocessor = FillMaskPreprocessor( + preprocessor = NLPPreprocessor( model.model_dir, first_sequence='sentence', second_sequence=None) pipeline_ins = pipeline( Tasks.fill_mask, model=model, preprocessor=preprocessor) @@ -113,6 +130,18 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: ' f'{pipeline_ins(test_input)}\n') + # bert + language = 'zh' + model = Model.from_pretrained(self.model_id_bert, revision='beta') + preprocessor = NLPPreprocessor( + model.model_dir, first_sequence='sentence', second_sequence=None) + pipeline_ins = pipeline( + Tasks.fill_mask, model=model, preprocessor=preprocessor) + pipeline_ins.model, f'fill_mask_bert_{language}' + print( + f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: ' + 
f'{pipeline_ins(self.test_inputs[language])}\n') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): # veco @@ -131,6 +160,16 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: ' f'{pipeline_ins(self.test_inputs[language])}\n') + # Bert + language = 'zh' + pipeline_ins = pipeline( + task=Tasks.fill_mask, + model=self.model_id_bert, + model_revision='beta') + print( + f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: ' + f'{pipeline_ins(self.test_inputs[language])}\n') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): pipeline_ins = pipeline(task=Tasks.fill_mask) diff --git a/tests/pipelines/test_named_entity_recognition.py b/tests/pipelines/test_named_entity_recognition.py index 9fae2d09..3658cf3f 100644 --- a/tests/pipelines/test_named_entity_recognition.py +++ b/tests/pipelines/test_named_entity_recognition.py @@ -7,7 +7,7 @@ from modelscope.models.nlp import (LSTMCRFForNamedEntityRecognition, TransformerCRFForNamedEntityRecognition) from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import NamedEntityRecognitionPipeline -from modelscope.preprocessors import NERPreprocessor +from modelscope.preprocessors import TokenClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level @@ -26,7 +26,7 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_tcrf_by_direct_model_download(self): cache_path = snapshot_download(self.tcrf_model_id) - tokenizer = NERPreprocessor(cache_path) + tokenizer = TokenClassificationPreprocessor(cache_path) model = TransformerCRFForNamedEntityRecognition( cache_path, tokenizer=tokenizer) pipeline1 = NamedEntityRecognitionPipeline( @@ -43,7 +43,7 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_lcrf_by_direct_model_download(self): cache_path = snapshot_download(self.lcrf_model_id) - tokenizer = NERPreprocessor(cache_path) + tokenizer = TokenClassificationPreprocessor(cache_path) model = LSTMCRFForNamedEntityRecognition( cache_path, tokenizer=tokenizer) pipeline1 = NamedEntityRecognitionPipeline( @@ -60,7 +60,7 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_tcrf_with_model_from_modelhub(self): model = Model.from_pretrained(self.tcrf_model_id) - tokenizer = NERPreprocessor(model.model_dir) + tokenizer = TokenClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.named_entity_recognition, model=model, @@ -70,7 +70,7 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_lcrf_with_model_from_modelhub(self): model = Model.from_pretrained(self.lcrf_model_id) - tokenizer = NERPreprocessor(model.model_dir) + tokenizer = TokenClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.named_entity_recognition, model=model, diff --git a/tests/pipelines/test_nli.py 
b/tests/pipelines/test_nli.py index a53ac3b3..db4b9912 100644 --- a/tests/pipelines/test_nli.py +++ b/tests/pipelines/test_nli.py @@ -5,8 +5,8 @@ from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import PairSentenceClassificationPipeline -from modelscope.preprocessors import PairSentenceClassificationPreprocessor +from modelscope.pipelines.nlp import SequenceClassificationPipeline +from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.regress_test_utils import MsRegressTool @@ -26,9 +26,9 @@ class NLITest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_file_download(self): cache_path = snapshot_download(self.model_id) - tokenizer = PairSentenceClassificationPreprocessor(cache_path) + tokenizer = SequenceClassificationPreprocessor(cache_path) model = SbertForSequenceClassification.from_pretrained(cache_path) - pipeline1 = PairSentenceClassificationPipeline( + pipeline1 = SequenceClassificationPipeline( model, preprocessor=tokenizer) pipeline2 = pipeline(Tasks.nli, model=model, preprocessor=tokenizer) print(f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n' @@ -40,7 +40,7 @@ class NLITest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = PairSentenceClassificationPreprocessor(model.model_dir) + tokenizer = SequenceClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.nli, model=model, preprocessor=tokenizer) print(pipeline_ins(input=(self.sentence1, self.sentence2))) diff --git a/tests/pipelines/test_sentence_similarity.py b/tests/pipelines/test_sentence_similarity.py index 4079455d..288d38c7 100644 --- a/tests/pipelines/test_sentence_similarity.py +++ b/tests/pipelines/test_sentence_similarity.py @@ -5,8 +5,8 @@ from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import PairSentenceClassificationPipeline -from modelscope.preprocessors import PairSentenceClassificationPreprocessor +from modelscope.pipelines.nlp import SequenceClassificationPipeline +from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.regress_test_utils import MsRegressTool @@ -26,9 +26,9 @@ class SentenceSimilarityTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run(self): cache_path = snapshot_download(self.model_id) - tokenizer = PairSentenceClassificationPreprocessor(cache_path) + tokenizer = SequenceClassificationPreprocessor(cache_path) model = SbertForSequenceClassification.from_pretrained(cache_path) - pipeline1 = PairSentenceClassificationPipeline( + pipeline1 = SequenceClassificationPipeline( model, preprocessor=tokenizer) pipeline2 = pipeline( Tasks.sentence_similarity, 
model=model, preprocessor=tokenizer) @@ -43,7 +43,7 @@ class SentenceSimilarityTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = PairSentenceClassificationPreprocessor(model.model_dir) + tokenizer = SequenceClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.sentence_similarity, model=model, diff --git a/tests/pipelines/test_sentiment_classification.py b/tests/pipelines/test_sentiment_classification.py index 3db9971a..d0b1b40f 100644 --- a/tests/pipelines/test_sentiment_classification.py +++ b/tests/pipelines/test_sentiment_classification.py @@ -6,8 +6,8 @@ from modelscope.models import Model from modelscope.models.nlp.task_models.sequence_classification import \ SequenceClassificationModel from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import SingleSentenceClassificationPipeline -from modelscope.preprocessors import SingleSentenceClassificationPreprocessor +from modelscope.pipelines.nlp import SequenceClassificationPipeline +from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level @@ -17,23 +17,21 @@ class SentimentClassificationTaskModelTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: - self.task = Tasks.sentiment_classification + self.task = Tasks.text_classification self.model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base' sentence1 = '启动的时候很大声音,然后就会听到1.2秒的卡察的声音,类似齿轮摩擦的声音' @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_file_download(self): - cache_path = snapshot_download(self.model_id) - tokenizer = SingleSentenceClassificationPreprocessor(cache_path) + cache_path = snapshot_download(self.model_id, revision='beta') + tokenizer = SequenceClassificationPreprocessor(cache_path) model = SequenceClassificationModel.from_pretrained( - self.model_id, num_labels=2) - pipeline1 = SingleSentenceClassificationPipeline( + self.model_id, num_labels=2, revision='beta') + pipeline1 = SequenceClassificationPipeline( model, preprocessor=tokenizer) pipeline2 = pipeline( - Tasks.sentiment_classification, - model=model, - preprocessor=tokenizer) + Tasks.text_classification, model=model, preprocessor=tokenizer) print(f'sentence1: {self.sentence1}\n' f'pipeline1:{pipeline1(input=self.sentence1)}') print(f'sentence1: {self.sentence1}\n' @@ -41,10 +39,10 @@ class SentimentClassificationTaskModelTest(unittest.TestCase, @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): - model = Model.from_pretrained(self.model_id) - tokenizer = SingleSentenceClassificationPreprocessor(model.model_dir) + model = Model.from_pretrained(self.model_id, revision='beta') + tokenizer = SequenceClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( - task=Tasks.sentiment_classification, + task=Tasks.text_classification, model=model, preprocessor=tokenizer) print(pipeline_ins(input=self.sentence1)) @@ -54,14 +52,17 @@ class SentimentClassificationTaskModelTest(unittest.TestCase, @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): pipeline_ins = pipeline( - task=Tasks.sentiment_classification, 
model=self.model_id) + task=Tasks.text_classification, + model=self.model_id, + model_revision='beta') print(pipeline_ins(input=self.sentence1)) self.assertTrue( isinstance(pipeline_ins.model, SequenceClassificationModel)) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_default_model(self): - pipeline_ins = pipeline(task=Tasks.sentiment_classification) + pipeline_ins = pipeline( + task=Tasks.text_classification, model_revision='beta') print(pipeline_ins(input=self.sentence1)) self.assertTrue( isinstance(pipeline_ins.model, SequenceClassificationModel)) diff --git a/tests/pipelines/test_text_classification.py b/tests/pipelines/test_text_classification.py index 71b9f3e2..39dbac99 100644 --- a/tests/pipelines/test_text_classification.py +++ b/tests/pipelines/test_text_classification.py @@ -12,6 +12,7 @@ from modelscope.utils.test_utils import test_level class SequenceClassificationTest(unittest.TestCase, DemoCompatibilityCheck): + sentence1 = 'i like this wonderful place' def setUp(self) -> None: self.model_id = 'damo/bert-base-sst2' @@ -46,7 +47,8 @@ class SequenceClassificationTest(unittest.TestCase, DemoCompatibilityCheck): task=Tasks.text_classification, model=model, preprocessor=preprocessor) - self.predict(pipeline_ins) + print(f'sentence1: {self.sentence1}\n' + f'pipeline1:{pipeline_ins(input=self.sentence1)}') # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') @unittest.skip('nlp model does not support tensor input, skipped') diff --git a/tests/preprocessors/test_nlp.py b/tests/preprocessors/test_nlp.py index 4271e201..f9f4d93f 100644 --- a/tests/preprocessors/test_nlp.py +++ b/tests/preprocessors/test_nlp.py @@ -32,6 +32,82 @@ class NLPPreprocessorTest(unittest.TestCase): output['attention_mask'], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) + def test_token_classification_tokenize(self): + with self.subTest(tokenizer_type='bert'): + cfg = dict( + type='token-cls-tokenizer', + model_dir='bert-base-cased', + label2id={ + 'O': 0, + 'B': 1, + 'I': 2 + }) + preprocessor = build_preprocessor(cfg, Fields.nlp) + input = 'Do not meddle in the affairs of wizards, ' \ + 'for they are subtle and quick to anger.' + output = preprocessor(input) + self.assertTrue(InputFields.text in output) + self.assertEqual(output['input_ids'].tolist()[0], [ + 101, 2091, 1136, 1143, 13002, 1107, 1103, 5707, 1104, 16678, + 1116, 117, 1111, 1152, 1132, 11515, 1105, 3613, 1106, 4470, + 119, 102 + ]) + self.assertEqual(output['attention_mask'].tolist()[0], [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1 + ]) + self.assertEqual(output['label_mask'].tolist()[0], [ + False, True, True, True, False, True, True, True, True, True, + False, True, True, True, True, True, True, True, True, True, + True, False + ]) + self.assertEqual(output['offset_mapping'], [(0, 2), (3, 6), + (7, 13), (14, 16), + (17, 20), (21, 28), + (29, 31), (32, 39), + (39, 40), (41, 44), + (45, 49), (50, 53), + (54, 60), (61, 64), + (65, 70), (71, 73), + (74, 79), (79, 80)]) + + with self.subTest(tokenizer_type='roberta'): + cfg = dict( + type='token-cls-tokenizer', + model_dir='xlm-roberta-base', + label2id={ + 'O': 0, + 'B': 1, + 'I': 2 + }) + preprocessor = build_preprocessor(cfg, Fields.nlp) + input = 'Do not meddle in the affairs of wizards, ' \ + 'for they are subtle and quick to anger.' 
+ output = preprocessor(input) + self.assertTrue(InputFields.text in output) + self.assertEqual(output['input_ids'].tolist()[0], [ + 0, 984, 959, 128, 19298, 23, 70, 103086, 7, 111, 6, 44239, + 99397, 4, 100, 1836, 621, 1614, 17991, 136, 63773, 47, 348, 56, + 5, 2 + ]) + self.assertEqual(output['attention_mask'].tolist()[0], [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1 + ]) + self.assertEqual(output['label_mask'].tolist()[0], [ + False, True, True, True, False, True, True, True, False, True, + True, False, False, False, True, True, True, True, False, True, + True, True, True, False, False, False + ]) + self.assertEqual(output['offset_mapping'], [(0, 2), (3, 6), + (7, 13), (14, 16), + (17, 20), (21, 28), + (29, 31), (32, 40), + (41, 44), (45, 49), + (50, 53), (54, 60), + (61, 64), (65, 70), + (71, 73), (74, 80)]) + if __name__ == '__main__': unittest.main() diff --git a/tests/utils/test_ast.py b/tests/utils/test_ast.py index de99a7b8..9a8ab828 100644 --- a/tests/utils/test_ast.py +++ b/tests/utils/test_ast.py @@ -30,7 +30,7 @@ class AstScaningTest(unittest.TestCase): def test_ast_scaning_class(self): astScaner = AstScaning() pipeline_file = os.path.join(MODELSCOPE_PATH, 'pipelines', 'nlp', - 'sequence_classification_pipeline.py') + 'text_generation_pipeline.py') output = astScaner.generate_ast(pipeline_file) self.assertTrue(output['imports'] is not None) self.assertTrue(output['from_imports'] is not None) @@ -40,14 +40,12 @@ class AstScaningTest(unittest.TestCase): self.assertIsInstance(imports, dict) self.assertIsInstance(from_imports, dict) self.assertIsInstance(decorators, list) - self.assertListEqual( - list(set(imports.keys()) - set(['typing', 'numpy'])), []) - self.assertEqual(len(from_imports.keys()), 9) + self.assertListEqual(list(set(imports.keys()) - set(['torch'])), []) + self.assertEqual(len(from_imports.keys()), 7) self.assertTrue(from_imports['modelscope.metainfo'] is not None) self.assertEqual(from_imports['modelscope.metainfo'], ['Pipelines']) - self.assertEqual( - decorators, - [('PIPELINES', 'text-classification', 'sentiment-analysis')]) + self.assertEqual(decorators, + [('PIPELINES', 'text-generation', 'text-generation')]) def test_files_scaning_method(self): fileScaner = FilesAstScaning() From 91231b3c157ac875f67e2bbd420a8810da0c0e36 Mon Sep 17 00:00:00 2001 From: ly261666 Date: Tue, 27 Sep 2022 23:09:13 +0800 Subject: [PATCH 18/23] [to #42322933]add copyright on mogface,retinaface,mtcnn,ulfd pipeline Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10266086 --- modelscope/pipelines/cv/mog_face_detection_pipeline.py | 1 + modelscope/pipelines/cv/mtcnn_face_detection_pipeline.py | 1 + modelscope/pipelines/cv/retina_face_detection_pipeline.py | 1 + modelscope/pipelines/cv/ulfd_face_detection_pipeline.py | 1 + 4 files changed, 4 insertions(+) diff --git a/modelscope/pipelines/cv/mog_face_detection_pipeline.py b/modelscope/pipelines/cv/mog_face_detection_pipeline.py index 8797ad12..124b605b 100644 --- a/modelscope/pipelines/cv/mog_face_detection_pipeline.py +++ b/modelscope/pipelines/cv/mog_face_detection_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/cv/mtcnn_face_detection_pipeline.py b/modelscope/pipelines/cv/mtcnn_face_detection_pipeline.py index 57bf9920..bda46a70 100644 --- a/modelscope/pipelines/cv/mtcnn_face_detection_pipeline.py +++ b/modelscope/pipelines/cv/mtcnn_face_detection_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/cv/retina_face_detection_pipeline.py b/modelscope/pipelines/cv/retina_face_detection_pipeline.py index b8c64405..40f2336a 100644 --- a/modelscope/pipelines/cv/retina_face_detection_pipeline.py +++ b/modelscope/pipelines/cv/retina_face_detection_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/cv/ulfd_face_detection_pipeline.py b/modelscope/pipelines/cv/ulfd_face_detection_pipeline.py index 1263082b..e9901d64 100644 --- a/modelscope/pipelines/cv/ulfd_face_detection_pipeline.py +++ b/modelscope/pipelines/cv/ulfd_face_detection_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os.path as osp from typing import Any, Dict From 3d41d6d6208edfcdb7cf7c00c571e0579405cde7 Mon Sep 17 00:00:00 2001 From: "tianchu.gtc" Date: Tue, 27 Sep 2022 23:22:46 +0800 Subject: [PATCH 19/23] [to #42322933] fix seg4demo Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10189886 --- .../image_panoptic_segmentation/panseg_model.py | 3 +-- .../pan_merge/__init__.py | 1 + .../pan_merge/maskformer_semantic_head.py | 1 + .../semantic_seg_model.py | 1 + .../vit_adapter/__init__.py | 2 ++ .../vit_adapter/models/__init__.py | 2 ++ .../vit_adapter/models/backbone/__init__.py | 2 ++ .../models/backbone/adapter_modules.py | 17 ++++++++--------- .../models/backbone/base/__init__.py | 2 ++ .../vit_adapter/models/backbone/base/beit.py | 6 ++---- .../vit_adapter/models/backbone/beit_adapter.py | 13 ++++++------- .../vit_adapter/models/decode_heads/__init__.py | 2 ++ .../models/decode_heads/base_decode_head.py | 5 ++--- .../decode_heads/mask2former_head_from_mmseg.py | 5 ++--- .../vit_adapter/models/segmentors/__init__.py | 2 ++ .../models/segmentors/base_segmentor.py | 5 ++--- .../segmentors/encoder_decoder_mask2former.py | 5 ++--- .../vit_adapter/utils/__init__.py | 2 ++ .../vit_adapter/utils/builder.py | 5 ++--- .../vit_adapter/utils/seg_func.py | 5 ++--- .../cv/image_panoptic_segmentation_pipeline.py | 16 +++++++--------- .../cv/image_semantic_segmentation_pipeline.py | 17 ++++++----------- 22 files changed, 59 insertions(+), 60 deletions(-) diff --git a/modelscope/models/cv/image_panoptic_segmentation/panseg_model.py b/modelscope/models/cv/image_panoptic_segmentation/panseg_model.py index f9022f90..f44c01e8 100644 --- a/modelscope/models/cv/image_panoptic_segmentation/panseg_model.py +++ b/modelscope/models/cv/image_panoptic_segmentation/panseg_model.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
import os.path as osp import torch @@ -49,6 +50,4 @@ class SwinLPanopticSegmentation(TorchModel): return results def forward(self, Inputs): - import pdb - pdb.set_trace() return self.model(**Inputs) diff --git a/modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py b/modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py index 2a75f318..6a31a308 100644 --- a/modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py +++ b/modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py @@ -1 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from .maskformer_semantic_head import MaskFormerSemanticHead diff --git a/modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py b/modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py index 6769ebaf..2f3364d0 100644 --- a/modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py +++ b/modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import torch import torch.nn.functional as F from mmdet.models.builder import HEADS diff --git a/modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py b/modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py index 60acf28f..2b38ebad 100644 --- a/modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py +++ b/modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os.path as osp import numpy as np diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py index 82eec1c6..3b9a301c 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py @@ -1,3 +1,5 @@ +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git from .models import backbone, decode_heads, segmentors from .utils import (ResizeToMultiple, add_prefix, build_pixel_sampler, seg_resize) diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py index ae5c5acf..791dd26f 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py @@ -1,3 +1,5 @@ +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git from .backbone import BASEBEiT, BEiTAdapter from .decode_heads import Mask2FormerHeadFromMMSeg from .segmentors import EncoderDecoderMask2Former diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py index ab4258c1..7abd0ef1 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py @@ -1,3 +1,5 @@ +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at 
https://github.com/czczup/ViT-Adapter.git from .base import BASEBEiT from .beit_adapter import BEiTAdapter diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py index 03080342..cf30cca0 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py @@ -1,6 +1,5 @@ -# The implementation refers to the VitAdapter -# available at -# https://github.com/czczup/ViT-Adapter.git +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git import logging from functools import partial @@ -417,7 +416,7 @@ class SpatialPriorModule(nn.Module): self.stem = nn.Sequential(*[ nn.Conv2d( 3, inplanes, kernel_size=3, stride=2, padding=1, bias=False), - nn.SyncBatchNorm(inplanes), + nn.BatchNorm2d(inplanes), nn.ReLU(inplace=True), nn.Conv2d( inplanes, @@ -426,7 +425,7 @@ class SpatialPriorModule(nn.Module): stride=1, padding=1, bias=False), - nn.SyncBatchNorm(inplanes), + nn.BatchNorm2d(inplanes), nn.ReLU(inplace=True), nn.Conv2d( inplanes, @@ -435,7 +434,7 @@ class SpatialPriorModule(nn.Module): stride=1, padding=1, bias=False), - nn.SyncBatchNorm(inplanes), + nn.BatchNorm2d(inplanes), nn.ReLU(inplace=True), nn.MaxPool2d(kernel_size=3, stride=2, padding=1) ]) @@ -447,7 +446,7 @@ class SpatialPriorModule(nn.Module): stride=2, padding=1, bias=False), - nn.SyncBatchNorm(2 * inplanes), + nn.BatchNorm2d(2 * inplanes), nn.ReLU(inplace=True) ]) self.conv3 = nn.Sequential(*[ @@ -458,7 +457,7 @@ class SpatialPriorModule(nn.Module): stride=2, padding=1, bias=False), - nn.SyncBatchNorm(4 * inplanes), + nn.BatchNorm2d(4 * inplanes), nn.ReLU(inplace=True) ]) self.conv4 = nn.Sequential(*[ @@ -469,7 +468,7 @@ class SpatialPriorModule(nn.Module): stride=2, padding=1, bias=False), - nn.SyncBatchNorm(4 * inplanes), + nn.BatchNorm2d(4 * inplanes), nn.ReLU(inplace=True) ]) self.fc1 = nn.Conv2d( diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py index 40b0fa89..5b33031f 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py @@ -1,3 +1,5 @@ +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git from .beit import BASEBEiT __all__ = ['BASEBEiT'] diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py index a5811fb9..62f873ec 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py @@ -1,7 +1,5 @@ -# BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254) -# Github source: https://github.com/microsoft/unilm/tree/master/beit -# This implementation refers to -# https://github.com/czczup/ViT-Adapter.git +# The implementation is adopted from VitAdapter, +# made publicly 
available under the Apache License at https://github.com/czczup/ViT-Adapter.git import math from functools import partial diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py index 02a4968e..182fc0c1 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py @@ -1,6 +1,5 @@ -# The implementation refers to the VitAdapter -# available at -# https://github.com/czczup/ViT-Adapter.git +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git import logging import math @@ -69,10 +68,10 @@ class BEiTAdapter(BASEBEiT): ]) self.up = nn.ConvTranspose2d(embed_dim, embed_dim, 2, 2) - self.norm1 = nn.SyncBatchNorm(embed_dim) - self.norm2 = nn.SyncBatchNorm(embed_dim) - self.norm3 = nn.SyncBatchNorm(embed_dim) - self.norm4 = nn.SyncBatchNorm(embed_dim) + self.norm1 = nn.BatchNorm2d(embed_dim) + self.norm2 = nn.BatchNorm2d(embed_dim) + self.norm3 = nn.BatchNorm2d(embed_dim) + self.norm4 = nn.BatchNorm2d(embed_dim) self.up.apply(self._init_weights) self.spm.apply(self._init_weights) diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py index 9367806f..12bf2a21 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py @@ -1,3 +1,5 @@ +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git from .mask2former_head_from_mmseg import Mask2FormerHeadFromMMSeg __all__ = ['Mask2FormerHeadFromMMSeg'] diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py index 36660520..ae7a0416 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py @@ -1,6 +1,5 @@ -# The implementation refers to the VitAdapter -# available at -# https://github.com/czczup/ViT-Adapter.git +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git from abc import ABCMeta, abstractmethod import torch diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py index ad8b1586..c0681d2b 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py @@ -1,6 +1,5 @@ -# The implementation refers to the VitAdapter -# available at -# https://github.com/czczup/ViT-Adapter.git +# The implementation is adopted from VitAdapter, +# made publicly 
available under the Apache License at https://github.com/czczup/ViT-Adapter.git import copy diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py index 1f2c8b04..18bbce0d 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py @@ -1,3 +1,5 @@ +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git from .encoder_decoder_mask2former import EncoderDecoderMask2Former __all__ = ['EncoderDecoderMask2Former'] diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py index 8bd8fa3f..311352c2 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py @@ -1,6 +1,5 @@ -# The implementation refers to the VitAdapter -# available at -# https://github.com/czczup/ViT-Adapter.git +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git import warnings from abc import ABCMeta, abstractmethod from collections import OrderedDict diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py index 9287e8aa..50492374 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py @@ -1,6 +1,5 @@ -# The implementation refers to the VitAdapter -# available at -# https://github.com/czczup/ViT-Adapter.git +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git import torch import torch.nn as nn import torch.nn.functional as F diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py index dec8a5f2..9c4d5c4c 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py @@ -1,3 +1,5 @@ +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git from .builder import build_pixel_sampler from .data_process_func import ResizeToMultiple from .seg_func import add_prefix, seg_resize diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py index 63d77fea..0603ef94 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py @@ -1,6 +1,5 @@ -# The implementation refers to the VitAdapter -# available at 
-# https://github.com/czczup/ViT-Adapter.git +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git from mmcv.utils import Registry, build_from_cfg PIXEL_SAMPLERS = Registry('pixel sampler') diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py index fba46b81..db564cca 100644 --- a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py @@ -1,6 +1,5 @@ -# The implementation refers to the VitAdapter -# available at -# https://github.com/czczup/ViT-Adapter.git +# The implementation is adopted from VitAdapter, +# made publicly available under the Apache License at https://github.com/czczup/ViT-Adapter.git import warnings diff --git a/modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py b/modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py index 9ffc2b03..b96e709c 100644 --- a/modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py +++ b/modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py @@ -4,11 +4,13 @@ from typing import Any, Dict, Union import cv2 import numpy as np import PIL +import torch from modelscope.metainfo import Pipelines from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Pipeline from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import load_image from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger @@ -39,28 +41,24 @@ class ImagePanopticSegmentationPipeline(Pipeline): # build the data pipeline if isinstance(input, str): - # input is str, file names, pipeline loadimagefromfile - # collect data - data = dict(img_info=dict(filename=input), img_prefix=None) + cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam' + img = np.array(load_image(input)) + img = img[:, :, ::-1] # convert to bgr elif isinstance(input, PIL.Image.Image): cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam' img = np.array(input.convert('RGB')) - # collect data - data = dict(img=img) elif isinstance(input, np.ndarray): cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam' if len(input.shape) == 2: img = cv2.cvtColor(input, cv2.COLOR_GRAY2BGR) else: img = input - img = img[:, :, ::-1] # in rgb order - # collect data - data = dict(img=img) - else: raise TypeError(f'input should be either str, PIL.Image,' f' np.array, but got {type(input)}') + # collect data + data = dict(img=img) cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline) test_pipeline = Compose(cfg.data.test.pipeline) diff --git a/modelscope/pipelines/cv/image_semantic_segmentation_pipeline.py b/modelscope/pipelines/cv/image_semantic_segmentation_pipeline.py index e3e1fd6b..023d9712 100644 --- a/modelscope/pipelines/cv/image_semantic_segmentation_pipeline.py +++ b/modelscope/pipelines/cv/image_semantic_segmentation_pipeline.py @@ -10,6 +10,7 @@ from modelscope.metainfo import Pipelines from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Model, Pipeline from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import load_image from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger @@ -40,28 +41,24 @@ class ImageSemanticSegmentationPipeline(Pipeline): # build the data pipeline 
if isinstance(input, str): - # input is str, file names, pipeline loadimagefromfile - # collect data - data = dict(img_info=dict(filename=input), img_prefix=None) + cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam' + img = np.array(load_image(input)) + img = img[:, :, ::-1] # convert to bgr elif isinstance(input, PIL.Image.Image): # BGR cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam' img = np.array(input)[:, :, ::-1] - # collect data - data = dict(img=img) elif isinstance(input, np.ndarray): cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam' if len(input.shape) == 2: img = cv2.cvtColor(input, cv2.COLOR_GRAY2BGR) else: img = input - # collect data - data = dict(img=img) - else: raise TypeError(f'input should be either str, PIL.Image,' f' np.array, but got {type(input)}') - # data = dict(img=input) + # collect data + data = dict(img=img) cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline) test_pipeline = Compose(cfg.data.test.pipeline) @@ -80,11 +77,9 @@ class ImageSemanticSegmentationPipeline(Pipeline): def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: results = self.model.inference(input) - return results def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - results = self.model.postprocess(inputs) outputs = { OutputKeys.MASKS: results[OutputKeys.MASKS], From a3598f8d8c09ced380c9393d5c5208ef65aa13dd Mon Sep 17 00:00:00 2001 From: "hemu.zp" Date: Tue, 27 Sep 2022 23:24:58 +0800 Subject: [PATCH 20/23] [to #42322933] Fix rouge metrics for chinese text MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修复 TextGenerationMetric 中 Rouge 指标计算中文时结果不正确的问题 为文本生成添加 BLEU 指标 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10254323 --- modelscope/metrics/builder.py | 4 ++ modelscope/metrics/text_generation_metric.py | 62 +++++++++++++++----- 2 files changed, 51 insertions(+), 15 deletions(-) diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py index 800e3508..9e875cc4 100644 --- a/modelscope/metrics/builder.py +++ b/modelscope/metrics/builder.py @@ -18,6 +18,10 @@ class MetricKeys(object): SSIM = 'ssim' AVERAGE_LOSS = 'avg_loss' FScore = 'fscore' + BLEU_1 = 'bleu-1' + BLEU_4 = 'bleu-4' + ROUGE_1 = 'rouge-1' + ROUGE_L = 'rouge-l' task_default_metrics = { diff --git a/modelscope/metrics/text_generation_metric.py b/modelscope/metrics/text_generation_metric.py index f154281d..90b80425 100644 --- a/modelscope/metrics/text_generation_metric.py +++ b/modelscope/metrics/text_generation_metric.py @@ -1,11 +1,14 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-from typing import Dict +from typing import Dict, Iterable, List + +from nltk.translate.bleu_score import sentence_bleu +from rouge import Rouge from modelscope.metainfo import Metrics +from modelscope.metrics.base import Metric +from modelscope.metrics.builder import METRICS, MetricKeys from modelscope.utils.registry import default_group -from .base import Metric -from .builder import METRICS, MetricKeys @METRICS.register_module( @@ -17,20 +20,49 @@ class TextGenerationMetric(Metric): """ def __init__(self): - self.preds = [] - self.tgts = [] - from rouge_score import rouge_scorer - self.scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True) + self.preds: List[str] = [] + self.tgts: List[str] = [] + self.rouge = Rouge() - def add(self, outputs: Dict, inputs: Dict): + @staticmethod + def is_chinese_char(char: str): + # the length of char must be 1 + return '\u4e00' <= char <= '\u9fa5' + + # add space for each chinese char + def rebuild_str(self, string: str): + return ' '.join(''.join([ + f' {char} ' if self.is_chinese_char(char) else char + for char in string + ]).split()) + + def add(self, outputs: Dict[str, List[str]], inputs: Dict = None): ground_truths = outputs['tgts'] eval_results = outputs['preds'] - self.preds.extend(eval_results) - self.tgts.extend(ground_truths) + for truth in ground_truths: + self.tgts.append(self.rebuild_str(truth)) + for result in eval_results: + self.preds.append(self.rebuild_str(result)) def evaluate(self): - scores = [ - self.scorer.score(pred, tgt)['rougeL'].fmeasure - for pred, tgt in zip(self.preds, self.tgts) - ] - return {MetricKeys.F1: sum(scores) / len(scores)} + + def mean(iter: Iterable) -> float: + return sum(iter) / len(self.preds) + + rouge_scores = self.rouge.get_scores(hyps=self.preds, refs=self.tgts) + rouge_1 = mean(map(lambda score: score['rouge-1']['f'], rouge_scores)) + rouge_l = mean(map(lambda score: score['rouge-l']['f'], rouge_scores)) + pred_split = tuple(pred.split(' ') for pred in self.preds) + tgt_split = tuple(tgt.split(' ') for tgt in self.tgts) + bleu_1 = mean( + sentence_bleu([tgt], pred, weights=(1, 0, 0, 0)) + for pred, tgt in zip(pred_split, tgt_split)) + bleu_4 = mean( + sentence_bleu([tgt], pred) + for pred, tgt in zip(pred_split, tgt_split)) + return { + MetricKeys.ROUGE_1: rouge_1, + MetricKeys.ROUGE_L: rouge_l, + MetricKeys.BLEU_1: bleu_1, + MetricKeys.BLEU_4: bleu_4 + } From 11b33164c33cc3fae3a195037a278c3cb87484a6 Mon Sep 17 00:00:00 2001 From: Yingda Chen Date: Wed, 28 Sep 2022 09:26:44 +0800 Subject: [PATCH 21/23] [to #42322933] disable t5 test temporarily --- tests/pipelines/test_text2text_generation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/pipelines/test_text2text_generation.py b/tests/pipelines/test_text2text_generation.py index 04cecf93..a39562f5 100644 --- a/tests/pipelines/test_text2text_generation.py +++ b/tests/pipelines/test_text2text_generation.py @@ -30,7 +30,7 @@ class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): f'pipeline1: {pipeline1(self.input)}\npipeline2: {pipeline2(self.input)}' ) - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_pipeline_with_model_instance(self): model = Model.from_pretrained(self.model_id) preprocessor = Text2TextGenerationPreprocessor(model.model_dir) @@ -40,7 +40,7 @@ class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): preprocessor=preprocessor) 
print(pipeline_ins(self.input)) - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_pipeline_with_model_id(self): pipeline_ins = pipeline( task=Tasks.text2text_generation, model=self.model_id) From c51b74c2ea6f2c736955a34599a745b2cd0d02a3 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Wed, 28 Sep 2022 13:36:09 +0800 Subject: [PATCH 22/23] [to #45220645]fix: fix ffmpeg mp4 encoder bug Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10284398 * [to #45220645]fix: fix ffmpeg mp4 encoder bug --- docker/Dockerfile.ubuntu | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu index e0bfa908..a9a409b5 100644 --- a/docker/Dockerfile.ubuntu +++ b/docker/Dockerfile.ubuntu @@ -34,7 +34,8 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${a cp /tmp/resources/conda.tuna ~/.condarc && \ source /root/.bashrc && \ conda install --yes python==${PYTHON_VERSION} && \ - pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple + pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \ + pip config set install.trusted-host pypi.tuna.tsinghua.edu.cn ARG USE_GPU=True @@ -42,15 +43,15 @@ ARG USE_GPU=True ARG TORCH_VERSION=1.12.0 ARG CUDATOOLKIT_VERSION=11.3 RUN if [ "$USE_GPU" = "True" ] ; then \ - conda install --yes pytorch==$TORCH_VERSION torchvision torchaudio cudatoolkit=$CUDATOOLKIT_VERSION -c pytorch && conda clean --yes --all; \ + pip install --no-cache-dir torch==$TORCH_VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113; \ else \ - conda install pytorch==$TORCH_VERSION torchvision torchaudio cpuonly -c pytorch; \ + pip install --no-cache-dir torch==$TORCH_VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu; \ fi # install tensorflow ARG TENSORFLOW_VERSION=1.15.5 RUN if [ "$USE_GPU" = "True" ] ; then \ - pip install --no-cache-dir --use-deprecated=legacy-resolver tensorflow==$TENSORFLOW_VERSION -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html; \ + pip install --no-cache-dir tensorflow==$TENSORFLOW_VERSION -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html; \ else \ pip install --no-cache-dir tensorflow==$TENSORFLOW_VERSION; \ fi @@ -75,9 +76,7 @@ RUN pip install --no-cache-dir --upgrade pip && \ ENV SHELL=/bin/bash # install special package -RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 datasets==2.1.0 ipykernel && \ - pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \ - pip config set install.trusted-host pypi.tuna.tsinghua.edu.cn +RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 datasets==2.1.0 numpy==1.18.5 ipykernel fairseq RUN if [ "$USE_GPU" = "True" ] ; then \ pip install --no-cache-dir dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html; \ From 0e52a20d2889bca5c0f8165d3013bd46de4afccc Mon Sep 17 00:00:00 2001 From: "chaojie.mcj" Date: Wed, 28 Sep 2022 14:30:37 +0800 Subject: [PATCH 23/23] [to #42322933]update license MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 以下算法进行了header变更: modelscope.models.cv.cmdssl_video_embedding modelscope.models.cv.action_recognition modelscope.models.cv.animal_recognition modelscope.models.multi_modal.multi_stage_diffusion modelscope.models.multi_modal.gemm 
modelscope.pipelines.cv.live_category_pipeline modelscope.pipelines.cv.video_category_pipeline modelscope.models.cv.image_to_image_translation modelscope.models.cv.image_to_image_generation modelscope.models.cv.video_inpainting modelscope.models.multi_modal.diffusion modelscope.models.multi_modal.team modelscope.models.cv.shop_segmentation modelscope.models.cv.text_driven_segmentation modelscope.models.cv.action_recognition modelscope.models.cv.face_emotion modelscope.models.cv.hand_static Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10268474 --- .../models/cv/action_recognition/models.py | 3 +++ modelscope/models/cv/action_recognition/s3dg.py | 3 +++ .../cv/action_recognition/tada_convnext.py | 4 ++++ .../models/cv/animal_recognition/resnet.py | 3 +++ .../models/cv/animal_recognition/splat.py | 3 +++ .../cv/cmdssl_video_embedding/__init__.py | 3 ++- .../models/cv/cmdssl_video_embedding/c3d.py | 8 ++++++++ .../cv/cmdssl_video_embedding/resnet2p1d.py | 8 ++++++++ .../cv/cmdssl_video_embedding/resnet3d.py | 8 ++++++++ .../models/cv/shop_segmentation/common.py | 14 ++++++-------- .../models/cv/shop_segmentation/head_fpn.py | 14 ++++++-------- .../models/cv/shop_segmentation/models.py | 14 ++++++-------- .../models/cv/shop_segmentation/neck_fpn.py | 14 ++++++-------- .../cv/shop_segmentation/shop_seg_base.py | 14 ++++++-------- .../cv/shop_segmentation/shop_seg_model.py | 2 ++ modelscope/models/cv/shop_segmentation/utils.py | 7 +++---- .../cv/text_driven_segmentation/__init__.py | 1 + .../models/cv/text_driven_segmentation/clip.py | 7 +++---- .../cv/text_driven_segmentation/lseg_base.py | 6 ++---- .../cv/text_driven_segmentation/lseg_blocks.py | 6 ++---- .../cv/text_driven_segmentation/lseg_model.py | 2 ++ .../cv/text_driven_segmentation/lseg_net.py | 6 ++---- .../cv/text_driven_segmentation/lseg_vit.py | 6 ++---- .../models/cv/text_driven_segmentation/model.py | 6 ++---- .../simple_tokenizer.py | 7 +++---- .../models/multi_modal/diffusion/diffusion.py | 3 +++ .../models/multi_modal/diffusion/model.py | 1 + .../multi_modal/diffusion/unet_generator.py | 3 +++ .../diffusion/unet_upsampler_1024.py | 3 +++ .../multi_modal/diffusion/unet_upsampler_256.py | 3 +++ modelscope/models/multi_modal/gemm/gemm_base.py | 17 +++++++++++------ .../models/multi_modal/gemm/gemm_model.py | 2 ++ modelscope/models/multi_modal/gemm/tokenizer.py | 12 ++++++++---- modelscope/models/multi_modal/mmr/__init__.py | 2 ++ .../mmr/dataloaders/rawvideo_util.py | 3 +++ .../models/multi_modal/mmr/models/__init__.py | 2 ++ .../mmr/models/clip_for_mm_video_embedding.py | 3 +++ .../mmr/models/dynamic_inverted_softmax.py | 3 +++ .../models/multi_modal/mmr/models/modeling.py | 2 ++ .../multi_modal/mmr/models/module_clip.py | 3 ++- .../multi_modal/mmr/models/module_cross.py | 3 +++ .../multi_modal/mmr/models/tokenization_clip.py | 3 +++ .../multi_modal/multi_stage_diffusion/clip.py | 3 ++- .../multi_stage_diffusion/decoder.py | 2 +- .../multi_stage_diffusion/gaussian_diffusion.py | 5 +++-- .../multi_modal/multi_stage_diffusion/model.py | 2 +- .../multi_modal/multi_stage_diffusion/prior.py | 2 +- .../multi_stage_diffusion/tokenizer.py | 3 ++- .../multi_stage_diffusion/upsampler.py | 2 +- .../multi_modal/multi_stage_diffusion/xglm.py | 5 +++-- .../models/multi_modal/team/team_model.py | 1 + modelscope/models/multi_modal/team/utils.py | 11 +++++++---- .../pipelines/cv/animal_recognition_pipeline.py | 1 + .../cv/cmdssl_video_embedding_pipeline.py | 2 ++ .../cv/general_recognition_pipeline.py | 1 + 
.../pipelines/cv/live_category_pipeline.py | 2 +- .../pipelines/cv/shop_segmentation_pipleline.py | 1 + .../cv/text_driven_segmentation_pipleline.py | 1 + .../pipelines/cv/video_category_pipeline.py | 2 +- ...generative_multi_modal_embedding_pipeline.py | 2 +- .../team_multi_modal_similarity_pipeline.py | 3 +-- tests/pipelines/test_cmdssl_video_embedding.py | 2 +- .../test_generative_multi_modal_embedding.py | 2 +- tests/pipelines/test_multi_modal_similarity.py | 2 +- 64 files changed, 188 insertions(+), 106 deletions(-) diff --git a/modelscope/models/cv/action_recognition/models.py b/modelscope/models/cv/action_recognition/models.py index a5964e21..f16805fb 100644 --- a/modelscope/models/cv/action_recognition/models.py +++ b/modelscope/models/cv/action_recognition/models.py @@ -1,3 +1,6 @@ +# The implementation is also open-sourced by the authors, +# and available at https://github.com/alibaba-mmai-research/TAdaConv +# Copyright 2021-2022 The Alibaba FVI Team Authors. All rights reserved. import torch.nn as nn from .s3dg import Inception3D diff --git a/modelscope/models/cv/action_recognition/s3dg.py b/modelscope/models/cv/action_recognition/s3dg.py index f258df16..46e76892 100644 --- a/modelscope/models/cv/action_recognition/s3dg.py +++ b/modelscope/models/cv/action_recognition/s3dg.py @@ -1,3 +1,6 @@ +# The implementation is adopted from https://github.com/TengdaHan/CoCLR, +# made pubicly available under the Apache License, Version 2.0 at https://github.com/TengdaHan/CoCLR +# Copyright 2021-2022 The Alibaba FVI Team Authors. All rights reserved. import torch import torch.nn as nn diff --git a/modelscope/models/cv/action_recognition/tada_convnext.py b/modelscope/models/cv/action_recognition/tada_convnext.py index 379b5271..b1de7af8 100644 --- a/modelscope/models/cv/action_recognition/tada_convnext.py +++ b/modelscope/models/cv/action_recognition/tada_convnext.py @@ -1,3 +1,7 @@ +# The implementation is adopted from https://github.com/facebookresearch/ConvNeXt, +# made pubicly available under the MIT License at https://github.com/facebookresearch/ConvNeXt +# Copyright 2021-2022 The Alibaba FVI Team Authors. All rights reserved. 
+ import math import torch diff --git a/modelscope/models/cv/animal_recognition/resnet.py b/modelscope/models/cv/animal_recognition/resnet.py index 73953de4..d7c03c29 100644 --- a/modelscope/models/cv/animal_recognition/resnet.py +++ b/modelscope/models/cv/animal_recognition/resnet.py @@ -1,3 +1,6 @@ +# The implementation is adopted from Split-Attention Network, A New ResNet Variant, +# made pubicly available under the Apache License 2.0 License +# at https://github.com/zhanghang1989/ResNeSt/blob/master/resnest/torch/models/resnet.py import math import torch diff --git a/modelscope/models/cv/animal_recognition/splat.py b/modelscope/models/cv/animal_recognition/splat.py index 0aab555e..a10d0abe 100644 --- a/modelscope/models/cv/animal_recognition/splat.py +++ b/modelscope/models/cv/animal_recognition/splat.py @@ -1,3 +1,6 @@ +# The implementation is adopted from Split-Attention Network, A New ResNet Variant, +# made pubicly available under the Apache License 2.0 License +# at https://github.com/zhanghang1989/ResNeSt/blob/master/resnest/torch/models/splat.py """Split-Attention""" import torch diff --git a/modelscope/models/cv/cmdssl_video_embedding/__init__.py b/modelscope/models/cv/cmdssl_video_embedding/__init__.py index e7e156a5..5bc67b63 100644 --- a/modelscope/models/cv/cmdssl_video_embedding/__init__.py +++ b/modelscope/models/cv/cmdssl_video_embedding/__init__.py @@ -1,4 +1,5 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. + from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule diff --git a/modelscope/models/cv/cmdssl_video_embedding/c3d.py b/modelscope/models/cv/cmdssl_video_embedding/c3d.py index 62f0e0b9..53dd05a1 100644 --- a/modelscope/models/cv/cmdssl_video_embedding/c3d.py +++ b/modelscope/models/cv/cmdssl_video_embedding/c3d.py @@ -1,3 +1,11 @@ +# Copyright 2022 Davide Abati. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. + +# The implementation here is modified based on c3d-pytorch, +# originally MIT License, Copyright (c) 2022 Davide Abati, +# and publicly available at https://github.com/DavideA/c3d-pytorch +""" C3D Model Architecture.""" + import torch import torch.nn as nn diff --git a/modelscope/models/cv/cmdssl_video_embedding/resnet2p1d.py b/modelscope/models/cv/cmdssl_video_embedding/resnet2p1d.py index 3b03cc74..b49069d1 100644 --- a/modelscope/models/cv/cmdssl_video_embedding/resnet2p1d.py +++ b/modelscope/models/cv/cmdssl_video_embedding/resnet2p1d.py @@ -1,3 +1,11 @@ +# Copyright (c) 2022 Kensho Hara. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. + +# The implementation here is modified based on 3D-ResNets-PyTorch, +# originally MIT License, Copyright (c) 2022 Kensho Hara, +# and publicly available at https://github.com/kenshohara/3D-ResNets-PyTorch/blob/master/models/resnet2p1d.py +""" ResNet2plus1d Model Architecture.""" + import torch import torch.nn as nn diff --git a/modelscope/models/cv/cmdssl_video_embedding/resnet3d.py b/modelscope/models/cv/cmdssl_video_embedding/resnet3d.py index 24d50a8e..dddba06f 100644 --- a/modelscope/models/cv/cmdssl_video_embedding/resnet3d.py +++ b/modelscope/models/cv/cmdssl_video_embedding/resnet3d.py @@ -1,3 +1,11 @@ +# Copyright (c) 2022 Kensho Hara. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
+ +# The implementation here is modified based on 3D-ResNets-PyTorch, +# originally MIT License, Copyright (c) 2022 Kensho Hara, +# and publicly available at https://github.com/kenshohara/3D-ResNets-PyTorch/blob/master/models/resnet.py +""" ResNet3D Model Architecture.""" + import torch import torch.nn as nn diff --git a/modelscope/models/cv/shop_segmentation/common.py b/modelscope/models/cv/shop_segmentation/common.py index 00ba9996..8cb940a5 100644 --- a/modelscope/models/cv/shop_segmentation/common.py +++ b/modelscope/models/cv/shop_segmentation/common.py @@ -1,11 +1,9 @@ -""" -Base modules are adapted from https://github.com/open-mmlab/mmcv/, -originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, -https://github.com/open-mmlab/mmsegmentation/, -originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, -and adapted from https://github.com/raoyongming/DenseCLIP/, -originally MIT License, Copyright (c) 2022 Rao, Yongming. -""" +# Base modules are adapted from https://github.com/open-mmlab/mmcv/, +# originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, +# https://github.com/open-mmlab/mmsegmentation/, +# originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, +# and adapted from https://github.com/raoyongming/DenseCLIP/, +# originally MIT License, Copyright (c) 2022 Rao, Yongming. import warnings diff --git a/modelscope/models/cv/shop_segmentation/head_fpn.py b/modelscope/models/cv/shop_segmentation/head_fpn.py index b3faa9b8..cad389c7 100644 --- a/modelscope/models/cv/shop_segmentation/head_fpn.py +++ b/modelscope/models/cv/shop_segmentation/head_fpn.py @@ -1,11 +1,9 @@ -""" FPNHead -Base modules are adapted from https://github.com/open-mmlab/mmcv/, -originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, -https://github.com/open-mmlab/mmsegmentation/, -originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, -and adapted from https://github.com/raoyongming/DenseCLIP/, -originally MIT License, Copyright (c) 2022 Rao, Yongming. -""" +# Base modules are adapted from https://github.com/open-mmlab/mmcv/, +# originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, +# https://github.com/open-mmlab/mmsegmentation/, +# originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, +# and adapted from https://github.com/raoyongming/DenseCLIP/, +# originally MIT License, Copyright (c) 2022 Rao, Yongming. import numpy as np import torch diff --git a/modelscope/models/cv/shop_segmentation/models.py b/modelscope/models/cv/shop_segmentation/models.py index 171aafbd..3880d074 100644 --- a/modelscope/models/cv/shop_segmentation/models.py +++ b/modelscope/models/cv/shop_segmentation/models.py @@ -1,11 +1,9 @@ -""" -Base modules are adapted from https://github.com/open-mmlab/mmcv/, -originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, -https://github.com/open-mmlab/mmsegmentation/, -originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, -and adapted from https://github.com/raoyongming/DenseCLIP/, -originally MIT License, Copyright (c) 2022 Rao, Yongming. -""" +# Base modules are adapted from https://github.com/open-mmlab/mmcv/, +# originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, +# https://github.com/open-mmlab/mmsegmentation/, +# originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, +# and adapted from https://github.com/raoyongming/DenseCLIP/, +# originally MIT License, Copyright (c) 2022 Rao, Yongming. 
import math from collections import OrderedDict diff --git a/modelscope/models/cv/shop_segmentation/neck_fpn.py b/modelscope/models/cv/shop_segmentation/neck_fpn.py index 108cb043..aa4d7159 100644 --- a/modelscope/models/cv/shop_segmentation/neck_fpn.py +++ b/modelscope/models/cv/shop_segmentation/neck_fpn.py @@ -1,11 +1,9 @@ -""" FPNneck -Base modules are adapted from https://github.com/open-mmlab/mmcv/, -originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, -https://github.com/open-mmlab/mmsegmentation/, -originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, -and adapted from https://github.com/raoyongming/DenseCLIP/, -originally MIT License, Copyright (c) 2022 Rao, Yongming. -""" +# Base modules are adapted from https://github.com/open-mmlab/mmcv/, +# originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, +# https://github.com/open-mmlab/mmsegmentation/, +# originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, +# and adapted from https://github.com/raoyongming/DenseCLIP/, +# originally MIT License, Copyright (c) 2022 Rao, Yongming. import torch.nn as nn import torch.nn.functional as F diff --git a/modelscope/models/cv/shop_segmentation/shop_seg_base.py b/modelscope/models/cv/shop_segmentation/shop_seg_base.py index e3ae0d54..34686370 100644 --- a/modelscope/models/cv/shop_segmentation/shop_seg_base.py +++ b/modelscope/models/cv/shop_segmentation/shop_seg_base.py @@ -1,11 +1,9 @@ -""" -Base modules are adapted from https://github.com/open-mmlab/mmcv/, -originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, -https://github.com/open-mmlab/mmsegmentation/, -originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, -and adapted from https://github.com/raoyongming/DenseCLIP/, -originally MIT License, Copyright (c) 2022 Rao, Yongming. -""" +# Base modules are adapted from https://github.com/open-mmlab/mmcv/, +# originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, +# https://github.com/open-mmlab/mmsegmentation/, +# originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, +# and adapted from https://github.com/raoyongming/DenseCLIP/, +# originally MIT License, Copyright (c) 2022 Rao, Yongming. import torch import torch.nn as nn diff --git a/modelscope/models/cv/shop_segmentation/shop_seg_model.py b/modelscope/models/cv/shop_segmentation/shop_seg_model.py index 0aeeb1de..ac0d67fa 100644 --- a/modelscope/models/cv/shop_segmentation/shop_seg_model.py +++ b/modelscope/models/cv/shop_segmentation/shop_seg_model.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os.path as osp from typing import Any, Dict diff --git a/modelscope/models/cv/shop_segmentation/utils.py b/modelscope/models/cv/shop_segmentation/utils.py index c41f8a65..4035b0ef 100644 --- a/modelscope/models/cv/shop_segmentation/utils.py +++ b/modelscope/models/cv/shop_segmentation/utils.py @@ -1,7 +1,6 @@ -""" CLIP Tokenizer -Adapted from https://github.com/openai/CLIP. -Originally MIT License, Copyright (c) 2021 OpenAI. -""" +# CLIP Tokenizer +# Adapted from https://github.com/openai/CLIP. +# Originally MIT License, Copyright (c) 2021 OpenAI. import gzip import html diff --git a/modelscope/models/cv/text_driven_segmentation/__init__.py b/modelscope/models/cv/text_driven_segmentation/__init__.py index 46daad78..aefaa698 100644 --- a/modelscope/models/cv/text_driven_segmentation/__init__.py +++ b/modelscope/models/cv/text_driven_segmentation/__init__.py @@ -1 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
from .lseg_base import TextDrivenSegmentation diff --git a/modelscope/models/cv/text_driven_segmentation/clip.py b/modelscope/models/cv/text_driven_segmentation/clip.py index 440cccea..1cec5f39 100644 --- a/modelscope/models/cv/text_driven_segmentation/clip.py +++ b/modelscope/models/cv/text_driven_segmentation/clip.py @@ -1,7 +1,6 @@ -""" CLIP -Adapted from https://github.com/openai/CLIP. -Originally MIT License, Copyright (c) 2021 OpenAI. -""" +# CLIP +# Adapted from https://github.com/openai/CLIP. +# Originally MIT License, Copyright (c) 2021 OpenAI. import hashlib import os diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_base.py b/modelscope/models/cv/text_driven_segmentation/lseg_base.py index 20915396..c79861a7 100644 --- a/modelscope/models/cv/text_driven_segmentation/lseg_base.py +++ b/modelscope/models/cv/text_driven_segmentation/lseg_base.py @@ -1,7 +1,5 @@ -""" -Adapted from https://github.com/isl-org/lang-seg. -Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. -""" +# Adapted from https://github.com/isl-org/lang-seg. +# Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. import torch import torch.nn as nn diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_blocks.py b/modelscope/models/cv/text_driven_segmentation/lseg_blocks.py index cb550ab7..56d4a65d 100644 --- a/modelscope/models/cv/text_driven_segmentation/lseg_blocks.py +++ b/modelscope/models/cv/text_driven_segmentation/lseg_blocks.py @@ -1,7 +1,5 @@ -""" -Adapted from https://github.com/isl-org/lang-seg. -Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. -""" +# Adapted from https://github.com/isl-org/lang-seg. +# Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. import torch import torch.nn as nn diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_model.py b/modelscope/models/cv/text_driven_segmentation/lseg_model.py index 1d7ebdd1..9a5754c6 100644 --- a/modelscope/models/cv/text_driven_segmentation/lseg_model.py +++ b/modelscope/models/cv/text_driven_segmentation/lseg_model.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os.path as osp from typing import Any, Dict diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_net.py b/modelscope/models/cv/text_driven_segmentation/lseg_net.py index 1a558c5c..541a4a38 100644 --- a/modelscope/models/cv/text_driven_segmentation/lseg_net.py +++ b/modelscope/models/cv/text_driven_segmentation/lseg_net.py @@ -1,7 +1,5 @@ -""" -Adapted from https://github.com/isl-org/lang-seg. -Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. -""" +# Adapted from https://github.com/isl-org/lang-seg. +# Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. import numpy as np import torch diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_vit.py b/modelscope/models/cv/text_driven_segmentation/lseg_vit.py index be2813c2..5298832f 100644 --- a/modelscope/models/cv/text_driven_segmentation/lseg_vit.py +++ b/modelscope/models/cv/text_driven_segmentation/lseg_vit.py @@ -1,7 +1,5 @@ -""" -Adapted from https://github.com/isl-org/lang-seg. -Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. -""" +# Adapted from https://github.com/isl-org/lang-seg. +# Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. 
import math import types diff --git a/modelscope/models/cv/text_driven_segmentation/model.py b/modelscope/models/cv/text_driven_segmentation/model.py index ece10bab..f98d480d 100644 --- a/modelscope/models/cv/text_driven_segmentation/model.py +++ b/modelscope/models/cv/text_driven_segmentation/model.py @@ -1,7 +1,5 @@ -""" -Adapted from https://github.com/isl-org/lang-seg. -Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. -""" +# Adapted from https://github.com/isl-org/lang-seg. +# Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. from collections import OrderedDict from typing import Tuple, Union diff --git a/modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py b/modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py index 250d680f..361d67c6 100644 --- a/modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py +++ b/modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py @@ -1,7 +1,6 @@ -""" CLIP -Adapted from https://github.com/openai/CLIP. -Originally MIT License, Copyright (c) 2021 OpenAI. -""" +# CLIP +# Adapted from https://github.com/openai/CLIP. +# Originally MIT License, Copyright (c) 2021 OpenAI. import gzip import html diff --git a/modelscope/models/multi_modal/diffusion/diffusion.py b/modelscope/models/multi_modal/diffusion/diffusion.py index d71fe0ae..bfe7baf7 100644 --- a/modelscope/models/multi_modal/diffusion/diffusion.py +++ b/modelscope/models/multi_modal/diffusion/diffusion.py @@ -1,3 +1,6 @@ +# Part of the implementation is borrowed and modified from latent-diffusion, +# publicly available at https://github.com/CompVis/latent-diffusion. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import math import torch diff --git a/modelscope/models/multi_modal/diffusion/model.py b/modelscope/models/multi_modal/diffusion/model.py index 8617b8dd..4229391f 100644 --- a/modelscope/models/multi_modal/diffusion/model.py +++ b/modelscope/models/multi_modal/diffusion/model.py @@ -1,3 +1,4 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import os.path as osp from typing import Any, Dict diff --git a/modelscope/models/multi_modal/diffusion/unet_generator.py b/modelscope/models/multi_modal/diffusion/unet_generator.py index 9b507223..539d3996 100644 --- a/modelscope/models/multi_modal/diffusion/unet_generator.py +++ b/modelscope/models/multi_modal/diffusion/unet_generator.py @@ -1,3 +1,6 @@ +# Part of the implementation is borrowed and modified from latent-diffusion, +# publicly available at https://github.com/CompVis/latent-diffusion. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import math import torch diff --git a/modelscope/models/multi_modal/diffusion/unet_upsampler_1024.py b/modelscope/models/multi_modal/diffusion/unet_upsampler_1024.py index 1c66b2fe..38cff6a2 100644 --- a/modelscope/models/multi_modal/diffusion/unet_upsampler_1024.py +++ b/modelscope/models/multi_modal/diffusion/unet_upsampler_1024.py @@ -1,3 +1,6 @@ +# Part of the implementation is borrowed and modified from latent-diffusion, +# publicly available at https://github.com/CompVis/latent-diffusion. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math import torch diff --git a/modelscope/models/multi_modal/diffusion/unet_upsampler_256.py b/modelscope/models/multi_modal/diffusion/unet_upsampler_256.py index 0da8b805..ca5cd7d6 100644 --- a/modelscope/models/multi_modal/diffusion/unet_upsampler_256.py +++ b/modelscope/models/multi_modal/diffusion/unet_upsampler_256.py @@ -1,3 +1,6 @@ +# Part of the implementation is borrowed and modified from latent-diffusion, +# publicly available at https://github.com/CompVis/latent-diffusion. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import math from functools import partial diff --git a/modelscope/models/multi_modal/gemm/gemm_base.py b/modelscope/models/multi_modal/gemm/gemm_base.py index db928212..09ef2480 100644 --- a/modelscope/models/multi_modal/gemm/gemm_base.py +++ b/modelscope/models/multi_modal/gemm/gemm_base.py @@ -1,9 +1,14 @@ -""" Generative Multimodal Model -Base modules are adapted from https://github.com/openai/CLIP/, -originally MIT License, Copyright (c) 2021 OpenAI, -and adapted from https://github.com/lucidrains/CoCa-pytorch/, -originally MIT License, Copyright (c) 2022 Phil Wang. -""" +# Copyright 2021 The OpenAI Team Authors. +# Copyright 2022 Phil Wang. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. +# +# The implementation here is modified based on OpenAI CLIP, +# originally MIT License, Copyright (c) 2021 OpenAI, +# and publicly available at https://github.com/openai/CLIP/. +# The implementation here is modified based on CoCa-pytorch, +# originally MIT License, Copyright (c) 2022 Phil Wang, +# and publicly available at https://github.com/lucidrains/CoCa-pytorch/. +""" Generative Multimodal Model Architecture.""" import os from collections import OrderedDict diff --git a/modelscope/models/multi_modal/gemm/gemm_model.py b/modelscope/models/multi_modal/gemm/gemm_model.py index 356dc8d3..55b211c0 100644 --- a/modelscope/models/multi_modal/gemm/gemm_model.py +++ b/modelscope/models/multi_modal/gemm/gemm_model.py @@ -1,3 +1,5 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. +""" Generative Multimodal Model Wrapper.""" import os.path as osp from typing import Any, Dict diff --git a/modelscope/models/multi_modal/gemm/tokenizer.py b/modelscope/models/multi_modal/gemm/tokenizer.py index af962ceb..8b7cc094 100644 --- a/modelscope/models/multi_modal/gemm/tokenizer.py +++ b/modelscope/models/multi_modal/gemm/tokenizer.py @@ -1,7 +1,11 @@ -""" CLIP Tokenizer -Adapted from https://github.com/openai/CLIP. -Originally MIT License, Copyright (c) 2021 OpenAI. -""" +# Copyright 2021 The OpenAI Team Authors. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. +# +# The implementation here is modified based on OpenAI CLIP, +# originally MIT License, Copyright (c) 2021 OpenAI, +# and publicly available at https://github.com/openai/CLIP/. +""" CLIP Tokenizer.""" + import gzip import html import os diff --git a/modelscope/models/multi_modal/mmr/__init__.py b/modelscope/models/multi_modal/mmr/__init__.py index c5fb7419..9dac8409 100644 --- a/modelscope/models/multi_modal/mmr/__init__.py +++ b/modelscope/models/multi_modal/mmr/__init__.py @@ -1 +1,3 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+ from .models import VideoCLIPForMultiModalEmbedding diff --git a/modelscope/models/multi_modal/mmr/dataloaders/rawvideo_util.py b/modelscope/models/multi_modal/mmr/dataloaders/rawvideo_util.py index eab1189f..c7ac3f94 100644 --- a/modelscope/models/multi_modal/mmr/dataloaders/rawvideo_util.py +++ b/modelscope/models/multi_modal/mmr/dataloaders/rawvideo_util.py @@ -1,3 +1,6 @@ +# The implementation is adopted from CLIP4Clip by Huaishao Luo, +# made publicly available under the MIT License at https://github.com/ArrowLuo/CLIP4Clip + import cv2 import numpy as np import torch as th diff --git a/modelscope/models/multi_modal/mmr/models/__init__.py b/modelscope/models/multi_modal/mmr/models/__init__.py index 6cd06bcd..da832719 100644 --- a/modelscope/models/multi_modal/mmr/models/__init__.py +++ b/modelscope/models/multi_modal/mmr/models/__init__.py @@ -1 +1,3 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. + from .clip_for_mm_video_embedding import VideoCLIPForMultiModalEmbedding diff --git a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py index 8d13e745..5e8e2e7a 100644 --- a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py +++ b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py @@ -1,3 +1,6 @@ +# The implementation is adopted from the CLIP4Clip implementation, +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip + import random from os.path import exists from typing import Any, Dict diff --git a/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py b/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py index 572f44bc..253a847c 100644 --- a/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py +++ b/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py @@ -1,3 +1,6 @@ +# The implementation is adopted from the CLIP4Clip implementation, +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip + import numpy as np diff --git a/modelscope/models/multi_modal/mmr/models/modeling.py b/modelscope/models/multi_modal/mmr/models/modeling.py index 21cc4c80..dc6510bf 100644 --- a/modelscope/models/multi_modal/mmr/models/modeling.py +++ b/modelscope/models/multi_modal/mmr/models/modeling.py @@ -1,3 +1,5 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. + import os import platform from collections import OrderedDict diff --git a/modelscope/models/multi_modal/mmr/models/module_clip.py b/modelscope/models/multi_modal/mmr/models/module_clip.py index 36e56196..53501720 100644 --- a/modelscope/models/multi_modal/mmr/models/module_clip.py +++ b/modelscope/models/multi_modal/mmr/models/module_clip.py @@ -1,4 +1,5 @@ -# Part of the implementation is borrowed and modified from The OpenAI CLIP project.
+# The implementation is adopted from the CLIP4Clip implementation, +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip import hashlib import os diff --git a/modelscope/models/multi_modal/mmr/models/module_cross.py b/modelscope/models/multi_modal/mmr/models/module_cross.py index 05edb853..b958d5bc 100644 --- a/modelscope/models/multi_modal/mmr/models/module_cross.py +++ b/modelscope/models/multi_modal/mmr/models/module_cross.py @@ -1,3 +1,6 @@ +# The implementation is adopted from the CLIP4Clip implementation, +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip + from __future__ import absolute_import, division, print_function import logging from collections import OrderedDict diff --git a/modelscope/models/multi_modal/mmr/models/tokenization_clip.py b/modelscope/models/multi_modal/mmr/models/tokenization_clip.py index ee60f857..4e2c9b15 100644 --- a/modelscope/models/multi_modal/mmr/models/tokenization_clip.py +++ b/modelscope/models/multi_modal/mmr/models/tokenization_clip.py @@ -1,3 +1,6 @@ +# The implementation is adopted from the CLIP4Clip implementation, +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip + import gzip import html import os diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/clip.py b/modelscope/models/multi_modal/multi_stage_diffusion/clip.py index 54e971f7..98727066 100644 --- a/modelscope/models/multi_modal/multi_stage_diffusion/clip.py +++ b/modelscope/models/multi_modal/multi_stage_diffusion/clip.py @@ -1,4 +1,5 @@ -# The implementation here is modified based on OpenAI CLIP, publicly available at https://github.com/openai/CLIP. +# Part of the implementation is borrowed and modified from CLIP, publicly available at https://github.com/openai/CLIP. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import math diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/decoder.py b/modelscope/models/multi_modal/multi_stage_diffusion/decoder.py index 17daedaf..eb52a48b 100644 --- a/modelscope/models/multi_modal/multi_stage_diffusion/decoder.py +++ b/modelscope/models/multi_modal/multi_stage_diffusion/decoder.py @@ -1,4 +1,4 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import math diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/gaussian_diffusion.py b/modelscope/models/multi_modal/multi_stage_diffusion/gaussian_diffusion.py index a4fc52e0..9677d7c4 100644 --- a/modelscope/models/multi_modal/multi_stage_diffusion/gaussian_diffusion.py +++ b/modelscope/models/multi_modal/multi_stage_diffusion/gaussian_diffusion.py @@ -1,5 +1,6 @@ -# The implementation here is modified based on latent diffusion, publicly available -# at https://github.com/CompVis/latent-diffusion. +# Part of the implementation is borrowed and modified from latent-diffusion, +# publicly available at https://github.com/CompVis/latent-diffusion. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import math diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/model.py b/modelscope/models/multi_modal/multi_stage_diffusion/model.py index c2d83b34..59bd837d 100644 --- a/modelscope/models/multi_modal/multi_stage_diffusion/model.py +++ b/modelscope/models/multi_modal/multi_stage_diffusion/model.py @@ -1,4 +1,4 @@ -# Copyright (c) Alibaba, Inc.
and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import math import os.path as osp diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/prior.py b/modelscope/models/multi_modal/multi_stage_diffusion/prior.py index 380fa467..9f4ef2d5 100644 --- a/modelscope/models/multi_modal/multi_stage_diffusion/prior.py +++ b/modelscope/models/multi_modal/multi_stage_diffusion/prior.py @@ -1,4 +1,4 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import math diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/tokenizer.py b/modelscope/models/multi_modal/multi_stage_diffusion/tokenizer.py index 6fd9bebe..59d6b304 100644 --- a/modelscope/models/multi_modal/multi_stage_diffusion/tokenizer.py +++ b/modelscope/models/multi_modal/multi_stage_diffusion/tokenizer.py @@ -1,4 +1,5 @@ -# The implementation here is modified based on OpenAI CLIP, publicly available at https://github.com/openai/CLIP. +# Part of the implementation is borrowed and modified from CLIP, publicly available at https://github.com/openai/CLIP. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import gzip import html diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/upsampler.py b/modelscope/models/multi_modal/multi_stage_diffusion/upsampler.py index 4e99a514..a292edae 100644 --- a/modelscope/models/multi_modal/multi_stage_diffusion/upsampler.py +++ b/modelscope/models/multi_modal/multi_stage_diffusion/upsampler.py @@ -1,4 +1,4 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import math diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/xglm.py b/modelscope/models/multi_modal/multi_stage_diffusion/xglm.py index 8a0b3ff1..133da50b 100644 --- a/modelscope/models/multi_modal/multi_stage_diffusion/xglm.py +++ b/modelscope/models/multi_modal/multi_stage_diffusion/xglm.py @@ -1,5 +1,6 @@ -# The implementation here is modified based on HuggingFace XGLM, publicly available -# at https://github.com/huggingface/transformers. +# Part of the implementation is borrowed and modified from HuggingFace XGLM, +# publicly available at https://github.com/huggingface/transformers. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import math diff --git a/modelscope/models/multi_modal/team/team_model.py b/modelscope/models/multi_modal/team/team_model.py index 4aa77e17..8c0e288a 100644 --- a/modelscope/models/multi_modal/team/team_model.py +++ b/modelscope/models/multi_modal/team/team_model.py @@ -1,3 +1,4 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. from typing import Any, Dict import cv2 diff --git a/modelscope/models/multi_modal/team/utils.py b/modelscope/models/multi_modal/team/utils.py index 3b3e394e..73919179 100644 --- a/modelscope/models/multi_modal/team/utils.py +++ b/modelscope/models/multi_modal/team/utils.py @@ -1,7 +1,10 @@ -""" Generative Multimodal Model -Base Transformer code is adapted from https://github.com/openai/CLIP/, -originally MIT License, Copyright (c) 2021 OpenAI, -""" +# Copyright 2021 The OpenAI Team Authors. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+# +# The implementation here is modified based on OpenAI CLIP, +# originally MIT License, Copyright (c) 2021 OpenAI, +# and publicly available at https://github.com/openai/CLIP/. + from collections import OrderedDict from typing import Tuple, Union diff --git a/modelscope/pipelines/cv/animal_recognition_pipeline.py b/modelscope/pipelines/cv/animal_recognition_pipeline.py index 18cba92c..fad14680 100644 --- a/modelscope/pipelines/cv/animal_recognition_pipeline.py +++ b/modelscope/pipelines/cv/animal_recognition_pipeline.py @@ -1,3 +1,4 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/cv/cmdssl_video_embedding_pipeline.py b/modelscope/pipelines/cv/cmdssl_video_embedding_pipeline.py index 9f4e2d93..deb17561 100644 --- a/modelscope/pipelines/cv/cmdssl_video_embedding_pipeline.py +++ b/modelscope/pipelines/cv/cmdssl_video_embedding_pipeline.py @@ -1,3 +1,5 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. + import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/cv/general_recognition_pipeline.py b/modelscope/pipelines/cv/general_recognition_pipeline.py index 9ba5117b..07222086 100644 --- a/modelscope/pipelines/cv/general_recognition_pipeline.py +++ b/modelscope/pipelines/cv/general_recognition_pipeline.py @@ -1,3 +1,4 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/cv/live_category_pipeline.py b/modelscope/pipelines/cv/live_category_pipeline.py index c16ba6ba..715998cc 100644 --- a/modelscope/pipelines/cv/live_category_pipeline.py +++ b/modelscope/pipelines/cv/live_category_pipeline.py @@ -1,4 +1,4 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/cv/shop_segmentation_pipleline.py b/modelscope/pipelines/cv/shop_segmentation_pipleline.py index b7fd90b4..d08058c3 100644 --- a/modelscope/pipelines/cv/shop_segmentation_pipleline.py +++ b/modelscope/pipelines/cv/shop_segmentation_pipleline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict from modelscope.metainfo import Pipelines diff --git a/modelscope/pipelines/cv/text_driven_segmentation_pipleline.py b/modelscope/pipelines/cv/text_driven_segmentation_pipleline.py index 0985b835..c7f9d4c2 100644 --- a/modelscope/pipelines/cv/text_driven_segmentation_pipleline.py +++ b/modelscope/pipelines/cv/text_driven_segmentation_pipleline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict from modelscope.metainfo import Pipelines diff --git a/modelscope/pipelines/cv/video_category_pipeline.py b/modelscope/pipelines/cv/video_category_pipeline.py index 196d3115..e4c73649 100644 --- a/modelscope/pipelines/cv/video_category_pipeline.py +++ b/modelscope/pipelines/cv/video_category_pipeline.py @@ -1,4 +1,4 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/multi_modal/generative_multi_modal_embedding_pipeline.py b/modelscope/pipelines/multi_modal/generative_multi_modal_embedding_pipeline.py index d3b9fef3..13032314 100644 --- a/modelscope/pipelines/multi_modal/generative_multi_modal_embedding_pipeline.py +++ b/modelscope/pipelines/multi_modal/generative_multi_modal_embedding_pipeline.py @@ -1,4 +1,4 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. from typing import Any, Dict diff --git a/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py b/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py index fc123e2f..cafd6555 100644 --- a/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py +++ b/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py @@ -1,5 +1,4 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. from typing import Any, Dict from modelscope.metainfo import Pipelines diff --git a/tests/pipelines/test_cmdssl_video_embedding.py b/tests/pipelines/test_cmdssl_video_embedding.py index 68eae385..5807c075 100644 --- a/tests/pipelines/test_cmdssl_video_embedding.py +++ b/tests/pipelines/test_cmdssl_video_embedding.py @@ -1,4 +1,4 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. # !/usr/bin/env python import unittest diff --git a/tests/pipelines/test_generative_multi_modal_embedding.py b/tests/pipelines/test_generative_multi_modal_embedding.py index 9232ebd4..7061d736 100644 --- a/tests/pipelines/test_generative_multi_modal_embedding.py +++ b/tests/pipelines/test_generative_multi_modal_embedding.py @@ -1,4 +1,4 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import unittest diff --git a/tests/pipelines/test_multi_modal_similarity.py b/tests/pipelines/test_multi_modal_similarity.py index 192602b4..a54fbcf0 100644 --- a/tests/pipelines/test_multi_modal_similarity.py +++ b/tests/pipelines/test_multi_modal_similarity.py @@ -1,4 +1,4 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import unittest