diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 00000000..9c607acc
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,3 @@
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
index 3e6a3f4a..cc9ef477 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,6 +24,7 @@ wheels/
 .installed.cfg
 *.egg
 /package
+/temp
 MANIFEST

 # PyInstaller
@@ -104,7 +105,6 @@ venv.bak/

 # mypy
 .mypy_cache/
-data

 .vscode
 .idea
@@ -124,3 +124,7 @@ replace.sh

 # Pytorch
 *.pth
+
+
+# audio
+*.wav
diff --git a/data/test/images/image1.jpg b/data/test/images/image1.jpg
new file mode 100644
index 00000000..450a969d
--- /dev/null
+++ b/data/test/images/image1.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78094cc48fbcfd9b6d321fe13619ecc72b65e006fc1b4c4458409ade9979486d
+size 129862
diff --git a/data/test/images/image_captioning.png b/data/test/images/image_captioning.png
new file mode 100644
index 00000000..de3f1918
--- /dev/null
+++ b/data/test/images/image_captioning.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af83a94899a6d23339c3ecc5c4c58c57c835af57b531a2f4c50461184f820141
+size 603621
diff --git a/data/test/images/image_matting.png b/data/test/images/image_matting.png
new file mode 100644
index 00000000..de3f1918
--- /dev/null
+++ b/data/test/images/image_matting.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af83a94899a6d23339c3ecc5c4c58c57c835af57b531a2f4c50461184f820141
+size 603621
diff --git a/data/test/images/ocr_detection.jpg b/data/test/images/ocr_detection.jpg
new file mode 100644
index 00000000..c347810e
--- /dev/null
+++ b/data/test/images/ocr_detection.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c8435db5583400be5d11a2c17910c96133b462c8a99ccaf0e19f4aac34e0a94
+size 141149
diff --git a/docs/source/develop.md b/docs/source/develop.md
index f96590b0..96120088 100644
--- a/docs/source/develop.md
+++ b/docs/source/develop.md
@@ -91,6 +91,55 @@ make tests
 4. Daily regression tests will run all cases at 0 am each day using master branch.

+### 2.3 Test data storage
+
+Since testing requires a large amount of data (images, videos, and models), we use git-lfs
+to store those large files.
+
+1. Install git-lfs.
+For macOS:
+```bash
+brew install git-lfs
+git lfs install
+```
+
+For CentOS, download the rpm package from the git-lfs GitHub release [website](https://github.com/git-lfs/git-lfs/releases/tag/v3.2.0):
+```bash
+wget http://101374-public.oss-cn-hangzhou-zmf.aliyuncs.com/git-lfs-3.2.0-1.el7.x86_64.rpm
+sudo rpm -ivh git-lfs-3.2.0-1.el7.x86_64.rpm
+git lfs install
+```
+
+For Ubuntu:
+```bash
+curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
+sudo apt-get install git-lfs
+git lfs install
+```
+
+2. Track your data type with git-lfs. For example, to track png files:
+```bash
+git lfs track "*.png"
+```
+
+3. Add your test files to the `data/test/` folder; create subdirectories if you need them.
+```bash
+git add data/test/test.png
+```
+
+4. Commit your test data and push it to the remote branch.
+```bash
+git commit -m "xxx"
+```
+
+To pull data from the remote repo, use the same workflow as for regular git files.
+```bash
+git pull origin branch_name
+```
+
+
+
 ## Code Review
 1. Run following command to create an aone CR, replace `TARGET_BRANCH` and `CR_NAME` with the one you want.
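If `git lfs install` or `git lfs pull` is skipped, files tracked by git-lfs (such as the `data/test/images/*` assets above) are checked out as small text pointers starting with `version https://git-lfs.github.com/spec/v1` instead of real binaries, and tests then fail in confusing ways. A minimal sanity check is sketched below; the helper name and the `data/test` default are illustrative assumptions, not part of this change.

```python
# Hypothetical helper (not part of this PR): list files that are still un-smudged
# git-lfs pointers rather than real binary content.
from pathlib import Path

LFS_MAGIC = b'version https://git-lfs.github.com/spec/v1'


def find_unsmudged_pointers(root='data/test'):
    """Return paths under `root` that still look like git-lfs pointer files."""
    suspects = []
    for path in Path(root).rglob('*'):
        # Pointer files are tiny text stubs, so skip anything reasonably large.
        if path.is_file() and path.stat().st_size < 200:
            with open(path, 'rb') as f:
                if f.read(len(LFS_MAGIC)) == LFS_MAGIC:
                    suspects.append(str(path))
    return suspects


if __name__ == '__main__':
    bad = find_unsmudged_pointers()
    if bad:
        print('Run `git lfs pull` to fetch the real content for:')
        print('\n'.join(bad))
```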
diff --git a/docs/source/faq.md b/docs/source/faq.md
index a93fafdc..6ed3b305 100644
--- a/docs/source/faq.md
+++ b/docs/source/faq.md
@@ -29,3 +29,15 @@ reference: [https://huggingface.co/docs/tokenizers/installation#installation-fro
 > ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.

 Version conflicts may be reported because of incompatibilities between dependency versions; in most cases they do not affect normal use.
+
+### 3. Version error when installing pytorch
+
+> ERROR: Ignored the following versions that require a different python version: 1.1.0 Requires-Python >=3.8; 1.1.0rc1 Requires-Python >=3.8; 1.1.1 Requires-Python >=3.8
+> ERROR: Could not find a version that satisfies the requirement torch==1.8.1+cu111 (from versions: 1.0.0, 1.0.1, 1.0.1.post2, 1.1.0, 1.2.0, 1.3.0, 1.3.1, 1.4.0, 1.5.0, 1.5.1, 1.6.0, 1.7.0, 1.7.1, 1.8.0, 1.8.1, 1.9.0, 1.9.1, 1.10.0, 1.10.1, 1.10.2, 1.11.0)
+> ERROR: No matching distribution found for torch==1.8.1+cu111
+
+Use the following command to install:
+
+```shell
+pip install -f https://download.pytorch.org/whl/torch_stable.html -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt
+```
diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md
index 0f4cbbc3..7148f27f 100644
--- a/docs/source/quick_start.md
+++ b/docs/source/quick_start.md
@@ -25,6 +25,10 @@ The ModelScope Library currently supports model training and inference with the tensorflow and pytorch deep learning frameworks.
 * [Pytorch installation guide](https://pytorch.org/get-started/locally/)
 * [Tensorflow installation guide](https://www.tensorflow.org/install/pip)

+Some third-party dependencies require numpy to be installed in advance:
+```
+pip install numpy
+```

 ## ModelScope library installation

diff --git a/modelscope/models/__init__.py b/modelscope/models/__init__.py
index d9a89d35..f873dcca 100644
--- a/modelscope/models/__init__.py
+++ b/modelscope/models/__init__.py
@@ -1,5 +1,8 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
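+# New top-level exports: the TTS acoustic model and vocoder (audio) and OFA image captioning (multi_model).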
+from .audio.tts.am import SambertNetHifi16k +from .audio.tts.vocoder import Hifigan16k from .base import Model from .builder import MODELS, build_model +from .multi_model import OfaForImageCaptioning from .nlp import BertForSequenceClassification, SbertForSentenceSimilarity diff --git a/modelscope/models/audio/__init__.py b/modelscope/models/audio/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/audio/layers/__init__.py b/modelscope/models/audio/layers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/audio/layers/activations.py b/modelscope/models/audio/layers/activations.py new file mode 100644 index 00000000..b0215bcc --- /dev/null +++ b/modelscope/models/audio/layers/activations.py @@ -0,0 +1,60 @@ +import torch.nn as nn + +from .layer_base import LayerBase + + +class RectifiedLinear(LayerBase): + + def __init__(self, input_dim, output_dim): + super(RectifiedLinear, self).__init__() + self.dim = input_dim + self.relu = nn.ReLU() + + def forward(self, input): + return self.relu(input) + + def to_kaldi_nnet(self): + re_str = '' + re_str += ' %d %d\n' % (self.dim, self.dim) + return re_str + + def load_kaldi_nnet(self, instr): + return instr + + +class LogSoftmax(LayerBase): + + def __init__(self, input_dim, output_dim): + super(LogSoftmax, self).__init__() + self.dim = input_dim + self.ls = nn.LogSoftmax() + + def forward(self, input): + return self.ls(input) + + def to_kaldi_nnet(self): + re_str = '' + re_str += ' %d %d\n' % (self.dim, self.dim) + return re_str + + def load_kaldi_nnet(self, instr): + return instr + + +class Sigmoid(LayerBase): + + def __init__(self, input_dim, output_dim): + super(Sigmoid, self).__init__() + self.dim = input_dim + self.sig = nn.Sigmoid() + + def forward(self, input): + return self.sig(input) + + def to_kaldi_nnet(self): + re_str = '' + re_str += ' %d %d\n' % (self.dim, self.dim) + return re_str + + def load_kaldi_nnet(self, instr): + return instr diff --git a/modelscope/models/audio/layers/affine_transform.py b/modelscope/models/audio/layers/affine_transform.py new file mode 100644 index 00000000..33479505 --- /dev/null +++ b/modelscope/models/audio/layers/affine_transform.py @@ -0,0 +1,78 @@ +import numpy as np +import torch as th +import torch.nn as nn + +from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number, + to_kaldi_matrix) + + +class AffineTransform(LayerBase): + + def __init__(self, input_dim, output_dim): + super(AffineTransform, self).__init__() + self.input_dim = input_dim + self.output_dim = output_dim + self.linear = nn.Linear(input_dim, output_dim) + + def forward(self, input): + return self.linear(input) + + def to_kaldi_nnet(self): + re_str = '' + re_str += ' %d %d\n' % (self.output_dim, + self.input_dim) + re_str += ' 1 1 0\n' + linear_weights = self.state_dict()['linear.weight'] + x = linear_weights.squeeze().numpy() + re_str += to_kaldi_matrix(x) + linear_bias = self.state_dict()['linear.bias'] + x = linear_bias.squeeze().numpy() + re_str += to_kaldi_matrix(x) + return re_str + + def to_raw_nnet(self, fid): + linear_weights = self.state_dict()['linear.weight'] + x = linear_weights.squeeze().numpy() + x.tofile(fid) + + linear_bias = self.state_dict()['linear.bias'] + x = linear_bias.squeeze().numpy() + x.tofile(fid) + + def load_kaldi_nnet(self, instr): + output = expect_token_number( + instr, + '', + ) + if output is None: + raise Exception('AffineTransform format error for ') + instr, lr = output + + output = 
expect_token_number(instr, '') + if output is None: + raise Exception( + 'AffineTransform format error for ') + instr, lr = output + + output = expect_token_number(instr, '') + if output is None: + raise Exception('AffineTransform format error for ') + instr, lr = output + + output = expect_kaldi_matrix(instr) + if output is None: + raise Exception('AffineTransform format error for parsing matrix') + instr, mat = output + + print(mat.shape) + self.linear.weight = th.nn.Parameter( + th.from_numpy(mat).type(th.FloatTensor)) + + output = expect_kaldi_matrix(instr) + if output is None: + raise Exception('AffineTransform format error for parsing matrix') + instr, mat = output + mat = np.squeeze(mat) + self.linear.bias = th.nn.Parameter( + th.from_numpy(mat).type(th.FloatTensor)) + return instr diff --git a/modelscope/models/audio/layers/deep_fsmn.py b/modelscope/models/audio/layers/deep_fsmn.py new file mode 100644 index 00000000..72ba07dc --- /dev/null +++ b/modelscope/models/audio/layers/deep_fsmn.py @@ -0,0 +1,178 @@ +import numpy as np +import torch as th +import torch.nn as nn +import torch.nn.functional as F + +from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number, + to_kaldi_matrix) + + +class DeepFsmn(LayerBase): + + def __init__(self, + input_dim, + output_dim, + lorder=None, + rorder=None, + hidden_size=None, + layer_norm=False, + dropout=0): + super(DeepFsmn, self).__init__() + + self.input_dim = input_dim + self.output_dim = output_dim + + if lorder is None: + return + + self.lorder = lorder + self.rorder = rorder + self.hidden_size = hidden_size + self.layer_norm = layer_norm + + self.linear = nn.Linear(input_dim, hidden_size) + self.norm = nn.LayerNorm(hidden_size) + self.drop1 = nn.Dropout(p=dropout) + self.drop2 = nn.Dropout(p=dropout) + self.project = nn.Linear(hidden_size, output_dim, bias=False) + + self.conv1 = nn.Conv2d( + output_dim, + output_dim, [lorder, 1], [1, 1], + groups=output_dim, + bias=False) + self.conv2 = nn.Conv2d( + output_dim, + output_dim, [rorder, 1], [1, 1], + groups=output_dim, + bias=False) + + def forward(self, input): + + f1 = F.relu(self.linear(input)) + + f1 = self.drop1(f1) + if self.layer_norm: + f1 = self.norm(f1) + + p1 = self.project(f1) + + x = th.unsqueeze(p1, 1) + + x_per = x.permute(0, 3, 2, 1) + + y = F.pad(x_per, [0, 0, self.lorder - 1, 0]) + yr = F.pad(x_per, [0, 0, 0, self.rorder]) + yr = yr[:, :, 1:, :] + + out = x_per + self.conv1(y) + self.conv2(yr) + out = self.drop2(out) + + out1 = out.permute(0, 3, 2, 1) + + return input + out1.squeeze() + + def to_kaldi_nnet(self): + re_str = '' + re_str += ' %d %d\n'\ + % (self.output_dim, self.input_dim) + re_str += ' %d %d %d %d 0\n'\ + % (1, self.hidden_size, self.lorder, 1) + lfiters = self.state_dict()['conv1.weight'] + x = np.flipud(lfiters.squeeze().numpy().T) + re_str += to_kaldi_matrix(x) + proj_weights = self.state_dict()['project.weight'] + x = proj_weights.squeeze().numpy() + re_str += to_kaldi_matrix(x) + linear_weights = self.state_dict()['linear.weight'] + x = linear_weights.squeeze().numpy() + re_str += to_kaldi_matrix(x) + linear_bias = self.state_dict()['linear.bias'] + x = linear_bias.squeeze().numpy() + re_str += to_kaldi_matrix(x) + return re_str + + def load_kaldi_nnet(self, instr): + output = expect_token_number( + instr, + '', + ) + if output is None: + raise Exception('UniDeepFsmn format error for ') + instr, lr = output + + output = expect_token_number( + instr, + '', + ) + if output is None: + raise Exception('UniDeepFsmn format error for ') + 
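+        # The hidden width, left-context order and stride read next from the Kaldi
+        # header determine the shapes of the conv/linear layers that are rebuilt
+        # from the weight matrices parsed below.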
instr, hiddensize = output + self.hidden_size = int(hiddensize) + + output = expect_token_number( + instr, + '', + ) + if output is None: + raise Exception('UniDeepFsmn format error for ') + instr, lorder = output + self.lorder = int(lorder) + + output = expect_token_number( + instr, + '', + ) + if output is None: + raise Exception('UniDeepFsmn format error for ') + instr, lstride = output + self.lstride = lstride + + output = expect_token_number( + instr, + '', + ) + if output is None: + raise Exception('UniDeepFsmn format error for ') + + output = expect_kaldi_matrix(instr) + if output is None: + raise Exception('UniDeepFsmn format error for parsing matrix') + instr, mat = output + mat1 = np.fliplr(mat.T).copy() + self.conv1 = nn.Conv2d( + self.output_dim, + self.output_dim, [self.lorder, 1], [1, 1], + groups=self.output_dim, + bias=False) + mat_th = th.from_numpy(mat1).type(th.FloatTensor) + mat_th = mat_th.unsqueeze(1) + mat_th = mat_th.unsqueeze(3) + self.conv1.weight = th.nn.Parameter(mat_th) + + output = expect_kaldi_matrix(instr) + if output is None: + raise Exception('UniDeepFsmn format error for parsing matrix') + instr, mat = output + + self.project = nn.Linear(self.hidden_size, self.output_dim, bias=False) + self.linear = nn.Linear(self.input_dim, self.hidden_size) + + self.project.weight = th.nn.Parameter( + th.from_numpy(mat).type(th.FloatTensor)) + + output = expect_kaldi_matrix(instr) + if output is None: + raise Exception('UniDeepFsmn format error for parsing matrix') + instr, mat = output + self.linear.weight = th.nn.Parameter( + th.from_numpy(mat).type(th.FloatTensor)) + + output = expect_kaldi_matrix(instr) + if output is None: + raise Exception('UniDeepFsmn format error for parsing matrix') + instr, mat = output + self.linear.bias = th.nn.Parameter( + th.from_numpy(mat).type(th.FloatTensor)) + + return instr diff --git a/modelscope/models/audio/layers/layer_base.py b/modelscope/models/audio/layers/layer_base.py new file mode 100644 index 00000000..e56c4bc0 --- /dev/null +++ b/modelscope/models/audio/layers/layer_base.py @@ -0,0 +1,50 @@ +import abc +import re + +import numpy as np +import torch.nn as nn + + +def expect_token_number(instr, token): + first_token = re.match(r'^\s*' + token, instr) + if first_token is None: + return None + instr = instr[first_token.end():] + lr = re.match(r'^\s*(-?\d+\.?\d*e?-?\d*?)', instr) + if lr is None: + return None + return instr[lr.end():], lr.groups()[0] + + +def expect_kaldi_matrix(instr): + pos2 = instr.find('[', 0) + pos3 = instr.find(']', pos2) + mat = [] + for stt in instr[pos2 + 1:pos3].split('\n'): + tmp_mat = np.fromstring(stt, dtype=np.float32, sep=' ') + if tmp_mat.size > 0: + mat.append(tmp_mat) + return instr[pos3 + 1:], np.array(mat) + + +def to_kaldi_matrix(np_mat): + """ + function that transform as str numpy mat to standard kaldi str matrix + :param np_mat: numpy mat + :return: str + """ + np.set_printoptions(threshold=np.inf, linewidth=np.nan, suppress=True) + out_str = str(np_mat) + out_str = out_str.replace('[', '') + out_str = out_str.replace(']', '') + return '[ %s ]\n' % out_str + + +class LayerBase(nn.Module, metaclass=abc.ABCMeta): + + def __init__(self): + super(LayerBase, self).__init__() + + @abc.abstractmethod + def to_kaldi_nnet(self): + pass diff --git a/modelscope/models/audio/layers/uni_deep_fsmn.py b/modelscope/models/audio/layers/uni_deep_fsmn.py new file mode 100644 index 00000000..c22460c4 --- /dev/null +++ b/modelscope/models/audio/layers/uni_deep_fsmn.py @@ -0,0 +1,482 @@ +import numpy as np 
+import torch as th +import torch.nn as nn +import torch.nn.functional as F + +from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number, + to_kaldi_matrix) + + +class SepConv(nn.Module): + + def __init__(self, + in_channels, + filters, + out_channels, + kernel_size=(5, 2), + dilation=(1, 1)): + """ :param kernel_size (time, frequency) + + """ + super(SepConv, self).__init__() + # depthwise + pointwise + self.dconv = nn.Conv2d( + in_channels, + in_channels * filters, + kernel_size, + dilation=dilation, + groups=in_channels) + self.pconv = nn.Conv2d( + in_channels * filters, out_channels, kernel_size=1) + self.padding = dilation[0] * (kernel_size[0] - 1) + + def forward(self, input): + ''' input: [B, C, T, F] + ''' + x = F.pad(input, [0, 0, self.padding, 0]) + x = self.dconv(x) + x = self.pconv(x) + return x + + +class Conv2d(nn.Module): + + def __init__(self, + input_dim, + output_dim, + lorder=20, + rorder=0, + groups=1, + bias=False, + skip_connect=True): + super(Conv2d, self).__init__() + self.lorder = lorder + self.conv = nn.Conv2d( + input_dim, output_dim, [lorder, 1], groups=groups, bias=bias) + self.rorder = rorder + if self.rorder: + self.conv2 = nn.Conv2d( + input_dim, output_dim, [rorder, 1], groups=groups, bias=bias) + self.skip_connect = skip_connect + + def forward(self, input): + # [B, 1, T, F] + x = th.unsqueeze(input, 1) + # [B, F, T, 1] + x_per = x.permute(0, 3, 2, 1) + y = F.pad(x_per, [0, 0, self.lorder - 1, 0]) + out = self.conv(y) + if self.rorder: + yr = F.pad(x_per, [0, 0, 0, self.rorder]) + yr = yr[:, :, 1:, :] + out += self.conv2(yr) + out = out.permute(0, 3, 2, 1).squeeze(1) + if self.skip_connect: + out = out + input + return out + + +class SelfAttLayer(nn.Module): + + def __init__(self, input_dim, output_dim, lorder=None, hidden_size=None): + super(SelfAttLayer, self).__init__() + + self.input_dim = input_dim + self.output_dim = output_dim + + if lorder is None: + return + + self.lorder = lorder + self.hidden_size = hidden_size + + self.linear = nn.Linear(input_dim, hidden_size) + + self.project = nn.Linear(hidden_size, output_dim, bias=False) + + self.att = nn.Linear(input_dim, lorder, bias=False) + + def forward(self, input): + + f1 = F.relu(self.linear(input)) + + p1 = self.project(f1) + + x = th.unsqueeze(p1, 1) + + x_per = x.permute(0, 3, 2, 1) + + y = F.pad(x_per, [0, 0, self.lorder - 1, 0]) + + # z [B, F, T, lorder] + z = x_per + for i in range(1, self.lorder): + z = th.cat([z, y[:, :, self.lorder - 1 - i:-i, :]], axis=-1) + + # [B, T, lorder] + att = F.softmax(self.att(input), dim=-1) + att = th.unsqueeze(att, 1) + z = th.sum(z * att, axis=-1) + + out1 = z.permute(0, 2, 1) + + return input + out1 + + +class TFFsmn(nn.Module): + + def __init__(self, + input_dim, + output_dim, + lorder=None, + hidden_size=None, + dilation=1, + layer_norm=False, + dropout=0, + skip_connect=True): + super(TFFsmn, self).__init__() + + self.skip_connect = skip_connect + + self.linear = nn.Linear(input_dim, hidden_size) + self.norm = nn.Identity() + if layer_norm: + self.norm = nn.LayerNorm(input_dim) + self.act = nn.ReLU() + self.project = nn.Linear(hidden_size, output_dim, bias=False) + + self.conv1 = nn.Conv2d( + output_dim, + output_dim, [lorder, 1], + dilation=[dilation, 1], + groups=output_dim, + bias=False) + self.padding_left = dilation * (lorder - 1) + dorder = 5 + self.conv2 = nn.Conv2d(1, 1, [dorder, 1], bias=False) + self.padding_freq = dorder - 1 + + def forward(self, input): + return self.compute1(input) + + def compute1(self, input): + ''' 
linear-dconv-relu(norm)-linear-dconv + ''' + x = self.linear(input) + # [B, 1, F, T] + x = th.unsqueeze(x, 1).permute(0, 1, 3, 2) + z = F.pad(x, [0, 0, self.padding_freq, 0]) + z = self.conv2(z) + x + x = z.permute(0, 3, 2, 1).squeeze(-1) + x = self.act(x) + x = self.norm(x) + x = self.project(x) + x = th.unsqueeze(x, 1).permute(0, 3, 2, 1) + # [B, F, T+lorder-1, 1] + y = F.pad(x, [0, 0, self.padding_left, 0]) + out = self.conv1(y) + if self.skip_connect: + out = out + x + out = out.permute(0, 3, 2, 1).squeeze() + + return input + out + + +class CNNFsmn(nn.Module): + ''' use cnn to reduce parameters + ''' + + def __init__(self, + input_dim, + output_dim, + lorder=None, + hidden_size=None, + dilation=1, + layer_norm=False, + dropout=0, + skip_connect=True): + super(CNNFsmn, self).__init__() + + self.input_dim = input_dim + self.output_dim = output_dim + self.skip_connect = skip_connect + + if lorder is None: + return + + self.lorder = lorder + self.hidden_size = hidden_size + + self.linear = nn.Linear(input_dim, hidden_size) + self.act = nn.ReLU() + kernel_size = (3, 8) + stride = (1, 4) + self.conv = nn.Sequential( + nn.ConstantPad2d((stride[1], 0, kernel_size[0] - 1, 0), 0), + nn.Conv2d(1, stride[1], kernel_size=kernel_size, stride=stride)) + + self.dconv = nn.Conv2d( + output_dim, + output_dim, [lorder, 1], + dilation=[dilation, 1], + groups=output_dim, + bias=False) + self.padding_left = dilation * (lorder - 1) + + def forward(self, input): + return self.compute2(input) + + def compute1(self, input): + ''' linear-relu(norm)-conv2d-relu?-dconv + ''' + # [B, T, F] + x = self.linear(input) + x = self.act(x) + x = th.unsqueeze(x, 1) + x = self.conv(x) + # [B, C, T, F] -> [B, 1, T, F] + b, c, t, f = x.shape + x = x.view([b, 1, t, -1]) + x = x.permute(0, 3, 2, 1) + # [B, F, T+lorder-1, 1] + y = F.pad(x, [0, 0, self.padding_left, 0]) + out = self.dconv(y) + if self.skip_connect: + out = out + x + out = out.permute(0, 3, 2, 1).squeeze() + return input + out + + def compute2(self, input): + ''' conv2d-relu-linear-relu?-dconv + ''' + x = th.unsqueeze(input, 1) + x = self.conv(x) + x = self.act(x) + # [B, C, T, F] -> [B, T, F] + b, c, t, f = x.shape + x = x.view([b, t, -1]) + x = self.linear(x) + x = th.unsqueeze(x, 1).permute(0, 3, 2, 1) + y = F.pad(x, [0, 0, self.padding_left, 0]) + out = self.dconv(y) + if self.skip_connect: + out = out + x + out = out.permute(0, 3, 2, 1).squeeze() + return input + out + + +class UniDeepFsmn(LayerBase): + + def __init__(self, + input_dim, + output_dim, + lorder=None, + hidden_size=None, + dilation=1, + layer_norm=False, + dropout=0, + skip_connect=True): + super(UniDeepFsmn, self).__init__() + + self.input_dim = input_dim + self.output_dim = output_dim + self.skip_connect = skip_connect + + if lorder is None: + return + + self.lorder = lorder + self.hidden_size = hidden_size + + self.linear = nn.Linear(input_dim, hidden_size) + self.norm = nn.Identity() + if layer_norm: + self.norm = nn.LayerNorm(input_dim) + self.act = nn.ReLU() + self.project = nn.Linear(hidden_size, output_dim, bias=False) + + self.conv1 = nn.Conv2d( + output_dim, + output_dim, [lorder, 1], + dilation=[dilation, 1], + groups=output_dim, + bias=False) + self.padding_left = dilation * (lorder - 1) + + def forward(self, input): + return self.compute1(input) + + def compute1(self, input): + ''' linear-relu(norm)-linear-dconv + ''' + # [B, T, F] + x = self.linear(input) + x = self.act(x) + x = self.norm(x) + x = self.project(x) + x = th.unsqueeze(x, 1).permute(0, 3, 2, 1) + # [B, F, T+lorder-1, 
1] + y = F.pad(x, [0, 0, self.padding_left, 0]) + out = self.conv1(y) + if self.skip_connect: + out = out + x + out = out.permute(0, 3, 2, 1).squeeze() + + return input + out + + def compute2(self, input): + ''' linear-dconv-linear-relu(norm) + ''' + x = self.project(input) + x = th.unsqueeze(x, 1).permute(0, 3, 2, 1) + y = F.pad(x, [0, 0, self.padding_left, 0]) + out = self.conv1(y) + if self.skip_connect: + out = out + x + out = out.permute(0, 3, 2, 1).squeeze() + x = self.linear(out) + x = self.act(x) + x = self.norm(x) + + return input + x + + def compute3(self, input): + ''' dconv-linear-relu(norm)-linear + ''' + x = th.unsqueeze(input, 1).permute(0, 3, 2, 1) + y = F.pad(x, [0, 0, self.padding_left, 0]) + out = self.conv1(y) + if self.skip_connect: + out = out + x + out = out.permute(0, 3, 2, 1).squeeze() + x = self.linear(out) + x = self.act(x) + x = self.norm(x) + x = self.project(x) + + return input + x + + def to_kaldi_nnet(self): + re_str = '' + re_str += ' %d %d\n' \ + % (self.output_dim, self.input_dim) + re_str += ' %d %d %d %d 0\n' \ + % (1, self.hidden_size, self.lorder, 1) + lfiters = self.state_dict()['conv1.weight'] + x = np.flipud(lfiters.squeeze().numpy().T) + re_str += to_kaldi_matrix(x) + proj_weights = self.state_dict()['project.weight'] + x = proj_weights.squeeze().numpy() + re_str += to_kaldi_matrix(x) + linear_weights = self.state_dict()['linear.weight'] + x = linear_weights.squeeze().numpy() + re_str += to_kaldi_matrix(x) + linear_bias = self.state_dict()['linear.bias'] + x = linear_bias.squeeze().numpy() + re_str += to_kaldi_matrix(x) + return re_str + + def to_raw_nnet(self, fid): + lfiters = self.state_dict()['conv1.weight'] + x = np.flipud(lfiters.squeeze().numpy().T) + x.tofile(fid) + + proj_weights = self.state_dict()['project.weight'] + x = proj_weights.squeeze().numpy() + x.tofile(fid) + + linear_weights = self.state_dict()['linear.weight'] + x = linear_weights.squeeze().numpy() + x.tofile(fid) + + linear_bias = self.state_dict()['linear.bias'] + x = linear_bias.squeeze().numpy() + x.tofile(fid) + + def load_kaldi_nnet(self, instr): + output = expect_token_number( + instr, + '', + ) + if output is None: + raise Exception('UniDeepFsmn format error for ') + instr, lr = output + + output = expect_token_number( + instr, + '', + ) + if output is None: + raise Exception('UniDeepFsmn format error for ') + instr, hiddensize = output + self.hidden_size = int(hiddensize) + + output = expect_token_number( + instr, + '', + ) + if output is None: + raise Exception('UniDeepFsmn format error for ') + instr, lorder = output + self.lorder = int(lorder) + + output = expect_token_number( + instr, + '', + ) + if output is None: + raise Exception('UniDeepFsmn format error for ') + instr, lstride = output + self.lstride = lstride + + output = expect_token_number( + instr, + '', + ) + if output is None: + raise Exception('UniDeepFsmn format error for ') + + output = expect_kaldi_matrix(instr) + if output is None: + raise Exception('UniDeepFsmn format error for parsing matrix') + instr, mat = output + mat1 = np.fliplr(mat.T).copy() + + self.conv1 = nn.Conv2d( + self.output_dim, + self.output_dim, [self.lorder, 1], [1, 1], + groups=self.output_dim, + bias=False) + + mat_th = th.from_numpy(mat1).type(th.FloatTensor) + mat_th = mat_th.unsqueeze(1) + mat_th = mat_th.unsqueeze(3) + self.conv1.weight = th.nn.Parameter(mat_th) + + output = expect_kaldi_matrix(instr) + if output is None: + raise Exception('UniDeepFsmn format error for parsing matrix') + instr, mat = output + + 
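+        # The remaining Kaldi matrices are consumed in order: projection weights
+        # (output_dim x hidden_size, no bias), then the hidden linear weights and
+        # finally its bias vector; the layers are re-created with the sizes parsed above.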
self.project = nn.Linear(self.hidden_size, self.output_dim, bias=False) + self.linear = nn.Linear(self.input_dim, self.hidden_size) + + self.project.weight = th.nn.Parameter( + th.from_numpy(mat).type(th.FloatTensor)) + + output = expect_kaldi_matrix(instr) + if output is None: + raise Exception('UniDeepFsmn format error for parsing matrix') + instr, mat = output + self.linear.weight = th.nn.Parameter( + th.from_numpy(mat).type(th.FloatTensor)) + + output = expect_kaldi_matrix(instr) + if output is None: + raise Exception('UniDeepFsmn format error for parsing matrix') + instr, mat = output + mat = np.squeeze(mat) + self.linear.bias = th.nn.Parameter( + th.from_numpy(mat).type(th.FloatTensor)) + + return instr diff --git a/modelscope/models/audio/network/__init__.py b/modelscope/models/audio/network/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/audio/network/loss.py b/modelscope/models/audio/network/loss.py new file mode 100644 index 00000000..743661b3 --- /dev/null +++ b/modelscope/models/audio/network/loss.py @@ -0,0 +1,394 @@ +import torch +import torch.nn.functional as F + +from .modulation_loss import (GaborSTRFConv, MelScale, + ModulationDomainLossModule) + +EPS = 1e-8 + + +def compute_mask(mixed_spec, clean_spec, mask_type='psmiam', clip=1): + ''' + stft: (batch, ..., 2) or complex(batch, ...) + y = x + n + ''' + if torch.is_complex(mixed_spec): + yr, yi = mixed_spec.real, mixed_spec.imag + else: + yr, yi = mixed_spec[..., 0], mixed_spec[..., 1] + if torch.is_complex(clean_spec): + xr, xi = clean_spec.real, clean_spec.imag + else: + xr, xi = clean_spec[..., 0], clean_spec[..., 1] + + if mask_type == 'iam': + ymag = torch.sqrt(yr**2 + yi**2) + xmag = torch.sqrt(xr**2 + xi**2) + iam = xmag / (ymag + EPS) + return torch.clamp(iam, 0, 1) + + elif mask_type == 'psm': + ypow = yr**2 + yi**2 + psm = (xr * yr + xi * yi) / (ypow + EPS) + return torch.clamp(psm, 0, 1) + + elif mask_type == 'psmiam': + ypow = yr**2 + yi**2 + psm = (xr * yr + xi * yi) / (ypow + EPS) + ymag = torch.sqrt(yr**2 + yi**2) + xmag = torch.sqrt(xr**2 + xi**2) + iam = xmag / (ymag + EPS) + psmiam = psm * iam + return torch.clamp(psmiam, 0, 1) + + elif mask_type == 'crm': + ypow = yr**2 + yi**2 + mr = (xr * yr + xi * yi) / (ypow + EPS) + mi = (xi * yr - xr * yi) / (ypow + EPS) + mr = torch.clamp(mr, -clip, clip) + mi = torch.clamp(mi, -clip, clip) + return mr, mi + + +def energy_vad(spec, + thdhigh=320 * 600 * 600 * 2, + thdlow=320 * 300 * 300 * 2, + int16=True): + ''' + energy based vad should be accurate enough + spec: (batch, bins, frames, 2) + returns (batch, frames) + ''' + energy = torch.sum(spec[..., 0]**2 + spec[..., 1]**2, dim=1) + vad = energy > thdhigh + idx = torch.logical_and(vad == 0, energy > thdlow) + vad[idx] = 0.5 + return vad + + +def modulation_loss_init(n_fft): + gabor_strf_parameters = torch.load( + './network/gabor_strf_parameters.pt')['state_dict'] + gabor_modulation_kernels = GaborSTRFConv(supn=30, supk=30, nkern=60) + gabor_modulation_kernels.load_state_dict(gabor_strf_parameters) + + modulation_loss_module = ModulationDomainLossModule( + gabor_modulation_kernels.eval()) + for param in modulation_loss_module.parameters(): + param.requires_grad = False + + stft2mel = MelScale( + n_mels=80, sample_rate=16000, n_stft=n_fft // 2 + 1).cuda() + + return modulation_loss_module, stft2mel + + +def mask_loss_function( + loss_func='psm_loss', + loss_type='mse', # ['mse', 'mae', 'comb'] + mask_type='psmiam', + use_mod_loss=False, + use_wav2vec_loss=False, + 
n_fft=640, + hop_length=320, + EPS=1e-8, + weight=None): + if weight is not None: + print(f'Use loss weight: {weight}') + winlen = n_fft + window = torch.hamming_window(winlen, periodic=False) + + def stft(x, return_complex=False): + # returns [batch, bins, frames, 2] + return torch.stft( + x, + n_fft, + hop_length, + winlen, + window=window.to(x.device), + center=False, + return_complex=return_complex) + + def istft(x, slen): + return torch.istft( + x, + n_fft, + hop_length, + winlen, + window=window.to(x.device), + center=False, + length=slen) + + def mask_loss(targets, masks, nframes): + ''' [Batch, Time, Frequency] + ''' + with torch.no_grad(): + mask_for_loss = torch.ones_like(targets) + for idx, num in enumerate(nframes): + mask_for_loss[idx, num:, :] = 0 + masks = masks * mask_for_loss + targets = targets * mask_for_loss + + if weight is None: + alpha = 1 + else: # for aec ST + alpha = weight - targets + + if loss_type == 'mse': + loss = 0.5 * torch.sum(alpha * torch.pow(targets - masks, 2)) + elif loss_type == 'mae': + loss = torch.sum(alpha * torch.abs(targets - masks)) + else: # mse(mask), mae(mask) approx 1:2 + loss = 0.5 * torch.sum(alpha * torch.pow(targets - masks, 2) + + 0.1 * alpha * torch.abs(targets - masks)) + loss /= torch.sum(nframes) + return loss + + def spectrum_loss(targets, spec, nframes): + ''' [Batch, Time, Frequency, 2] + ''' + with torch.no_grad(): + mask_for_loss = torch.ones_like(targets[..., 0]) + for idx, num in enumerate(nframes): + mask_for_loss[idx, num:, :] = 0 + xr = spec[..., 0] * mask_for_loss + xi = spec[..., 1] * mask_for_loss + yr = targets[..., 0] * mask_for_loss + yi = targets[..., 1] * mask_for_loss + xmag = torch.sqrt(spec[..., 0]**2 + spec[..., 1]**2) * mask_for_loss + ymag = torch.sqrt(targets[..., 0]**2 + + targets[..., 1]**2) * mask_for_loss + + loss1 = torch.sum(torch.pow(xr - yr, 2) + torch.pow(xi - yi, 2)) + loss2 = torch.sum(torch.pow(xmag - ymag, 2)) + + loss = (loss1 + loss2) / torch.sum(nframes) + return loss + + def sa_loss_dlen(mixed, clean, masks, nframes): + yspec = stft(mixed).permute([0, 2, 1, 3]) / 32768 + xspec = stft(clean).permute([0, 2, 1, 3]) / 32768 + with torch.no_grad(): + mask_for_loss = torch.ones_like(xspec[..., 0]) + for idx, num in enumerate(nframes): + mask_for_loss[idx, num:, :] = 0 + emag = ((yspec[..., 0]**2 + yspec[..., 1]**2)**0.15) * (masks**0.3) + xmag = (xspec[..., 0]**2 + xspec[..., 1]**2)**0.15 + emag = emag * mask_for_loss + xmag = xmag * mask_for_loss + + loss = torch.sum(torch.pow(emag - xmag, 2)) / torch.sum(nframes) + return loss + + def psm_vad_loss_dlen(mixed, clean, masks, nframes, subtask=None): + mixed_spec = stft(mixed) + clean_spec = stft(clean) + targets = compute_mask(mixed_spec, clean_spec, mask_type) + # [B, T, F] + targets = targets.permute(0, 2, 1) + + loss = mask_loss(targets, masks, nframes) + + if subtask is not None: + vadtargets = energy_vad(clean_spec) + with torch.no_grad(): + mask_for_loss = torch.ones_like(targets[:, :, 0]) + for idx, num in enumerate(nframes): + mask_for_loss[idx, num:] = 0 + subtask = subtask[:, :, 0] * mask_for_loss + vadtargets = vadtargets * mask_for_loss + + loss_vad = F.binary_cross_entropy(subtask, vadtargets) + return loss + loss_vad + return loss + + def modulation_loss(mixed, clean, masks, nframes, subtask=None): + mixed_spec = stft(mixed, True) + clean_spec = stft(clean, True) + enhanced_mag = torch.abs(mixed_spec) + clean_mag = torch.abs(clean_spec) + with torch.no_grad(): + mask_for_loss = torch.ones_like(clean_mag) + for idx, num in 
enumerate(nframes): + mask_for_loss[idx, :, num:] = 0 + clean_mag = clean_mag * mask_for_loss + enhanced_mag = enhanced_mag * mask_for_loss * masks.permute([0, 2, 1]) + + # Covert to log-mel representation + # (B,T,#mel_channels) + clean_log_mel = torch.log( + torch.transpose(stft2mel(clean_mag**2), 2, 1) + 1e-8) + enhanced_log_mel = torch.log( + torch.transpose(stft2mel(enhanced_mag**2), 2, 1) + 1e-8) + + alpha = compute_mask(mixed_spec, clean_spec, mask_type) + alpha = alpha.permute(0, 2, 1) + loss = 0.05 * modulation_loss_module(enhanced_log_mel, clean_log_mel, + alpha) + loss2 = psm_vad_loss_dlen(mixed, clean, masks, nframes, subtask) + # print(loss.item(), loss2.item()) #approx 1:4 + loss = loss + loss2 + return loss + + def wav2vec_loss(mixed, clean, masks, nframes, subtask=None): + mixed /= 32768 + clean /= 32768 + mixed_spec = stft(mixed) + with torch.no_grad(): + mask_for_loss = torch.ones_like(masks) + for idx, num in enumerate(nframes): + mask_for_loss[idx, num:, :] = 0 + masks_est = masks * mask_for_loss + + estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3) + est_clean = istft(estimate, clean.shape[1]) + loss = wav2vec_loss_module(est_clean, clean) + return loss + + def sisdr_loss_dlen(mixed, + clean, + masks, + nframes, + subtask=None, + zero_mean=True): + mixed_spec = stft(mixed) + with torch.no_grad(): + mask_for_loss = torch.ones_like(masks) + for idx, num in enumerate(nframes): + mask_for_loss[idx, num:, :] = 0 + masks_est = masks * mask_for_loss + + estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3) + est_clean = istft(estimate, clean.shape[1]) + flen = min(clean.shape[1], est_clean.shape[1]) + clean = clean[:, :flen] + est_clean = est_clean[:, :flen] + + # follow asteroid/losses/sdr.py + if zero_mean: + clean = clean - torch.mean(clean, dim=1, keepdim=True) + est_clean = est_clean - torch.mean(est_clean, dim=1, keepdim=True) + + dot = torch.sum(est_clean * clean, dim=1, keepdim=True) + s_clean_energy = torch.sum(clean**2, dim=1, keepdim=True) + EPS + scaled_clean = dot * clean / s_clean_energy + e_noise = est_clean - scaled_clean + + # [batch] + sisdr = torch.sum( + scaled_clean**2, dim=1) / ( + torch.sum(e_noise**2, dim=1) + EPS) + sisdr = -10 * torch.log10(sisdr + EPS) + loss = sisdr.mean() + return loss + + def sisdr_freq_loss_dlen(mixed, clean, masks, nframes, subtask=None): + mixed_spec = stft(mixed) + clean_spec = stft(clean) + with torch.no_grad(): + mask_for_loss = torch.ones_like(masks) + for idx, num in enumerate(nframes): + mask_for_loss[idx, num:, :] = 0 + masks_est = masks * mask_for_loss + + estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3) + + dot_real = estimate[..., 0] * clean_spec[..., 0] + \ + estimate[..., 1] * clean_spec[..., 1] + dot_imag = estimate[..., 0] * clean_spec[..., 1] - \ + estimate[..., 1] * clean_spec[..., 0] + dot = torch.cat([dot_real.unsqueeze(3), dot_imag.unsqueeze(3)], dim=-1) + s_clean_energy = clean_spec[..., 0] ** 2 + \ + clean_spec[..., 1] ** 2 + EPS + scaled_clean = dot * clean_spec / s_clean_energy.unsqueeze(3) + e_noise = estimate - scaled_clean + + # [batch] + scaled_clean_energy = torch.sum( + scaled_clean[..., 0]**2 + scaled_clean[..., 1]**2, dim=1) + e_noise_energy = torch.sum( + e_noise[..., 0]**2 + e_noise[..., 1]**2, dim=1) + sisdr = torch.sum( + scaled_clean_energy, dim=1) / ( + torch.sum(e_noise_energy, dim=1) + EPS) + sisdr = -10 * torch.log10(sisdr + EPS) + loss = sisdr.mean() + return loss + + def crm_loss_dlen(mixed, clean, masks, nframes, subtask=None): + mixed_spec 
= stft(mixed).permute([0, 2, 1, 3]) + clean_spec = stft(clean).permute([0, 2, 1, 3]) + mixed_spec = mixed_spec / 32768 + clean_spec = clean_spec / 32768 + tgt_mr, tgt_mi = compute_mask(mixed_spec, clean_spec, mask_type='crm') + + D = int(masks.shape[2] / 2) + with torch.no_grad(): + mask_for_loss = torch.ones_like(clean_spec[..., 0]) + for idx, num in enumerate(nframes): + mask_for_loss[idx, num:, :] = 0 + mr = masks[..., :D] * mask_for_loss + mi = masks[..., D:] * mask_for_loss + tgt_mr = tgt_mr * mask_for_loss + tgt_mi = tgt_mi * mask_for_loss + + if weight is None: + alpha = 1 + else: + alpha = weight - tgt_mr + # signal approximation + yr = mixed_spec[..., 0] + yi = mixed_spec[..., 1] + loss1 = torch.sum(alpha * torch.pow((mr * yr - mi * yi) - clean_spec[..., 0], 2)) \ + + torch.sum(alpha * torch.pow((mr * yi + mi * yr) - clean_spec[..., 1], 2)) + # mask approximation + loss2 = torch.sum(alpha * torch.pow(mr - tgt_mr, 2)) \ + + torch.sum(alpha * torch.pow(mi - tgt_mi, 2)) + loss = 0.5 * (loss1 + loss2) / torch.sum(nframes) + return loss + + def crm_miso_loss_dlen(mixed, clean, masks, nframes): + return crm_loss_dlen(mixed[..., 0], clean[..., 0], masks, nframes) + + def mimo_loss_dlen(mixed, clean, masks, nframes): + chs = mixed.shape[-1] + D = masks.shape[2] // chs + loss = psm_vad_loss_dlen(mixed[..., 0], clean[..., 0], masks[..., :D], + nframes) + for ch in range(1, chs): + loss1 = psm_vad_loss_dlen(mixed[..., ch], clean[..., ch], + masks[..., ch * D:ch * D + D], nframes) + loss = loss + loss1 + return loss / chs + + def spec_loss_dlen(mixed, clean, spec, nframes): + clean_spec = stft(clean).permute([0, 2, 1, 3]) + clean_spec = clean_spec / 32768 + + D = spec.shape[2] // 2 + spec_est = torch.cat([spec[..., :D, None], spec[..., D:, None]], + dim=-1) + loss = spectrum_loss(clean_spec, spec_est, nframes) + return loss + + if loss_func == 'psm_vad_loss_dlen': + return psm_vad_loss_dlen + elif loss_func == 'sisdr_loss_dlen': + return sisdr_loss_dlen + elif loss_func == 'sisdr_freq_loss_dlen': + return sisdr_freq_loss_dlen + elif loss_func == 'crm_loss_dlen': + return crm_loss_dlen + elif loss_func == 'modulation_loss': + return modulation_loss + elif loss_func == 'wav2vec_loss': + return wav2vec_loss + elif loss_func == 'mimo_loss_dlen': + return mimo_loss_dlen + elif loss_func == 'spec_loss_dlen': + return spec_loss_dlen + elif loss_func == 'sa_loss_dlen': + return sa_loss_dlen + else: + print('error loss func') + return None diff --git a/modelscope/models/audio/network/modulation_loss.py b/modelscope/models/audio/network/modulation_loss.py new file mode 100644 index 00000000..a45ddead --- /dev/null +++ b/modelscope/models/audio/network/modulation_loss.py @@ -0,0 +1,248 @@ +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchaudio.transforms import MelScale + + +class ModulationDomainLossModule(torch.nn.Module): + """Modulation-domain loss function developed in [1] for supervised speech enhancement + + In our paper, we used the gabor-based STRF kernels as the modulation kernels and used the log-mel spectrogram + as the input spectrogram representation. 
+ Specific parameter details are in the paper and in the example below + + Parameters + ---------- + modulation_kernels: nn.Module + Differentiable module that transforms a spectrogram representation to the modulation domain + + modulation_domain = modulation_kernels(input_tf_representation) + Input Spectrogram representation (B, T, F) ---> |(M) modulation_kernels|--->Modulation Domain(B, M, T', F') + + norm: boolean + Normalizes the modulation domain representation to be 0 mean across time + + [1] T. Vuong, Y. Xia, and R. M. Stern, “A modulation-domain lossfor neural-network-based real-time + speech enhancement” + Accepted ICASSP 2021, https://arxiv.org/abs/2102.07330 + + + """ + + def __init__(self, modulation_kernels, norm=True): + super(ModulationDomainLossModule, self).__init__() + + self.modulation_kernels = modulation_kernels + self.mse = nn.MSELoss(reduce=False) + self.norm = norm + + def forward(self, enhanced_spect, clean_spect, weight=None): + """Calculate modulation-domain loss + Args: + enhanced_spect (Tensor): spectrogram representation of enhanced signal (B, #frames, #freq_channels). + clean_spect (Tensor): spectrogram representation of clean ground-truth signal (B, #frames, #freq_channels). + Returns: + Tensor: Modulation-domain loss value. + """ + + clean_mod = self.modulation_kernels(clean_spect) + enhanced_mod = self.modulation_kernels(enhanced_spect) + + if self.norm: + mean_clean_mod = torch.mean(clean_mod, dim=2) + mean_enhanced_mod = torch.mean(enhanced_mod, dim=2) + + clean_mod = clean_mod - mean_clean_mod.unsqueeze(2) + enhanced_mod = enhanced_mod - mean_enhanced_mod.unsqueeze(2) + + if weight is None: + alpha = 1 + else: # TF-mask weight + alpha = 1 + torch.sum(weight, dim=-1, keepdim=True).unsqueeze(1) + mod_mse_loss = self.mse(enhanced_mod, clean_mod) * alpha + mod_mse_loss = torch.mean( + torch.sum(mod_mse_loss, dim=(1, 2, 3)) + / torch.sum(clean_mod**2, dim=(1, 2, 3))) + + return mod_mse_loss + + +class ModulationDomainNCCLossModule(torch.nn.Module): + """Modulation-domain loss function developed in [1] for supervised speech enhancement + + # Speech Intelligibility Prediction Using Spectro-Temporal Modulation Analysis - based off of this + + In our paper, we used the gabor-based STRF kernels as the modulation kernels and used the log-mel spectrogram + as the input spectrogram representation. + Specific parameter details are in the paper and in the example below + + Parameters + ---------- + modulation_kernels: nn.Module + Differentiable module that transforms a spectrogram representation to the modulation domain + + modulation_domain = modulation_kernels(input_tf_representation) + Input Spectrogram representation(B, T, F) --- (M) modulation_kernels---> Modulation Domain(B, M, T', F') + + [1] + + """ + + def __init__(self, modulation_kernels): + super(ModulationDomainNCCLossModule, self).__init__() + + self.modulation_kernels = modulation_kernels + self.mse = nn.MSELoss(reduce=False) + + def forward(self, enhanced_spect, clean_spect): + """Calculate modulation-domain loss + Args: + enhanced_spect (Tensor): spectrogram representation of enhanced signal (B, #frames, #freq_channels). + clean_spect (Tensor): spectrogram representation of clean ground-truth signal (B, #frames, #freq_channels). + Returns: + Tensor: Modulation-domain loss value. 
+ """ + + clean_mod = self.modulation_kernels(clean_spect) + enhanced_mod = self.modulation_kernels(enhanced_spect) + mean_clean_mod = torch.mean(clean_mod, dim=2) + mean_enhanced_mod = torch.mean(enhanced_mod, dim=2) + + normalized_clean = clean_mod - mean_clean_mod.unsqueeze(2) + normalized_enhanced = enhanced_mod - mean_enhanced_mod.unsqueeze(2) + + inner_product = torch.sum( + normalized_clean * normalized_enhanced, dim=2) + normalized_denom = (torch.sum( + normalized_clean * normalized_clean, dim=2))**.5 * (torch.sum( + normalized_enhanced * normalized_enhanced, dim=2))**.5 + + ncc = inner_product / normalized_denom + mod_mse_loss = torch.mean((ncc - 1.0)**2) + + return mod_mse_loss + + +class GaborSTRFConv(nn.Module): + """Gabor-STRF-based cross-correlation kernel.""" + + def __init__(self, + supn, + supk, + nkern, + rates=None, + scales=None, + norm_strf=True, + real_only=False): + """Instantiate a Gabor-based STRF convolution layer. + Parameters + ---------- + supn: int + Time support in number of frames. Also the window length. + supk: int + Frequency support in number of channels. Also the window length. + nkern: int + Number of kernels, each with a learnable rate and scale. + rates: list of float, None + Initial values for temporal modulation. + scales: list of float, None + Initial values for spectral modulation. + norm_strf: Boolean + Normalize STRF kernels to be unit length + real_only: Boolean + If True, nkern REAL gabor-STRF kernels + If False, nkern//2 REAL and nkern//2 IMAGINARY gabor-STRF kernels + """ + super(GaborSTRFConv, self).__init__() + self.numN = supn + self.numK = supk + self.numKern = nkern + self.real_only = real_only + self.norm_strf = norm_strf + + if not real_only: + nkern = nkern // 2 + + if supk % 2 == 0: # force odd number + supk += 1 + self.supk = torch.arange(supk, dtype=torch.float32) + if supn % 2 == 0: # force odd number + supn += 1 + self.supn = torch.arange(supn, dtype=self.supk.dtype) + self.padding = (supn // 2, supk // 2) + # Set up learnable parameters + # for param in (rates, scales): + # assert (not param) or len(param) == nkern + if not rates: + + rates = torch.rand(nkern) * math.pi / 2.0 + + if not scales: + + scales = (torch.rand(nkern) * 2.0 - 1.0) * math.pi / 2.0 + + self.rates_ = nn.Parameter(torch.Tensor(rates)) + self.scales_ = nn.Parameter(torch.Tensor(scales)) + + def strfs(self): + """Make STRFs using the current parameters.""" + + if self.supn.device != self.rates_.device: # for first run + self.supn = self.supn.to(self.rates_.device) + self.supk = self.supk.to(self.rates_.device) + n0, k0 = self.padding + + nwind = .5 - .5 * \ + torch.cos(2 * math.pi * (self.supn + 1) / (len(self.supn) + 1)) + kwind = .5 - .5 * \ + torch.cos(2 * math.pi * (self.supk + 1) / (len(self.supk) + 1)) + + new_wind = torch.matmul((nwind).unsqueeze(-1), (kwind).unsqueeze(0)) + + n_n_0 = self.supn - n0 + k_k_0 = self.supk - k0 + n_mult = torch.matmul( + n_n_0.unsqueeze(1), + torch.ones((1, len(self.supk))).type(torch.FloatTensor).to( + self.rates_.device)) + k_mult = torch.matmul( + torch.ones((len(self.supn), + 1)).type(torch.FloatTensor).to(self.rates_.device), + k_k_0.unsqueeze(0)) + + inside = self.rates_.unsqueeze(1).unsqueeze( + 1) * n_mult + self.scales_.unsqueeze(1).unsqueeze(1) * k_mult + real_strf = torch.cos(inside) * new_wind.unsqueeze(0) + + if self.real_only: + final_strf = real_strf + + else: + imag_strf = torch.sin(inside) * new_wind.unsqueeze(0) + final_strf = torch.cat([real_strf, imag_strf], dim=0) + + if self.norm_strf: + final_strf = 
final_strf / (torch.sum( + final_strf**2, dim=(1, 2)).unsqueeze(1).unsqueeze(2))**.5 + + return final_strf + + def forward(self, sigspec): + """Forward pass a batch of (real) spectra [Batch x Time x Frequency].""" + if len(sigspec.shape) == 2: # expand batch dimension if single eg + sigspec = sigspec.unsqueeze(0) + strfs = self.strfs().unsqueeze(1).type_as(sigspec) + out = F.conv2d(sigspec.unsqueeze(1), strfs, padding=self.padding) + return out + + def __repr__(self): + """Gabor filter""" + report = """ + +++++ Gabor Filter Kernels [{}], supn[{}], supk[{}] real only [{}] norm strf [{}] +++++ + + """.format(self.numKern, self.numN, self.numK, self.real_only, + self.norm_strf) + + return report diff --git a/modelscope/models/audio/network/se_net.py b/modelscope/models/audio/network/se_net.py new file mode 100644 index 00000000..54808043 --- /dev/null +++ b/modelscope/models/audio/network/se_net.py @@ -0,0 +1,483 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..layers.activations import RectifiedLinear, Sigmoid +from ..layers.affine_transform import AffineTransform +from ..layers.deep_fsmn import DeepFsmn +from ..layers.uni_deep_fsmn import Conv2d, UniDeepFsmn + + +class MaskNet(nn.Module): + + def __init__(self, + indim, + outdim, + layers=9, + hidden_dim=128, + hidden_dim2=None, + lorder=20, + rorder=0, + dilation=1, + layer_norm=False, + dropout=0, + crm=False, + vad=False, + linearout=False): + super(MaskNet, self).__init__() + + self.linear1 = AffineTransform(indim, hidden_dim) + self.relu = RectifiedLinear(hidden_dim, hidden_dim) + if hidden_dim2 is None: + hidden_dim2 = hidden_dim + + if rorder == 0: + repeats = [ + UniDeepFsmn( + hidden_dim, + hidden_dim, + lorder, + hidden_dim2, + dilation=dilation, + layer_norm=layer_norm, + dropout=dropout) for i in range(layers) + ] + else: + repeats = [ + DeepFsmn( + hidden_dim, + hidden_dim, + lorder, + rorder, + hidden_dim2, + layer_norm=layer_norm, + dropout=dropout) for i in range(layers) + ] + self.deepfsmn = nn.Sequential(*repeats) + + self.linear2 = AffineTransform(hidden_dim, outdim) + + self.crm = crm + if self.crm: + self.sig = nn.Tanh() + else: + self.sig = Sigmoid(outdim, outdim) + + self.vad = vad + if self.vad: + self.linear3 = AffineTransform(hidden_dim, 1) + + self.layers = layers + self.linearout = linearout + if self.linearout and self.vad: + print('Warning: not supported nnet') + + def forward(self, feat, ctl=None): + x1 = self.linear1(feat) + x2 = self.relu(x1) + if ctl is not None: + ctl = min(ctl, self.layers - 1) + for i in range(ctl): + x2 = self.deepfsmn[i](x2) + mask = self.sig(self.linear2(x2)) + if self.vad: + vad = torch.sigmoid(self.linear3(x2)) + return mask, vad + else: + return mask + x3 = self.deepfsmn(x2) + if self.linearout: + return self.linear2(x3) + mask = self.sig(self.linear2(x3)) + if self.vad: + vad = torch.sigmoid(self.linear3(x3)) + return mask, vad + else: + return mask + + def to_kaldi_nnet(self): + re_str = '' + re_str += '\n' + re_str += self.linear1.to_kaldi_nnet() + re_str += self.relu.to_kaldi_nnet() + for dfsmn in self.deepfsmn: + re_str += dfsmn.to_kaldi_nnet() + re_str += self.linear2.to_kaldi_nnet() + re_str += self.sig.to_kaldi_nnet() + re_str += '\n' + + return re_str + + def to_raw_nnet(self, fid): + self.linear1.to_raw_nnet(fid) + for dfsmn in self.deepfsmn: + dfsmn.to_raw_nnet(fid) + self.linear2.to_raw_nnet(fid) + + +class StageNet(nn.Module): + + def __init__(self, + indim, + outdim, + layers=9, + layers2=6, + hidden_dim=128, + lorder=20, + rorder=0, 
+ layer_norm=False, + dropout=0, + crm=False, + vad=False, + linearout=False): + super(StageNet, self).__init__() + + self.stage1 = nn.ModuleList() + self.stage2 = nn.ModuleList() + layer = nn.Sequential(nn.Linear(indim, hidden_dim), nn.ReLU()) + self.stage1.append(layer) + for i in range(layers): + layer = UniDeepFsmn( + hidden_dim, + hidden_dim, + lorder, + hidden_dim, + layer_norm=layer_norm, + dropout=dropout) + self.stage1.append(layer) + layer = nn.Sequential(nn.Linear(hidden_dim, 321), nn.Sigmoid()) + self.stage1.append(layer) + # stage2 + layer = nn.Sequential(nn.Linear(321 + indim, hidden_dim), nn.ReLU()) + self.stage2.append(layer) + for i in range(layers2): + layer = UniDeepFsmn( + hidden_dim, + hidden_dim, + lorder, + hidden_dim, + layer_norm=layer_norm, + dropout=dropout) + self.stage2.append(layer) + layer = nn.Sequential( + nn.Linear(hidden_dim, outdim), + nn.Sigmoid() if not crm else nn.Tanh()) + self.stage2.append(layer) + self.crm = crm + self.vad = vad + self.linearout = linearout + self.window = torch.hamming_window(640, periodic=False).cuda() + self.freezed = False + + def freeze(self): + if not self.freezed: + for param in self.stage1.parameters(): + param.requires_grad = False + self.freezed = True + print('freezed stage1') + + def forward(self, feat, mixture, ctl=None): + if ctl == 'off': + x = feat + for i in range(len(self.stage1)): + x = self.stage1[i](x) + return x + else: + self.freeze() + x = feat + for i in range(len(self.stage1)): + x = self.stage1[i](x) + + spec = torch.stft( + mixture / 32768, + 640, + 320, + 640, + self.window, + center=False, + return_complex=True) + spec = torch.view_as_real(spec).permute([0, 2, 1, 3]) + specmag = torch.sqrt(spec[..., 0]**2 + spec[..., 1]**2) + est = x * specmag + y = torch.cat([est, feat], dim=-1) + for i in range(len(self.stage2)): + y = self.stage2[i](y) + return y + + +class Unet(nn.Module): + + def __init__(self, + indim, + outdim, + layers=9, + dims=[256] * 4, + lorder=20, + rorder=0, + dilation=1, + layer_norm=False, + dropout=0, + crm=False, + vad=False, + linearout=False): + super(Unet, self).__init__() + + self.linear1 = AffineTransform(indim, dims[0]) + self.relu = RectifiedLinear(dims[0], dims[0]) + + self.encoder = nn.ModuleList() + self.decoder = nn.ModuleList() + for i in range(len(dims) - 1): + layer = nn.Sequential( + nn.Linear(dims[i], dims[i + 1]), nn.ReLU(), + nn.Linear(dims[i + 1], dims[i + 1], bias=False), + Conv2d( + dims[i + 1], + dims[i + 1], + lorder, + groups=dims[i + 1], + skip_connect=True)) + self.encoder.append(layer) + for i in range(len(dims) - 1, 0, -1): + layer = nn.Sequential( + nn.Linear(dims[i] * 2, dims[i - 1]), nn.ReLU(), + nn.Linear(dims[i - 1], dims[i - 1], bias=False), + Conv2d( + dims[i - 1], + dims[i - 1], + lorder, + groups=dims[i - 1], + skip_connect=True)) + self.decoder.append(layer) + self.tf = nn.ModuleList() + for i in range(layers - 2 * (len(dims) - 1)): + layer = nn.Sequential( + nn.Linear(dims[-1], dims[-1]), nn.ReLU(), + nn.Linear(dims[-1], dims[-1], bias=False), + Conv2d( + dims[-1], + dims[-1], + lorder, + groups=dims[-1], + skip_connect=True)) + self.tf.append(layer) + + self.linear2 = AffineTransform(dims[0], outdim) + self.crm = crm + self.act = nn.Tanh() if self.crm else nn.Sigmoid() + self.vad = False + self.layers = layers + self.linearout = linearout + + def forward(self, x, ctl=None): + x = self.linear1(x) + x = self.relu(x) + + encoder_out = [] + for i in range(len(self.encoder)): + x = self.encoder[i](x) + encoder_out.append(x) + for i in 
range(len(self.tf)): + x = self.tf[i](x) + for i in range(len(self.decoder)): + x = torch.cat([x, encoder_out[-1 - i]], dim=-1) + x = self.decoder[i](x) + + x = self.linear2(x) + if self.linearout: + return x + return self.act(x) + + +class BranchNet(nn.Module): + + def __init__(self, + indim, + outdim, + layers=9, + hidden_dim=256, + lorder=20, + rorder=0, + dilation=1, + layer_norm=False, + dropout=0, + crm=False, + vad=False, + linearout=False): + super(BranchNet, self).__init__() + + self.linear1 = AffineTransform(indim, hidden_dim) + self.relu = RectifiedLinear(hidden_dim, hidden_dim) + + self.convs = nn.ModuleList() + self.deepfsmn = nn.ModuleList() + self.FREQ = nn.ModuleList() + self.TIME = nn.ModuleList() + self.br1 = nn.ModuleList() + self.br2 = nn.ModuleList() + for i in range(layers): + ''' + layer = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim, bias=False), + Conv2d(hidden_dim, hidden_dim, lorder, + groups=hidden_dim, skip_connect=True) + ) + self.deepfsmn.append(layer) + ''' + layer = nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.ReLU()) + self.FREQ.append(layer) + ''' + layer = nn.GRU(hidden_dim, hidden_dim, + batch_first=True, + bidirectional=False) + self.TIME.append(layer) + + layer = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim//2, bias=False), + Conv2d(hidden_dim//2, hidden_dim//2, lorder, + groups=hidden_dim//2, skip_connect=True) + ) + self.br1.append(layer) + layer = nn.GRU(hidden_dim, hidden_dim//2, + batch_first=True, + bidirectional=False) + self.br2.append(layer) + ''' + + self.linear2 = AffineTransform(hidden_dim, outdim) + self.crm = crm + self.act = nn.Tanh() if self.crm else nn.Sigmoid() + self.vad = False + self.layers = layers + self.linearout = linearout + + def forward(self, x, ctl=None): + return self.forward_branch(x) + + def forward_sepconv(self, x): + x = torch.unsqueeze(x, 1) + for i in range(len(self.convs)): + x = self.convs[i](x) + x = F.relu(x) + B, C, H, W = x.shape + x = x.permute(0, 2, 1, 3) + x = torch.reshape(x, [B, H, C * W]) + x = self.linear1(x) + x = self.relu(x) + for i in range(self.layers): + x = self.deepfsmn[i](x) + x + x = self.linear2(x) + return self.act(x) + + def forward_branch(self, x): + x = self.linear1(x) + x = self.relu(x) + for i in range(self.layers): + z = self.FREQ[i](x) + x = z + x + x = self.linear2(x) + if self.linearout: + return x + return self.act(x) + + +class TACNet(nn.Module): + ''' transform average concatenate for ad hoc dr + ''' + + def __init__(self, + indim, + outdim, + layers=9, + hidden_dim=128, + lorder=20, + rorder=0, + crm=False, + vad=False, + linearout=False): + super(TACNet, self).__init__() + + self.linear1 = AffineTransform(indim, hidden_dim) + self.relu = RectifiedLinear(hidden_dim, hidden_dim) + + if rorder == 0: + repeats = [ + UniDeepFsmn(hidden_dim, hidden_dim, lorder, hidden_dim) + for i in range(layers) + ] + else: + repeats = [ + DeepFsmn(hidden_dim, hidden_dim, lorder, rorder, hidden_dim) + for i in range(layers) + ] + self.deepfsmn = nn.Sequential(*repeats) + + self.ch_transform = nn.ModuleList([]) + self.ch_average = nn.ModuleList([]) + self.ch_concat = nn.ModuleList([]) + for i in range(layers): + self.ch_transform.append( + nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.PReLU())) + self.ch_average.append( + nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.PReLU())) + self.ch_concat.append( + nn.Sequential( + nn.Linear(hidden_dim * 2, hidden_dim), nn.PReLU())) + + self.linear2 = AffineTransform(hidden_dim, 
outdim) + + self.crm = crm + if self.crm: + self.sig = nn.Tanh() + else: + self.sig = Sigmoid(outdim, outdim) + + self.vad = vad + if self.vad: + self.linear3 = AffineTransform(hidden_dim, 1) + + self.layers = layers + self.linearout = linearout + if self.linearout and self.vad: + print('Warning: not supported nnet') + + def forward(self, feat, ctl=None): + B, T, F = feat.shape + # assume 4ch + ch = 4 + zlist = [] + for c in range(ch): + z = self.linear1(feat[..., c * (F // 4):(c + 1) * (F // 4)]) + z = self.relu(z) + zlist.append(z) + for i in range(self.layers): + # forward + for c in range(ch): + zlist[c] = self.deepfsmn[i](zlist[c]) + + # transform + olist = [] + for c in range(ch): + z = self.ch_transform[i](zlist[c]) + olist.append(z) + # average + avg = 0 + for c in range(ch): + avg = avg + olist[c] + avg = avg / ch + avg = self.ch_average[i](avg) + # concate + for c in range(ch): + tac = torch.cat([olist[c], avg], dim=-1) + tac = self.ch_concat[i](tac) + zlist[c] = zlist[c] + tac + + for c in range(ch): + zlist[c] = self.sig(self.linear2(zlist[c])) + mask = torch.cat(zlist, dim=-1) + return mask + + def to_kaldi_nnet(self): + pass diff --git a/modelscope/models/audio/tts/__init__.py b/modelscope/models/audio/tts/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/audio/tts/am/__init__.py b/modelscope/models/audio/tts/am/__init__.py new file mode 100644 index 00000000..2ebbda1c --- /dev/null +++ b/modelscope/models/audio/tts/am/__init__.py @@ -0,0 +1 @@ +from .sambert_hifi_16k import * # noqa F403 diff --git a/modelscope/models/audio/tts/am/models/__init__.py b/modelscope/models/audio/tts/am/models/__init__.py new file mode 100755 index 00000000..9e198e7a --- /dev/null +++ b/modelscope/models/audio/tts/am/models/__init__.py @@ -0,0 +1,8 @@ +from .robutrans import RobuTrans + + +def create_model(name, hparams): + if name == 'robutrans': + return RobuTrans(hparams) + else: + raise Exception('Unknown model: ' + name) diff --git a/modelscope/models/audio/tts/am/models/compat.py b/modelscope/models/audio/tts/am/models/compat.py new file mode 100755 index 00000000..bb810841 --- /dev/null +++ b/modelscope/models/audio/tts/am/models/compat.py @@ -0,0 +1,82 @@ +"""Functions for compatibility with different TensorFlow versions.""" + +import tensorflow as tf + + +def is_tf2(): + """Returns ``True`` if running TensorFlow 2.0.""" + return tf.__version__.startswith('2') + + +def tf_supports(symbol): + """Returns ``True`` if TensorFlow defines :obj:`symbol`.""" + return _string_to_tf_symbol(symbol) is not None + + +def tf_any(*symbols): + """Returns the first supported symbol.""" + for symbol in symbols: + module = _string_to_tf_symbol(symbol) + if module is not None: + return module + return None + + +def tf_compat(v2=None, v1=None): # pylint: disable=invalid-name + """Returns the compatible symbol based on the current TensorFlow version. + + Args: + v2: The candidate v2 symbol name. + v1: The candidate v1 symbol name. + + Returns: + A TensorFlow symbol. + + Raises: + ValueError: if no symbol can be found. 
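+
+    Example:
+      Illustrative only (not part of the original documentation); it assumes
+      ``nn.relu`` is exposed by both the v1 and v2 namespaces, which holds for
+      current TensorFlow releases::
+
+        relu = tf_compat(v2='nn.relu', v1='nn.relu')
+        y = relu(tf.constant([-1.0, 2.0]))  # element-wise ReLU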
+ """ + candidates = [] + if v2 is not None: + candidates.append(v2) + if v1 is not None: + candidates.append(v1) + candidates.append('compat.v1.%s' % v1) + symbol = tf_any(*candidates) + if symbol is None: + raise ValueError('Failure to resolve the TensorFlow symbol') + return symbol + + +def name_from_variable_scope(name=''): + """Creates a name prefixed by the current variable scope.""" + var_scope = tf_compat(v1='get_variable_scope')().name + compat_name = '' + if name: + compat_name = '%s/' % name + if var_scope: + compat_name = '%s/%s' % (var_scope, compat_name) + return compat_name + + +def reuse(): + """Returns ``True`` if the current variable scope is marked for reuse.""" + return tf_compat(v1='get_variable_scope')().reuse + + +def _string_to_tf_symbol(symbol): + modules = symbol.split('.') + namespace = tf + for module in modules: + namespace = getattr(namespace, module, None) + if namespace is None: + return None + return namespace + + +# pylint: disable=invalid-name +gfile_copy = tf_compat(v2='io.gfile.copy', v1='gfile.Copy') +gfile_exists = tf_compat(v2='io.gfile.exists', v1='gfile.Exists') +gfile_open = tf_compat(v2='io.gfile.GFile', v1='gfile.GFile') +is_tensor = tf_compat(v2='is_tensor', v1='contrib.framework.is_tensor') +logging = tf_compat(v1='logging') +nest = tf_compat(v2='nest', v1='contrib.framework.nest') diff --git a/modelscope/models/audio/tts/am/models/fsmn.py b/modelscope/models/audio/tts/am/models/fsmn.py new file mode 100755 index 00000000..875c27f0 --- /dev/null +++ b/modelscope/models/audio/tts/am/models/fsmn.py @@ -0,0 +1,273 @@ +import tensorflow as tf + + +def build_sequence_mask(sequence_length, + maximum_length=None, + dtype=tf.float32): + """Builds the dot product mask. + + Args: + sequence_length: The sequence length. + maximum_length: Optional size of the returned time dimension. Otherwise + it is the maximum of :obj:`sequence_length`. + dtype: The type of the mask tensor. + + Returns: + A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape + ``[batch_size, max_length]``. + """ + mask = tf.sequence_mask( + sequence_length, maxlen=maximum_length, dtype=dtype) + + return mask + + +def norm(inputs): + """Layer normalizes :obj:`inputs`.""" + return tf.contrib.layers.layer_norm(inputs, begin_norm_axis=-1) + + +def pad_in_time(x, padding_shape): + """Helper function to pad a tensor in the time dimension and retain the static depth dimension. + + Agrs: + x: [Batch, Time, Frequency] + padding_length: padding size of constant value (0) before the time dimension + + return: + padded x + """ + + depth = x.get_shape().as_list()[-1] + x = tf.pad(x, [[0, 0], padding_shape, [0, 0]]) + x.set_shape((None, None, depth)) + + return x + + +def pad_in_time_right(x, padding_length): + """Helper function to pad a tensor in the time dimension and retain the static depth dimension. + + Agrs: + x: [Batch, Time, Frequency] + padding_length: padding size of constant value (0) before the time dimension + + return: + padded x + """ + depth = x.get_shape().as_list()[-1] + x = tf.pad(x, [[0, 0], [0, padding_length], [0, 0]]) + x.set_shape((None, None, depth)) + + return x + + +def feed_forward(x, ffn_dim, memory_units, mode, dropout=0.0): + """Implements the Transformer's "Feed Forward" layer. + + .. math:: + + ffn(x) = max(0, x*W_1 + b_1)*W_2 + + Args: + x: The input. + ffn_dim: The number of units of the nonlinear transformation. + memory_units: the number of units of linear transformation + mode: A ``tf.estimator.ModeKeys`` mode. 
+ dropout: The probability to drop units from the inner transformation. + + Returns: + The transformed input. + """ + inner = tf.layers.conv1d(x, ffn_dim, 1, activation=tf.nn.relu) + inner = tf.layers.dropout( + inner, rate=dropout, training=mode == tf.estimator.ModeKeys.TRAIN) + outer = tf.layers.conv1d(inner, memory_units, 1, use_bias=False) + + return outer + + +def drop_and_add(inputs, outputs, mode, dropout=0.0): + """Drops units in the outputs and adds the previous values. + + Args: + inputs: The input of the previous layer. + outputs: The output of the previous layer. + mode: A ``tf.estimator.ModeKeys`` mode. + dropout: The probability to drop units in :obj:`outputs`. + + Returns: + The residual and normalized output. + """ + outputs = tf.layers.dropout(outputs, rate=dropout, training=mode) + + input_dim = inputs.get_shape().as_list()[-1] + output_dim = outputs.get_shape().as_list()[-1] + + if input_dim == output_dim: + outputs += inputs + + return outputs + + +def MemoryBlock( + inputs, + filter_size, + mode, + mask=None, + dropout=0.0, +): + """ + Define the bidirectional memory block in FSMN + + Agrs: + inputs: The output of the previous layer. [Batch, Time, Frequency] + filter_size: memory block filter size + mode: Training or Evaluation + mask: A ``tf.Tensor`` applied to the memory block output + + return: + output: 3-D tensor ([Batch, Time, Frequency]) + """ + static_shape = inputs.get_shape().as_list() + depth = static_shape[-1] + inputs = tf.expand_dims(inputs, axis=1) # [Batch, 1, Time, Frequency] + depthwise_filter = tf.get_variable( + 'depth_conv_w', + shape=[1, filter_size, depth, 1], + initializer=tf.glorot_uniform_initializer(), + dtype=tf.float32) + memory = tf.nn.depthwise_conv2d( + input=inputs, + filter=depthwise_filter, + strides=[1, 1, 1, 1], + padding='SAME', + rate=[1, 1], + data_format='NHWC') + memory = memory + inputs + output = tf.layers.dropout(memory, rate=dropout, training=mode) + output = tf.reshape( + output, + [tf.shape(output)[0], tf.shape(output)[2], depth]) + if mask is not None: + output = output * tf.expand_dims(mask, -1) + + return output + + +def MemoryBlockV2( + inputs, + filter_size, + mode, + shift=0, + mask=None, + dropout=0.0, +): + """ + Define the bidirectional memory block in FSMN + + Agrs: + inputs: The output of the previous layer. 
[Batch, Time, Frequency] + filter_size: memory block filter size + mode: Training or Evaluation + shift: left padding, to control delay + mask: A ``tf.Tensor`` applied to the memory block output + + return: + output: 3-D tensor ([Batch, Time, Frequency]) + """ + if mask is not None: + inputs = inputs * tf.expand_dims(mask, -1) + + static_shape = inputs.get_shape().as_list() + depth = static_shape[-1] + # padding + left_padding = int(round((filter_size - 1) / 2)) + right_padding = int((filter_size - 1) / 2) + if shift > 0: + left_padding = left_padding + shift + right_padding = right_padding - shift + pad_inputs = pad_in_time(inputs, [left_padding, right_padding]) + pad_inputs = tf.expand_dims( + pad_inputs, axis=1) # [Batch, 1, Time, Frequency] + depthwise_filter = tf.get_variable( + 'depth_conv_w', + shape=[1, filter_size, depth, 1], + initializer=tf.glorot_uniform_initializer(), + dtype=tf.float32) + memory = tf.nn.depthwise_conv2d( + input=pad_inputs, + filter=depthwise_filter, + strides=[1, 1, 1, 1], + padding='VALID', + rate=[1, 1], + data_format='NHWC') + memory = tf.reshape( + memory, + [tf.shape(memory)[0], tf.shape(memory)[2], depth]) + memory = memory + inputs + output = tf.layers.dropout(memory, rate=dropout, training=mode) + if mask is not None: + output = output * tf.expand_dims(mask, -1) + + return output + + +def UniMemoryBlock( + inputs, + filter_size, + mode, + cache=None, + mask=None, + dropout=0.0, +): + """ + Define the unidirectional memory block in FSMN + + Agrs: + inputs: The output of the previous layer. [Batch, Time, Frequency] + filter_size: memory block filter size + cache: for streaming inference + mode: Training or Evaluation + mask: A ``tf.Tensor`` applied to the memory block output + dropout: dorpout factor + return: + output: 3-D tensor ([Batch, Time, Frequency]) + """ + if cache is not None: + static_shape = cache['queries'].get_shape().as_list() + depth = static_shape[-1] + queries = tf.slice(cache['queries'], [0, 1, 0], [ + tf.shape(cache['queries'])[0], + tf.shape(cache['queries'])[1] - 1, depth + ]) + queries = tf.concat([queries, inputs], axis=1) + cache['queries'] = queries + else: + padding_length = filter_size - 1 + queries = pad_in_time(inputs, [padding_length, 0]) + + queries = tf.expand_dims(queries, axis=1) # [Batch, 1, Time, Frequency] + static_shape = queries.get_shape().as_list() + depth = static_shape[-1] + depthwise_filter = tf.get_variable( + 'depth_conv_w', + shape=[1, filter_size, depth, 1], + initializer=tf.glorot_uniform_initializer(), + dtype=tf.float32) + memory = tf.nn.depthwise_conv2d( + input=queries, + filter=depthwise_filter, + strides=[1, 1, 1, 1], + padding='VALID', + rate=[1, 1], + data_format='NHWC') + memory = tf.reshape( + memory, + [tf.shape(memory)[0], tf.shape(memory)[2], depth]) + memory = memory + inputs + output = tf.layers.dropout(memory, rate=dropout, training=mode) + if mask is not None: + output = output * tf.expand_dims(mask, -1) + + return output diff --git a/modelscope/models/audio/tts/am/models/fsmn_encoder.py b/modelscope/models/audio/tts/am/models/fsmn_encoder.py new file mode 100755 index 00000000..2c650624 --- /dev/null +++ b/modelscope/models/audio/tts/am/models/fsmn_encoder.py @@ -0,0 +1,178 @@ +import tensorflow as tf + +from . import fsmn + + +class FsmnEncoder(): + """Encoder using Fsmn + """ + + def __init__(self, + filter_size, + fsmn_num_layers, + dnn_num_layers, + num_memory_units=512, + ffn_inner_dim=2048, + dropout=0.0, + position_encoder=None): + """Initializes the parameters of the encoder. 
+ + Args: + filter_size: the total order of memory block + fsmn_num_layers: The number of fsmn layers. + dnn_num_layers: The number of dnn layers + num_units: The number of memory units. + ffn_inner_dim: The number of units of the inner linear transformation + in the feed forward layer. + dropout: The probability to drop units from the outputs. + position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to + apply on inputs or ``None``. + """ + super(FsmnEncoder, self).__init__() + self.filter_size = filter_size + self.fsmn_num_layers = fsmn_num_layers + self.dnn_num_layers = dnn_num_layers + self.num_memory_units = num_memory_units + self.ffn_inner_dim = ffn_inner_dim + self.dropout = dropout + self.position_encoder = position_encoder + + def encode(self, inputs, sequence_length=None, mode=True): + if self.position_encoder is not None: + inputs = self.position_encoder(inputs) + + inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) + + mask = fsmn.build_sequence_mask( + sequence_length, maximum_length=tf.shape(inputs)[1]) + + state = () + + for layer in range(self.fsmn_num_layers): + with tf.variable_scope('fsmn_layer_{}'.format(layer)): + with tf.variable_scope('ffn'): + context = fsmn.feed_forward( + inputs, + self.ffn_inner_dim, + self.num_memory_units, + mode, + dropout=self.dropout) + + with tf.variable_scope('memory'): + memory = fsmn.MemoryBlock( + context, + self.filter_size, + mode, + mask=mask, + dropout=self.dropout) + + memory = fsmn.drop_and_add( + inputs, memory, mode, dropout=self.dropout) + + inputs = memory + state += (tf.reduce_mean(inputs, axis=1), ) + + for layer in range(self.dnn_num_layers): + with tf.variable_scope('dnn_layer_{}'.format(layer)): + transformed = fsmn.feed_forward( + inputs, + self.ffn_inner_dim, + self.num_memory_units, + mode, + dropout=self.dropout) + + inputs = transformed + state += (tf.reduce_mean(inputs, axis=1), ) + + outputs = inputs + return (outputs, state, sequence_length) + + +class FsmnEncoderV2(): + """Encoder using Fsmn + """ + + def __init__(self, + filter_size, + fsmn_num_layers, + dnn_num_layers, + num_memory_units=512, + ffn_inner_dim=2048, + dropout=0.0, + shift=0, + position_encoder=None): + """Initializes the parameters of the encoder. + + Args: + filter_size: the total order of memory block + fsmn_num_layers: The number of fsmn layers. + dnn_num_layers: The number of dnn layers + num_units: The number of memory units. + ffn_inner_dim: The number of units of the inner linear transformation + in the feed forward layer. + dropout: The probability to drop units from the outputs. + shift: left padding, to control delay + position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to + apply on inputs or ``None``. 
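+
+        Example:
+          An illustrative construction; the argument values are arbitrary and
+          ``inputs`` ([Batch, Time, Depth]), ``input_lengths`` and
+          ``is_training`` are placeholders defined elsewhere, not part of this
+          module::
+
+            encoder = FsmnEncoderV2(filter_size=11, fsmn_num_layers=6,
+                                    dnn_num_layers=1, num_memory_units=256,
+                                    ffn_inner_dim=1024, dropout=0.1, shift=1)
+            outputs, state, lengths = encoder.encode(
+                inputs, sequence_length=input_lengths, mode=is_training)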
+ """ + super(FsmnEncoderV2, self).__init__() + self.filter_size = filter_size + self.fsmn_num_layers = fsmn_num_layers + self.dnn_num_layers = dnn_num_layers + self.num_memory_units = num_memory_units + self.ffn_inner_dim = ffn_inner_dim + self.dropout = dropout + self.shift = shift + if not isinstance(shift, list): + self.shift = [shift for _ in range(self.fsmn_num_layers)] + self.position_encoder = position_encoder + + def encode(self, inputs, sequence_length=None, mode=True): + if self.position_encoder is not None: + inputs = self.position_encoder(inputs) + + inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) + + mask = fsmn.build_sequence_mask( + sequence_length, maximum_length=tf.shape(inputs)[1]) + + state = () + for layer in range(self.fsmn_num_layers): + with tf.variable_scope('fsmn_layer_{}'.format(layer)): + with tf.variable_scope('ffn'): + context = fsmn.feed_forward( + inputs, + self.ffn_inner_dim, + self.num_memory_units, + mode, + dropout=self.dropout) + + with tf.variable_scope('memory'): + memory = fsmn.MemoryBlockV2( + context, + self.filter_size, + mode, + shift=self.shift[layer], + mask=mask, + dropout=self.dropout) + + memory = fsmn.drop_and_add( + inputs, memory, mode, dropout=self.dropout) + + inputs = memory + state += (tf.reduce_mean(inputs, axis=1), ) + + for layer in range(self.dnn_num_layers): + with tf.variable_scope('dnn_layer_{}'.format(layer)): + transformed = fsmn.feed_forward( + inputs, + self.ffn_inner_dim, + self.num_memory_units, + mode, + dropout=self.dropout) + + inputs = transformed + state += (tf.reduce_mean(inputs, axis=1), ) + + outputs = inputs + return (outputs, state, sequence_length) diff --git a/modelscope/models/audio/tts/am/models/helpers.py b/modelscope/models/audio/tts/am/models/helpers.py new file mode 100755 index 00000000..f3e53277 --- /dev/null +++ b/modelscope/models/audio/tts/am/models/helpers.py @@ -0,0 +1,160 @@ +import numpy as np +import tensorflow as tf +from tensorflow.contrib.seq2seq import Helper + + +class VarTestHelper(Helper): + + def __init__(self, batch_size, inputs, dim): + with tf.name_scope('VarTestHelper'): + self._batch_size = batch_size + self._inputs = inputs + self._dim = dim + + num_steps = tf.shape(self._inputs)[1] + self._lengths = tf.tile([num_steps], [self._batch_size]) + + self._inputs = tf.roll(inputs, shift=-1, axis=1) + self._init_inputs = inputs[:, 0, :] + + @property + def batch_size(self): + return self._batch_size + + @property + def sample_ids_shape(self): + return tf.TensorShape([]) + + @property + def sample_ids_dtype(self): + return np.int32 + + def initialize(self, name=None): + return (tf.tile([False], [self._batch_size]), + _go_frames(self._batch_size, self._dim, self._init_inputs)) + + def sample(self, time, outputs, state, name=None): + return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them + + def next_inputs(self, time, outputs, state, sample_ids, name=None): + with tf.name_scope('VarTestHelper'): + finished = (time + 1 >= self._lengths) + next_inputs = tf.concat([outputs, self._inputs[:, time, :]], + axis=-1) + return (finished, next_inputs, state) + + +class VarTrainingHelper(Helper): + + def __init__(self, targets, inputs, dim): + with tf.name_scope('VarTrainingHelper'): + self._targets = targets # [N, T_in, 1] + self._batch_size = tf.shape(inputs)[0] # N + self._inputs = inputs + self._dim = dim + + num_steps = tf.shape(self._targets)[1] + self._lengths = tf.tile([num_steps], [self._batch_size]) + + self._inputs = tf.roll(inputs, shift=-1, axis=1) + 
self._init_inputs = inputs[:, 0, :] + + @property + def batch_size(self): + return self._batch_size + + @property + def sample_ids_shape(self): + return tf.TensorShape([]) + + @property + def sample_ids_dtype(self): + return np.int32 + + def initialize(self, name=None): + return (tf.tile([False], [self._batch_size]), + _go_frames(self._batch_size, self._dim, self._init_inputs)) + + def sample(self, time, outputs, state, name=None): + return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them + + def next_inputs(self, time, outputs, state, sample_ids, name=None): + with tf.name_scope(name or 'VarTrainingHelper'): + finished = (time + 1 >= self._lengths) + next_inputs = tf.concat( + [self._targets[:, time, :], self._inputs[:, time, :]], axis=-1) + return (finished, next_inputs, state) + + +class VarTrainingSSHelper(Helper): + + def __init__(self, targets, inputs, dim, global_step, schedule_begin, + alpha, decay_steps): + with tf.name_scope('VarTrainingSSHelper'): + self._targets = targets # [N, T_in, 1] + self._batch_size = tf.shape(inputs)[0] # N + self._inputs = inputs + self._dim = dim + + num_steps = tf.shape(self._targets)[1] + self._lengths = tf.tile([num_steps], [self._batch_size]) + + self._inputs = tf.roll(inputs, shift=-1, axis=1) + self._init_inputs = inputs[:, 0, :] + + # for schedule sampling + self._global_step = global_step + self._schedule_begin = schedule_begin + self._alpha = alpha + self._decay_steps = decay_steps + + @property + def batch_size(self): + return self._batch_size + + @property + def sample_ids_shape(self): + return tf.TensorShape([]) + + @property + def sample_ids_dtype(self): + return np.int32 + + def initialize(self, name=None): + self._ratio = _tf_decay(self._global_step, self._schedule_begin, + self._alpha, self._decay_steps) + return (tf.tile([False], [self._batch_size]), + _go_frames(self._batch_size, self._dim, self._init_inputs)) + + def sample(self, time, outputs, state, name=None): + return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them + + def next_inputs(self, time, outputs, state, sample_ids, name=None): + with tf.name_scope(name or 'VarTrainingHelper'): + finished = (time + 1 >= self._lengths) + next_inputs_tmp = tf.cond( + tf.less( + tf.random_uniform([], minval=0, maxval=1, + dtype=tf.float32), self._ratio), + lambda: self._targets[:, time, :], lambda: outputs) + next_inputs = tf.concat( + [next_inputs_tmp, self._inputs[:, time, :]], axis=-1) + return (finished, next_inputs, state) + + +def _go_frames(batch_size, dim, init_inputs): + '''Returns all-zero frames for a given batch size and output dimension''' + return tf.concat([tf.tile([[0.0]], [batch_size, dim]), init_inputs], + axis=-1) + + +def _tf_decay(global_step, schedule_begin, alpha, decay_steps): + tfr = tf.train.exponential_decay( + 1.0, + global_step=global_step - schedule_begin, + decay_steps=decay_steps, + decay_rate=alpha, + name='tfr_decay') + final_tfr = tf.cond( + tf.less(global_step, schedule_begin), lambda: 1.0, lambda: tfr) + return final_tfr diff --git a/modelscope/models/audio/tts/am/models/modules.py b/modelscope/models/audio/tts/am/models/modules.py new file mode 100755 index 00000000..1433fd7e --- /dev/null +++ b/modelscope/models/audio/tts/am/models/modules.py @@ -0,0 +1,461 @@ +import tensorflow as tf +from tensorflow.contrib.cudnn_rnn import CudnnLSTM +from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops +from tensorflow.contrib.rnn import LSTMBlockCell + + +def encoder_prenet(inputs, + n_conv_layers, + filters, + 
kernel_size, + dense_units, + is_training, + mask=None, + scope='encoder_prenet'): + x = inputs + with tf.variable_scope(scope): + for i in range(n_conv_layers): + x = conv1d( + x, + filters, + kernel_size, + is_training, + activation=tf.nn.relu, + dropout=True, + mask=mask, + scope='conv1d_{}'.format(i)) + x = tf.layers.dense( + x, units=dense_units, activation=None, name='dense') + return x + + +def decoder_prenet(inputs, + prenet_units, + dense_units, + is_training, + scope='decoder_prenet'): + x = inputs + with tf.variable_scope(scope): + for i, units in enumerate(prenet_units): + x = tf.layers.dense( + x, + units=units, + activation=tf.nn.relu, + name='dense_{}'.format(i)) + x = tf.layers.dropout( + x, rate=0.5, training=is_training, name='dropout_{}'.format(i)) + x = tf.layers.dense( + x, units=dense_units, activation=None, name='dense') + return x + + +def encoder(inputs, + input_lengths, + n_conv_layers, + filters, + kernel_size, + lstm_units, + is_training, + embedded_inputs_speaker, + mask=None, + scope='encoder'): + with tf.variable_scope(scope): + x = conv_and_lstm( + inputs, + input_lengths, + n_conv_layers, + filters, + kernel_size, + lstm_units, + is_training, + embedded_inputs_speaker, + mask=mask) + return x + + +def prenet(inputs, prenet_units, is_training, scope='prenet'): + x = inputs + with tf.variable_scope(scope): + for i, units in enumerate(prenet_units): + x = tf.layers.dense( + x, + units=units, + activation=tf.nn.relu, + name='dense_{}'.format(i)) + x = tf.layers.dropout( + x, rate=0.5, training=is_training, name='dropout_{}'.format(i)) + return x + + +def postnet_residual_ulstm(inputs, + n_conv_layers, + filters, + kernel_size, + lstm_units, + output_units, + is_training, + scope='postnet_residual_ulstm'): + with tf.variable_scope(scope): + x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size, + lstm_units, is_training) + x = conv1d( + x, + output_units, + kernel_size, + is_training, + activation=None, + dropout=False, + scope='conv1d_{}'.format(n_conv_layers - 1)) + return x + + +def postnet_residual_lstm(inputs, + n_conv_layers, + filters, + kernel_size, + lstm_units, + output_units, + is_training, + scope='postnet_residual_lstm'): + with tf.variable_scope(scope): + x = conv_and_lstm(inputs, None, n_conv_layers, filters, kernel_size, + lstm_units, is_training) + x = conv1d( + x, + output_units, + kernel_size, + is_training, + activation=None, + dropout=False, + scope='conv1d_{}'.format(n_conv_layers - 1)) + return x + + +def postnet_linear_ulstm(inputs, + n_conv_layers, + filters, + kernel_size, + lstm_units, + output_units, + is_training, + scope='postnet_linear'): + with tf.variable_scope(scope): + x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size, + lstm_units, is_training) + x = tf.layers.dense(x, units=output_units) + return x + + +def postnet_linear_lstm(inputs, + n_conv_layers, + filters, + kernel_size, + lstm_units, + output_units, + output_lengths, + is_training, + embedded_inputs_speaker2, + mask=None, + scope='postnet_linear'): + with tf.variable_scope(scope): + x = conv_and_lstm_dec( + inputs, + output_lengths, + n_conv_layers, + filters, + kernel_size, + lstm_units, + is_training, + embedded_inputs_speaker2, + mask=mask) + x = tf.layers.dense(x, units=output_units) + return x + + +def postnet_linear(inputs, + n_conv_layers, + filters, + kernel_size, + lstm_units, + output_units, + output_lengths, + is_training, + embedded_inputs_speaker2, + mask=None, + scope='postnet_linear'): + with tf.variable_scope(scope): + 
x = conv_dec( + inputs, + output_lengths, + n_conv_layers, + filters, + kernel_size, + lstm_units, + is_training, + embedded_inputs_speaker2, + mask=mask) + return x + + +def conv_and_lstm(inputs, + sequence_lengths, + n_conv_layers, + filters, + kernel_size, + lstm_units, + is_training, + embedded_inputs_speaker, + mask=None, + scope='conv_and_lstm'): + x = inputs + with tf.variable_scope(scope): + for i in range(n_conv_layers): + x = conv1d( + x, + filters, + kernel_size, + is_training, + activation=tf.nn.relu, + dropout=True, + mask=mask, + scope='conv1d_{}'.format(i)) + + x = tf.concat([x, embedded_inputs_speaker], axis=2) + + outputs, states = tf.nn.bidirectional_dynamic_rnn( + LSTMBlockCell(lstm_units), + LSTMBlockCell(lstm_units), + x, + sequence_length=sequence_lengths, + dtype=tf.float32) + x = tf.concat(outputs, axis=-1) + + return x + + +def conv_and_lstm_dec(inputs, + sequence_lengths, + n_conv_layers, + filters, + kernel_size, + lstm_units, + is_training, + embedded_inputs_speaker2, + mask=None, + scope='conv_and_lstm'): + x = inputs + with tf.variable_scope(scope): + for i in range(n_conv_layers): + x = conv1d( + x, + filters, + kernel_size, + is_training, + activation=tf.nn.relu, + dropout=True, + mask=mask, + scope='conv1d_{}'.format(i)) + + x = tf.concat([x, embedded_inputs_speaker2], axis=2) + + outputs, states = tf.nn.bidirectional_dynamic_rnn( + LSTMBlockCell(lstm_units), + LSTMBlockCell(lstm_units), + x, + sequence_length=sequence_lengths, + dtype=tf.float32) + x = tf.concat(outputs, axis=-1) + return x + + +def conv_dec(inputs, + sequence_lengths, + n_conv_layers, + filters, + kernel_size, + lstm_units, + is_training, + embedded_inputs_speaker2, + mask=None, + scope='conv_and_lstm'): + x = inputs + with tf.variable_scope(scope): + for i in range(n_conv_layers): + x = conv1d( + x, + filters, + kernel_size, + is_training, + activation=tf.nn.relu, + dropout=True, + mask=mask, + scope='conv1d_{}'.format(i)) + x = tf.concat([x, embedded_inputs_speaker2], axis=2) + return x + + +def conv_and_ulstm(inputs, + sequence_lengths, + n_conv_layers, + filters, + kernel_size, + lstm_units, + is_training, + scope='conv_and_ulstm'): + x = inputs + with tf.variable_scope(scope): + for i in range(n_conv_layers): + x = conv1d( + x, + filters, + kernel_size, + is_training, + activation=tf.nn.relu, + dropout=True, + scope='conv1d_{}'.format(i)) + + outputs, states = tf.nn.dynamic_rnn( + LSTMBlockCell(lstm_units), + x, + sequence_length=sequence_lengths, + dtype=tf.float32) + + return outputs + + +def conv1d(inputs, + filters, + kernel_size, + is_training, + activation=None, + dropout=False, + mask=None, + scope='conv1d'): + with tf.variable_scope(scope): + if mask is not None: + inputs = inputs * tf.expand_dims(mask, -1) + x = tf.layers.conv1d( + inputs, filters=filters, kernel_size=kernel_size, padding='same') + if mask is not None: + x = x * tf.expand_dims(mask, -1) + + x = tf.layers.batch_normalization(x, training=is_training) + if activation is not None: + x = activation(x) + if dropout: + x = tf.layers.dropout(x, rate=0.5, training=is_training) + return x + + +def conv1d_dp(inputs, + filters, + kernel_size, + is_training, + activation=None, + dropout=False, + dropoutrate=0.5, + mask=None, + scope='conv1d'): + with tf.variable_scope(scope): + if mask is not None: + inputs = inputs * tf.expand_dims(mask, -1) + x = tf.layers.conv1d( + inputs, filters=filters, kernel_size=kernel_size, padding='same') + if mask is not None: + x = x * tf.expand_dims(mask, -1) + + x = 
tf.contrib.layers.layer_norm(x) + if activation is not None: + x = activation(x) + if dropout: + x = tf.layers.dropout(x, rate=dropoutrate, training=is_training) + return x + + +def duration_predictor(inputs, + n_conv_layers, + filters, + kernel_size, + lstm_units, + input_lengths, + is_training, + embedded_inputs_speaker, + mask=None, + scope='duration_predictor'): + with tf.variable_scope(scope): + x = inputs + for i in range(n_conv_layers): + x = conv1d_dp( + x, + filters, + kernel_size, + is_training, + activation=tf.nn.relu, + dropout=True, + dropoutrate=0.1, + mask=mask, + scope='conv1d_{}'.format(i)) + + x = tf.concat([x, embedded_inputs_speaker], axis=2) + + outputs, states = tf.nn.bidirectional_dynamic_rnn( + LSTMBlockCell(lstm_units), + LSTMBlockCell(lstm_units), + x, + sequence_length=input_lengths, + dtype=tf.float32) + x = tf.concat(outputs, axis=-1) + + x = tf.layers.dense(x, units=1) + x = tf.nn.relu(x) + return x + + +def duration_predictor2(inputs, + n_conv_layers, + filters, + kernel_size, + input_lengths, + is_training, + mask=None, + scope='duration_predictor'): + with tf.variable_scope(scope): + x = inputs + for i in range(n_conv_layers): + x = conv1d_dp( + x, + filters, + kernel_size, + is_training, + activation=tf.nn.relu, + dropout=True, + dropoutrate=0.1, + mask=mask, + scope='conv1d_{}'.format(i)) + + x = tf.layers.dense(x, units=1) + x = tf.nn.relu(x) + return x + + +def conv_prenet(inputs, + n_conv_layers, + filters, + kernel_size, + is_training, + mask=None, + scope='conv_prenet'): + x = inputs + with tf.variable_scope(scope): + for i in range(n_conv_layers): + x = conv1d( + x, + filters, + kernel_size, + is_training, + activation=tf.nn.relu, + dropout=True, + mask=mask, + scope='conv1d_{}'.format(i)) + + return x diff --git a/modelscope/models/audio/tts/am/models/position.py b/modelscope/models/audio/tts/am/models/position.py new file mode 100755 index 00000000..bca658dd --- /dev/null +++ b/modelscope/models/audio/tts/am/models/position.py @@ -0,0 +1,174 @@ +"""Define position encoder classes.""" + +import abc +import math + +import tensorflow as tf + +from .reducer import SumReducer + + +class PositionEncoder(tf.keras.layers.Layer): + """Base class for position encoders.""" + + def __init__(self, reducer=None, **kwargs): + """Initializes the position encoder. + Args: + reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position + encodings. Defaults to :class:`opennmt.layers.SumReducer`. + **kwargs: Additional layer keyword arguments. + """ + super(PositionEncoder, self).__init__(**kwargs) + if reducer is None: + reducer = SumReducer(dtype=kwargs.get('dtype')) + self.reducer = reducer + + def call(self, inputs, position=None): # pylint: disable=arguments-differ + """Add position encodings to :obj:`inputs`. + Args: + inputs: The inputs to encode. + position: The single position to encode, to use when this layer is called + step by step. + Returns: + A ``tf.Tensor`` whose shape depends on the configured ``reducer``. + """ + batch_size = tf.shape(inputs)[0] + timesteps = tf.shape(inputs)[1] + input_dim = inputs.shape[-1].value + positions = tf.range(timesteps) + 1 if position is None else [position] + position_encoding = self._encode([positions], input_dim) + position_encoding = tf.tile(position_encoding, [batch_size, 1, 1]) + return self.reducer([inputs, position_encoding]) + + @abc.abstractmethod + def _encode(self, positions, depth): + """Creates position encodings. + Args: + positions: The positions to encode of shape :math:`[B, ...]`. 
+ depth: The encoding depth :math:`D`. + Returns: + A ``tf.Tensor`` of shape :math:`[B, ..., D]`. + """ + raise NotImplementedError() + + +class PositionEmbedder(PositionEncoder): + """Encodes position with a lookup table.""" + + def __init__(self, maximum_position=128, reducer=None, **kwargs): + """Initializes the position encoder. + Args: + maximum_position: The maximum position to embed. Positions greater + than this value will be set to :obj:`maximum_position`. + reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position + encodings. Defaults to :class:`opennmt.layers.SumReducer`. + **kwargs: Additional layer keyword arguments. + """ + super(PositionEmbedder, self).__init__(reducer=reducer, **kwargs) + self.maximum_position = maximum_position + self.embedding = None + + def build(self, input_shape): + shape = [self.maximum_position + 1, input_shape[-1]] + self.embedding = self.add_weight('position_embedding', shape) + super(PositionEmbedder, self).build(input_shape) + + def _encode(self, positions, depth): + positions = tf.minimum(positions, self.maximum_position) + return tf.nn.embedding_lookup(self.embedding, positions) + + +class SinusoidalPositionEncoder(PositionEncoder): + """Encodes positions with sine waves as described in + https://arxiv.org/abs/1706.03762. + """ + + def _encode(self, positions, depth): + if depth % 2 != 0: + raise ValueError( + 'SinusoidalPositionEncoder expects the depth to be divisble ' + 'by 2 but got %d' % depth) + + batch_size = tf.shape(positions)[0] + positions = tf.cast(positions, tf.float32) + + log_timescale_increment = math.log(10000) / (depth / 2 - 1) + inv_timescales = tf.exp( + tf.range(depth / 2, dtype=tf.float32) * -log_timescale_increment) + inv_timescales = tf.reshape( + tf.tile(inv_timescales, [batch_size]), [batch_size, depth // 2]) + scaled_time = tf.expand_dims(positions, -1) * tf.expand_dims( + inv_timescales, 1) + encoding = tf.concat( + [tf.sin(scaled_time), tf.cos(scaled_time)], axis=2) + return tf.cast(encoding, self.dtype) + + +class SinusodalPositionalEncoding(tf.keras.layers.Layer): + + def __init__(self, name='SinusodalPositionalEncoding'): + super(SinusodalPositionalEncoding, self).__init__(name=name) + + @staticmethod + def positional_encoding(len, dim, step=1.): + """ + :param len: int scalar + :param dim: int scalar + :param step: + :return: position embedding + """ + pos_mat = tf.tile( + tf.expand_dims( + tf.range(0, tf.cast(len, dtype=tf.float32), dtype=tf.float32) + * step, + axis=-1), [1, dim]) + dim_mat = tf.tile( + tf.expand_dims( + tf.range(0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32), + axis=0), [len, 1]) + dim_mat_int = tf.cast(dim_mat, dtype=tf.int32) + pos_encoding = tf.where( # [time, dims] + tf.math.equal(tf.math.mod(dim_mat_int, 2), 0), + x=tf.math.sin( + pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))), + y=tf.math.cos(pos_mat + / tf.pow(10000., + (dim_mat - 1) / tf.cast(dim, tf.float32)))) + return pos_encoding + + +class BatchSinusodalPositionalEncoding(tf.keras.layers.Layer): + + def __init__(self, name='BatchSinusodalPositionalEncoding'): + super(BatchSinusodalPositionalEncoding, self).__init__(name=name) + + @staticmethod + def positional_encoding(batch_size, len, dim, pos_mat, step=1.): + """ + :param len: int scalar + :param dim: int scalar + :param step: + :param pos_mat: [B, len] = [len, 1] * dim + :return: position embedding + """ + pos_mat = tf.tile( + tf.expand_dims(tf.cast(pos_mat, dtype=tf.float32) * step, axis=-1), + [1, 1, dim]) # [B, len, dim] + + dim_mat = 
tf.tile( + tf.expand_dims( + tf.expand_dims( + tf.range( + 0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32), + axis=0), + axis=0), [batch_size, len, 1]) # [B, len, dim] + + dim_mat_int = tf.cast(dim_mat, dtype=tf.int32) + pos_encoding = tf.where( # [B, time, dims] + tf.math.equal(tf.mod(dim_mat_int, 2), 0), + x=tf.math.sin( + pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))), + y=tf.math.cos(pos_mat + / tf.pow(10000., + (dim_mat - 1) / tf.cast(dim, tf.float32)))) + return pos_encoding diff --git a/modelscope/models/audio/tts/am/models/reducer.py b/modelscope/models/audio/tts/am/models/reducer.py new file mode 100755 index 00000000..a4c9ae17 --- /dev/null +++ b/modelscope/models/audio/tts/am/models/reducer.py @@ -0,0 +1,155 @@ +"""Define reducers: objects that merge inputs.""" + +import abc +import functools + +import tensorflow as tf + + +def pad_in_time(x, padding_length): + """Helper function to pad a tensor in the time dimension and retain the static depth dimension.""" + return tf.pad(x, [[0, 0], [0, padding_length], [0, 0]]) + + +def align_in_time(x, length): + """Aligns the time dimension of :obj:`x` with :obj:`length`.""" + time_dim = tf.shape(x)[1] + return tf.cond( + tf.less(time_dim, length), + true_fn=lambda: pad_in_time(x, length - time_dim), + false_fn=lambda: x[:, :length]) + + +def pad_with_identity(x, + sequence_length, + max_sequence_length, + identity_values=0, + maxlen=None): + """Pads a tensor with identity values up to :obj:`max_sequence_length`. + Args: + x: A ``tf.Tensor`` of shape ``[batch_size, time, depth]``. + sequence_length: The true sequence length of :obj:`x`. + max_sequence_length: The sequence length up to which the tensor must contain + :obj:`identity values`. + identity_values: The identity value. + maxlen: Size of the output time dimension. Default is the maximum value in + obj:`max_sequence_length`. + Returns: + A ``tf.Tensor`` of shape ``[batch_size, maxlen, depth]``. + """ + if maxlen is None: + maxlen = tf.reduce_max(max_sequence_length) + + mask = tf.sequence_mask(sequence_length, maxlen=maxlen, dtype=x.dtype) + mask = tf.expand_dims(mask, axis=-1) + mask_combined = tf.sequence_mask( + max_sequence_length, maxlen=maxlen, dtype=x.dtype) + mask_combined = tf.expand_dims(mask_combined, axis=-1) + + identity_mask = mask_combined * (1.0 - mask) + + x = pad_in_time(x, maxlen - tf.shape(x)[1]) + x = x * mask + (identity_mask * identity_values) + + return x + + +def pad_n_with_identity(inputs, sequence_lengths, identity_values=0): + """Pads each input tensors with identity values up to + ``max(sequence_lengths)`` for each batch. + Args: + inputs: A list of ``tf.Tensor``. + sequence_lengths: A list of sequence length. + identity_values: The identity value. + Returns: + A tuple ``(padded, max_sequence_length)`` which are respectively a list of + ``tf.Tensor`` where each tensor are padded with identity and the combined + sequence length. + """ + max_sequence_length = tf.reduce_max(sequence_lengths, axis=0) + maxlen = tf.reduce_max([tf.shape(x)[1] for x in inputs]) + padded = [ + pad_with_identity( + x, + length, + max_sequence_length, + identity_values=identity_values, + maxlen=maxlen) for x, length in zip(inputs, sequence_lengths) + ] + return padded, max_sequence_length + + +class Reducer(tf.keras.layers.Layer): + """Base class for reducers.""" + + def zip_and_reduce(self, x, y): + """Zips the :obj:`x` with :obj:`y` structures together and reduces all + elements. If the structures are nested, they will be flattened first. 
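+
+        Example (illustrative; ``a`` .. ``d`` stand for tensors of matching
+        shapes, so element-wise reduction is well defined)::
+
+            reducer = SumReducer()
+            merged = reducer.zip_and_reduce([a, b], [c, d])  # [a + c, b + d]
+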
+ Args: + x: The first structure. + y: The second structure. + Returns: + The same structure as :obj:`x` and :obj:`y` where each element from + :obj:`x` is reduced with the correspond element from :obj:`y`. + Raises: + ValueError: if the two structures are not the same. + """ + tf.nest.assert_same_structure(x, y) + x_flat = tf.nest.flatten(x) + y_flat = tf.nest.flatten(y) + reduced = list(map(self, zip(x_flat, y_flat))) + return tf.nest.pack_sequence_as(x, reduced) + + def call(self, inputs, sequence_length=None): # pylint: disable=arguments-differ + """Reduces all input elements. + Args: + inputs: A list of ``tf.Tensor``. + sequence_length: The length of each input, if reducing sequences. + Returns: + If :obj:`sequence_length` is set, a tuple + ``(reduced_input, reduced_length)``, otherwise a reduced ``tf.Tensor`` + only. + """ + if sequence_length is None: + return self.reduce(inputs) + else: + return self.reduce_sequence( + inputs, sequence_lengths=sequence_length) + + @abc.abstractmethod + def reduce(self, inputs): + """See :meth:`opennmt.layers.Reducer.__call__`.""" + raise NotImplementedError() + + @abc.abstractmethod + def reduce_sequence(self, inputs, sequence_lengths): + """See :meth:`opennmt.layers.Reducer.__call__`.""" + raise NotImplementedError() + + +class SumReducer(Reducer): + """A reducer that sums the inputs.""" + + def reduce(self, inputs): + if len(inputs) == 1: + return inputs[0] + if len(inputs) == 2: + return inputs[0] + inputs[1] + return tf.add_n(inputs) + + def reduce_sequence(self, inputs, sequence_lengths): + padded, combined_length = pad_n_with_identity( + inputs, sequence_lengths, identity_values=0) + return self.reduce(padded), combined_length + + +class MultiplyReducer(Reducer): + """A reducer that multiplies the inputs.""" + + def reduce(self, inputs): + return functools.reduce(lambda a, x: a * x, inputs) + + def reduce_sequence(self, inputs, sequence_lengths): + padded, combined_length = pad_n_with_identity( + inputs, sequence_lengths, identity_values=1) + return self.reduce(padded), combined_length diff --git a/modelscope/models/audio/tts/am/models/rnn_wrappers.py b/modelscope/models/audio/tts/am/models/rnn_wrappers.py new file mode 100755 index 00000000..8f0d612b --- /dev/null +++ b/modelscope/models/audio/tts/am/models/rnn_wrappers.py @@ -0,0 +1,240 @@ +import numpy as np +import tensorflow as tf +from tensorflow.contrib.rnn import RNNCell +from tensorflow.contrib.seq2seq import AttentionWrapperState +from tensorflow.python.ops import rnn_cell_impl + +from .modules import prenet + + +class VarPredictorCell(RNNCell): + '''Wrapper wrapper knock knock.''' + + def __init__(self, var_predictor_cell, is_training, dim, prenet_units): + super(VarPredictorCell, self).__init__() + self._var_predictor_cell = var_predictor_cell + self._is_training = is_training + self._dim = dim + self._prenet_units = prenet_units + + @property + def state_size(self): + return tuple([self.output_size, self._var_predictor_cell.state_size]) + + @property + def output_size(self): + return self._dim + + def zero_state(self, batch_size, dtype): + return tuple([ + rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, + dtype), + self._var_predictor_cell.zero_state(batch_size, dtype) + ]) + + def call(self, inputs, state): + '''Run the Tacotron2 super decoder cell.''' + super_cell_out, decoder_state = state + + # split + prenet_input = inputs[:, 0:self._dim] + encoder_output = inputs[:, self._dim:] + + # prenet and concat + prenet_output = prenet( + prenet_input, + 
self._prenet_units, + self._is_training, + scope='var_prenet') + decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) + + # decoder LSTM/GRU + new_super_cell_out, new_decoder_state = self._var_predictor_cell( + decoder_input, decoder_state) + + # projection + new_super_cell_out = tf.layers.dense( + new_super_cell_out, units=self._dim) + + new_states = tuple([new_super_cell_out, new_decoder_state]) + + return new_super_cell_out, new_states + + +class DurPredictorCell(RNNCell): + '''Wrapper wrapper knock knock.''' + + def __init__(self, var_predictor_cell, is_training, dim, prenet_units): + super(DurPredictorCell, self).__init__() + self._var_predictor_cell = var_predictor_cell + self._is_training = is_training + self._dim = dim + self._prenet_units = prenet_units + + @property + def state_size(self): + return tuple([self.output_size, self._var_predictor_cell.state_size]) + + @property + def output_size(self): + return self._dim + + def zero_state(self, batch_size, dtype): + return tuple([ + rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, + dtype), + self._var_predictor_cell.zero_state(batch_size, dtype) + ]) + + def call(self, inputs, state): + '''Run the Tacotron2 super decoder cell.''' + super_cell_out, decoder_state = state + + # split + prenet_input = inputs[:, 0:self._dim] + encoder_output = inputs[:, self._dim:] + + # prenet and concat + prenet_output = prenet( + prenet_input, + self._prenet_units, + self._is_training, + scope='dur_prenet') + decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) + + # decoder LSTM/GRU + new_super_cell_out, new_decoder_state = self._var_predictor_cell( + decoder_input, decoder_state) + + # projection + new_super_cell_out = tf.layers.dense( + new_super_cell_out, units=self._dim) + new_super_cell_out = tf.nn.relu(new_super_cell_out) + # new_super_cell_out = tf.log(tf.cast(tf.round(tf.exp(new_super_cell_out) - 1), tf.float32) + 1) + + new_states = tuple([new_super_cell_out, new_decoder_state]) + + return new_super_cell_out, new_states + + +class DurPredictorCECell(RNNCell): + '''Wrapper wrapper knock knock.''' + + def __init__(self, var_predictor_cell, is_training, dim, prenet_units, + max_dur, dur_embedding_dim): + super(DurPredictorCECell, self).__init__() + self._var_predictor_cell = var_predictor_cell + self._is_training = is_training + self._dim = dim + self._prenet_units = prenet_units + self._max_dur = max_dur + self._dur_embedding_dim = dur_embedding_dim + + @property + def state_size(self): + return tuple([self.output_size, self._var_predictor_cell.state_size]) + + @property + def output_size(self): + return self._max_dur + + def zero_state(self, batch_size, dtype): + return tuple([ + rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, + dtype), + self._var_predictor_cell.zero_state(batch_size, dtype) + ]) + + def call(self, inputs, state): + '''Run the Tacotron2 super decoder cell.''' + super_cell_out, decoder_state = state + + # split + prenet_input = tf.squeeze( + tf.cast(inputs[:, 0:self._dim], tf.int32), axis=-1) # [N] + prenet_input = tf.one_hot( + prenet_input, self._max_dur, on_value=1.0, off_value=0.0, + axis=-1) # [N, 120] + prenet_input = tf.layers.dense( + prenet_input, units=self._dur_embedding_dim) + encoder_output = inputs[:, self._dim:] + + # prenet and concat + prenet_output = prenet( + prenet_input, + self._prenet_units, + self._is_training, + scope='dur_prenet') + decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) + + # decoder LSTM/GRU + 
new_super_cell_out, new_decoder_state = self._var_predictor_cell( + decoder_input, decoder_state) + + # projection + new_super_cell_out = tf.layers.dense( + new_super_cell_out, units=self._max_dur) # [N, 120] + new_super_cell_out = tf.nn.softmax(new_super_cell_out) # [N, 120] + + new_states = tuple([new_super_cell_out, new_decoder_state]) + + return new_super_cell_out, new_states + + +class VarPredictorCell2(RNNCell): + '''Wrapper wrapper knock knock.''' + + def __init__(self, var_predictor_cell, is_training, dim, prenet_units): + super(VarPredictorCell2, self).__init__() + self._var_predictor_cell = var_predictor_cell + self._is_training = is_training + self._dim = dim + self._prenet_units = prenet_units + + @property + def state_size(self): + return tuple([self.output_size, self._var_predictor_cell.state_size]) + + @property + def output_size(self): + return self._dim + + def zero_state(self, batch_size, dtype): + return tuple([ + rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, + dtype), + self._var_predictor_cell.zero_state(batch_size, dtype) + ]) + + def call(self, inputs, state): + '''Run the Tacotron2 super decoder cell.''' + super_cell_out, decoder_state = state + + # split + prenet_input = inputs[:, 0:self._dim] + encoder_output = inputs[:, self._dim:] + + # prenet and concat + prenet_output = prenet( + prenet_input, + self._prenet_units, + self._is_training, + scope='var_prenet') + decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) + + # decoder LSTM/GRU + new_super_cell_out, new_decoder_state = self._var_predictor_cell( + decoder_input, decoder_state) + + # projection + new_super_cell_out = tf.layers.dense( + new_super_cell_out, units=self._dim) + + # split and relu + new_super_cell_out = tf.concat([ + tf.nn.relu(new_super_cell_out[:, 0:1]), new_super_cell_out[:, 1:] + ], axis=-1) # yapf:disable + + new_states = tuple([new_super_cell_out, new_decoder_state]) + + return new_super_cell_out, new_states diff --git a/modelscope/models/audio/tts/am/models/robutrans.py b/modelscope/models/audio/tts/am/models/robutrans.py new file mode 100755 index 00000000..34b4da7a --- /dev/null +++ b/modelscope/models/audio/tts/am/models/robutrans.py @@ -0,0 +1,760 @@ +import tensorflow as tf +from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell +from tensorflow.contrib.seq2seq import BasicDecoder +from tensorflow.python.ops.ragged.ragged_util import repeat + +from .fsmn_encoder import FsmnEncoderV2 +from .helpers import VarTestHelper, VarTrainingHelper +from .modules import conv_prenet, decoder_prenet, encoder_prenet +from .position import (BatchSinusodalPositionalEncoding, + SinusodalPositionalEncoding) +from .rnn_wrappers import DurPredictorCell, VarPredictorCell +from .self_attention_decoder import SelfAttentionDecoder +from .self_attention_encoder import SelfAttentionEncoder + + +class RobuTrans(): + + def __init__(self, hparams): + self._hparams = hparams + + def initialize(self, + inputs, + inputs_emotion, + inputs_speaker, + input_lengths, + output_lengths=None, + mel_targets=None, + durations=None, + pitch_contours=None, + uv_masks=None, + pitch_scales=None, + duration_scales=None, + energy_contours=None, + energy_scales=None): + '''Initializes the model for inference. + + Sets "mel_outputs", "linear_outputs", "stop_token_outputs", and "alignments" fields. 
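+
+        Note (added for clarity): training mode is inferred from whether
+        ``mel_targets`` is provided. When it is ``None`` the graph is built for
+        inference, and ``pitch_scales``, ``energy_scales`` and
+        ``duration_scales`` are applied to the predicted pitch, energy and
+        duration values.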
+ + Args: + inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of + steps in the input time series, and values are character IDs + input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths + of each sequence in inputs. + output_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths + of each sequence in outputs. + mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number + of steps in the output time series, M is num_mels, and values are entries in the mel + spectrogram. Only needed for training. + ''' + with tf.variable_scope('inference') as _: + is_training = mel_targets is not None + batch_size = tf.shape(inputs)[0] + hp = self._hparams + + input_mask = None + if input_lengths is not None and is_training: + input_mask = tf.sequence_mask( + input_lengths, tf.shape(inputs)[1], dtype=tf.float32) + + if input_mask is not None: + inputs = inputs * tf.expand_dims(input_mask, -1) + + # speaker embedding + embedded_inputs_speaker = tf.layers.dense( + inputs_speaker, + 32, + activation=None, + use_bias=False, + kernel_initializer=tf.truncated_normal_initializer(stddev=0.5)) + + # emotion embedding + embedded_inputs_emotion = tf.layers.dense( + inputs_emotion, + 32, + activation=None, + use_bias=False, + kernel_initializer=tf.truncated_normal_initializer(stddev=0.5)) + + # symbol embedding + with tf.variable_scope('Embedding'): + embedded_inputs = tf.layers.dense( + inputs, + hp.embedding_dim, + activation=None, + use_bias=False, + kernel_initializer=tf.truncated_normal_initializer( + stddev=0.5)) + + # Encoder + with tf.variable_scope('Encoder'): + Encoder = SelfAttentionEncoder( + num_layers=hp.encoder_num_layers, + num_units=hp.encoder_num_units, + num_heads=hp.encoder_num_heads, + ffn_inner_dim=hp.encoder_ffn_inner_dim, + dropout=hp.encoder_dropout, + attention_dropout=hp.encoder_attention_dropout, + relu_dropout=hp.encoder_relu_dropout) + encoder_outputs, state_mo, sequence_length_mo, attns = Encoder.encode( + embedded_inputs, + sequence_length=input_lengths, + mode=is_training) + encoder_outputs = tf.layers.dense( + encoder_outputs, + hp.encoder_projection_units, + activation=None, + use_bias=False, + kernel_initializer=tf.truncated_normal_initializer( + stddev=0.5)) + + # pitch and energy + var_inputs = tf.concat([ + encoder_outputs, embedded_inputs_speaker, + embedded_inputs_emotion + ], 2) + if input_mask is not None: + var_inputs = var_inputs * tf.expand_dims(input_mask, -1) + + with tf.variable_scope('Pitch_Predictor'): + Pitch_Predictor_FSMN = FsmnEncoderV2( + filter_size=hp.predictor_filter_size, + fsmn_num_layers=hp.predictor_fsmn_num_layers, + dnn_num_layers=hp.predictor_dnn_num_layers, + num_memory_units=hp.predictor_num_memory_units, + ffn_inner_dim=hp.predictor_ffn_inner_dim, + dropout=hp.predictor_dropout, + shift=hp.predictor_shift, + position_encoder=None) + pitch_contour_outputs, _, _ = Pitch_Predictor_FSMN.encode( + tf.concat([ + encoder_outputs, embedded_inputs_speaker, + embedded_inputs_emotion + ], 2), + sequence_length=input_lengths, + mode=is_training) + pitch_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn( + LSTMBlockCell(hp.predictor_lstm_units), + LSTMBlockCell(hp.predictor_lstm_units), + pitch_contour_outputs, + sequence_length=input_lengths, + dtype=tf.float32) + pitch_contour_outputs = tf.concat( + pitch_contour_outputs, axis=-1) + pitch_contour_outputs = tf.layers.dense( + pitch_contour_outputs, units=1) # [N, T_in, 1] + 
pitch_contour_outputs = tf.squeeze( + pitch_contour_outputs, axis=2) # [N, T_in] + + with tf.variable_scope('Energy_Predictor'): + Energy_Predictor_FSMN = FsmnEncoderV2( + filter_size=hp.predictor_filter_size, + fsmn_num_layers=hp.predictor_fsmn_num_layers, + dnn_num_layers=hp.predictor_dnn_num_layers, + num_memory_units=hp.predictor_num_memory_units, + ffn_inner_dim=hp.predictor_ffn_inner_dim, + dropout=hp.predictor_dropout, + shift=hp.predictor_shift, + position_encoder=None) + energy_contour_outputs, _, _ = Energy_Predictor_FSMN.encode( + tf.concat([ + encoder_outputs, embedded_inputs_speaker, + embedded_inputs_emotion + ], 2), + sequence_length=input_lengths, + mode=is_training) + energy_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn( + LSTMBlockCell(hp.predictor_lstm_units), + LSTMBlockCell(hp.predictor_lstm_units), + energy_contour_outputs, + sequence_length=input_lengths, + dtype=tf.float32) + energy_contour_outputs = tf.concat( + energy_contour_outputs, axis=-1) + energy_contour_outputs = tf.layers.dense( + energy_contour_outputs, units=1) # [N, T_in, 1] + energy_contour_outputs = tf.squeeze( + energy_contour_outputs, axis=2) # [N, T_in] + + if is_training: + pitch_embeddings = tf.expand_dims( + pitch_contours, axis=2) # [N, T_in, 1] + pitch_embeddings = tf.layers.conv1d( + pitch_embeddings, + filters=hp.encoder_projection_units, + kernel_size=9, + padding='same', + name='pitch_embeddings') # [N, T_in, 32] + + energy_embeddings = tf.expand_dims( + energy_contours, axis=2) # [N, T_in, 1] + energy_embeddings = tf.layers.conv1d( + energy_embeddings, + filters=hp.encoder_projection_units, + kernel_size=9, + padding='same', + name='energy_embeddings') # [N, T_in, 32] + else: + pitch_contour_outputs *= pitch_scales + pitch_embeddings = tf.expand_dims( + pitch_contour_outputs, axis=2) # [N, T_in, 1] + pitch_embeddings = tf.layers.conv1d( + pitch_embeddings, + filters=hp.encoder_projection_units, + kernel_size=9, + padding='same', + name='pitch_embeddings') # [N, T_in, 32] + + energy_contour_outputs *= energy_scales + energy_embeddings = tf.expand_dims( + energy_contour_outputs, axis=2) # [N, T_in, 1] + energy_embeddings = tf.layers.conv1d( + energy_embeddings, + filters=hp.encoder_projection_units, + kernel_size=9, + padding='same', + name='energy_embeddings') # [N, T_in, 32] + + encoder_outputs_ = encoder_outputs + pitch_embeddings + energy_embeddings + + # duration + dur_inputs = tf.concat([ + encoder_outputs_, embedded_inputs_speaker, + embedded_inputs_emotion + ], 2) + if input_mask is not None: + dur_inputs = dur_inputs * tf.expand_dims(input_mask, -1) + with tf.variable_scope('Duration_Predictor'): + duration_predictor_cell = MultiRNNCell([ + LSTMBlockCell(hp.predictor_lstm_units), + LSTMBlockCell(hp.predictor_lstm_units) + ], state_is_tuple=True) # yapf:disable + duration_output_cell = DurPredictorCell( + duration_predictor_cell, is_training, 1, + hp.predictor_prenet_units) + duration_predictor_init_state = duration_output_cell.zero_state( + batch_size=batch_size, dtype=tf.float32) + if is_training: + duration_helper = VarTrainingHelper( + tf.expand_dims( + tf.log(tf.cast(durations, tf.float32) + 1), + axis=2), dur_inputs, 1) + else: + duration_helper = VarTestHelper(batch_size, dur_inputs, 1) + ( + duration_outputs, _ + ), final_duration_predictor_state, _ = tf.contrib.seq2seq.dynamic_decode( + BasicDecoder(duration_output_cell, duration_helper, + duration_predictor_init_state), + maximum_iterations=1000) + duration_outputs = tf.squeeze( + duration_outputs, axis=2) # [N, 
T_in] + if input_mask is not None: + duration_outputs = duration_outputs * input_mask + duration_outputs_ = tf.exp(duration_outputs) - 1 + + # Length Regulator + with tf.variable_scope('Length_Regulator'): + if is_training: + i = tf.constant(1) + # position embedding + j = tf.constant(1) + dur_len = tf.shape(durations)[-1] + embedded_position_i = tf.range(1, durations[0, 0] + 1) + + def condition_pos(j, e): + return tf.less(j, dur_len) + + def loop_body_pos(j, embedded_position_i): + embedded_position_i = tf.concat([ + embedded_position_i, + tf.range(1, durations[0, j] + 1) + ], axis=0) # yapf:disable + return [j + 1, embedded_position_i] + + j, embedded_position_i = tf.while_loop( + condition_pos, + loop_body_pos, [j, embedded_position_i], + shape_invariants=[ + j.get_shape(), + tf.TensorShape([None]) + ]) + embedded_position = tf.reshape(embedded_position_i, + (1, -1)) + + # others + LR_outputs = repeat( + encoder_outputs_[0:1, :, :], durations[0, :], axis=1) + embedded_outputs_speaker = repeat( + embedded_inputs_speaker[0:1, :, :], + durations[0, :], + axis=1) + embedded_outputs_emotion = repeat( + embedded_inputs_emotion[0:1, :, :], + durations[0, :], + axis=1) + + def condition(i, pos, layer, s, e): + return tf.less(i, tf.shape(mel_targets)[0]) + + def loop_body(i, embedded_position, LR_outputs, + embedded_outputs_speaker, + embedded_outputs_emotion): + # position embedding + jj = tf.constant(1) + embedded_position_i = tf.range(1, durations[i, 0] + 1) + + def condition_pos_i(j, e): + return tf.less(j, dur_len) + + def loop_body_pos_i(j, embedded_position_i): + embedded_position_i = tf.concat([ + embedded_position_i, + tf.range(1, durations[i, j] + 1) + ], axis=0) # yapf:disable + return [j + 1, embedded_position_i] + + jj, embedded_position_i = tf.while_loop( + condition_pos_i, + loop_body_pos_i, [jj, embedded_position_i], + shape_invariants=[ + jj.get_shape(), + tf.TensorShape([None]) + ]) + embedded_position = tf.concat([ + embedded_position, + tf.reshape(embedded_position_i, (1, -1)) + ], 0) + + # others + LR_outputs = tf.concat([ + LR_outputs, + repeat( + encoder_outputs_[i:i + 1, :, :], + durations[i, :], + axis=1) + ], 0) + embedded_outputs_speaker = tf.concat([ + embedded_outputs_speaker, + repeat( + embedded_inputs_speaker[i:i + 1, :, :], + durations[i, :], + axis=1) + ], 0) + embedded_outputs_emotion = tf.concat([ + embedded_outputs_emotion, + repeat( + embedded_inputs_emotion[i:i + 1, :, :], + durations[i, :], + axis=1) + ], 0) + return [ + i + 1, embedded_position, LR_outputs, + embedded_outputs_speaker, embedded_outputs_emotion + ] + + i, embedded_position, LR_outputs, + embedded_outputs_speaker, + embedded_outputs_emotion = tf.while_loop( + condition, + loop_body, [ + i, embedded_position, LR_outputs, + embedded_outputs_speaker, embedded_outputs_emotion + ], + shape_invariants=[ + i.get_shape(), + tf.TensorShape([None, None]), + tf.TensorShape([None, None, None]), + tf.TensorShape([None, None, None]), + tf.TensorShape([None, None, None]) + ], + parallel_iterations=hp.batch_size) + + ori_framenum = tf.shape(mel_targets)[1] + else: + # position + j = tf.constant(1) + dur_len = tf.shape(duration_outputs_)[-1] + embedded_position_i = tf.range( + 1, + tf.cast(tf.round(duration_outputs_)[0, 0], tf.int32) + + 1) + + def condition_pos(j, e): + return tf.less(j, dur_len) + + def loop_body_pos(j, embedded_position_i): + embedded_position_i = tf.concat([ + embedded_position_i, + tf.range( + 1, + tf.cast( + tf.round(duration_outputs_)[0, j], + tf.int32) + 1) + ], axis=0) # 
yapf:disable + return [j + 1, embedded_position_i] + + j, embedded_position_i = tf.while_loop( + condition_pos, + loop_body_pos, [j, embedded_position_i], + shape_invariants=[ + j.get_shape(), + tf.TensorShape([None]) + ]) + embedded_position = tf.reshape(embedded_position_i, + (1, -1)) + # others + duration_outputs_ *= duration_scales + LR_outputs = repeat( + encoder_outputs_[0:1, :, :], + tf.cast(tf.round(duration_outputs_)[0, :], tf.int32), + axis=1) + embedded_outputs_speaker = repeat( + embedded_inputs_speaker[0:1, :, :], + tf.cast(tf.round(duration_outputs_)[0, :], tf.int32), + axis=1) + embedded_outputs_emotion = repeat( + embedded_inputs_emotion[0:1, :, :], + tf.cast(tf.round(duration_outputs_)[0, :], tf.int32), + axis=1) + ori_framenum = tf.shape(LR_outputs)[1] + + left = hp.outputs_per_step - tf.mod( + ori_framenum, hp.outputs_per_step) + LR_outputs = tf.cond( + tf.equal(left, + hp.outputs_per_step), lambda: LR_outputs, + lambda: tf.pad(LR_outputs, [[0, 0], [0, left], [0, 0]], + 'CONSTANT')) + embedded_outputs_speaker = tf.cond( + tf.equal(left, hp.outputs_per_step), + lambda: embedded_outputs_speaker, lambda: tf.pad( + embedded_outputs_speaker, [[0, 0], [0, left], + [0, 0]], 'CONSTANT')) + embedded_outputs_emotion = tf.cond( + tf.equal(left, hp.outputs_per_step), + lambda: embedded_outputs_emotion, lambda: tf.pad( + embedded_outputs_emotion, [[0, 0], [0, left], + [0, 0]], 'CONSTANT')) + embedded_position = tf.cond( + tf.equal(left, hp.outputs_per_step), + lambda: embedded_position, + lambda: tf.pad(embedded_position, [[0, 0], [0, left]], + 'CONSTANT')) + + # Pos_Embedding + with tf.variable_scope('Position_Embedding'): + Pos_Embedding = BatchSinusodalPositionalEncoding() + position_embeddings = Pos_Embedding.positional_encoding( + batch_size, + tf.shape(LR_outputs)[1], hp.encoder_projection_units, + embedded_position) + LR_outputs += position_embeddings + + # multi-frame + LR_outputs = tf.reshape(LR_outputs, [ + batch_size, -1, + hp.outputs_per_step * hp.encoder_projection_units + ]) + embedded_outputs_speaker = tf.reshape( + embedded_outputs_speaker, + [batch_size, -1, hp.outputs_per_step * 32])[:, :, :32] + embedded_outputs_emotion = tf.reshape( + embedded_outputs_emotion, + [batch_size, -1, hp.outputs_per_step * 32])[:, :, :32] + # [N, T_out, D_LR_outputs] (D_LR_outputs = hp.outputs_per_step * hp.encoder_projection_units + 64) + LR_outputs = tf.concat([ + LR_outputs, embedded_outputs_speaker, embedded_outputs_emotion + ], -1) + + # auto bandwidth + if is_training: + durations_mask = tf.cast(durations, + tf.float32) * input_mask # [N, T_in] + else: + durations_mask = duration_outputs_ + X_band_width = tf.cast( + tf.round(tf.reduce_max(durations_mask) / hp.outputs_per_step), + tf.int32) + H_band_width = X_band_width + + with tf.variable_scope('Decoder'): + Decoder = SelfAttentionDecoder( + num_layers=hp.decoder_num_layers, + num_units=hp.decoder_num_units, + num_heads=hp.decoder_num_heads, + ffn_inner_dim=hp.decoder_ffn_inner_dim, + dropout=hp.decoder_dropout, + attention_dropout=hp.decoder_attention_dropout, + relu_dropout=hp.decoder_relu_dropout, + prenet_units=hp.prenet_units, + dense_units=hp.prenet_proj_units, + num_mels=hp.num_mels, + outputs_per_step=hp.outputs_per_step, + X_band_width=X_band_width, + H_band_width=H_band_width, + position_encoder=None) + if is_training: + if hp.free_run: + r = hp.outputs_per_step + init_decoder_input = tf.expand_dims( + tf.tile([[0.0]], [batch_size, hp.num_mels]), + axis=1) # [N, 1, hp.num_mels] + decoder_input_lengths = tf.cast( + 
output_lengths / r, tf.int32) + decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search( + init_decoder_input, + maximum_iterations=tf.shape(LR_outputs)[1], + mode=is_training, + memory=LR_outputs, + memory_sequence_length=decoder_input_lengths) + else: + r = hp.outputs_per_step + decoder_input = mel_targets[:, r - 1:: + r, :] # [N, T_out / r, hp.num_mels] + init_decoder_input = tf.expand_dims( + tf.tile([[0.0]], [batch_size, hp.num_mels]), + axis=1) # [N, 1, hp.num_mels] + decoder_input = tf.concat( + [init_decoder_input, decoder_input], + axis=1) # [N, T_out / r + 1, hp.num_mels] + decoder_input = decoder_input[:, : + -1, :] # [N, T_out / r, hp.num_mels] + decoder_input_lengths = tf.cast( + output_lengths / r, tf.int32) + decoder_outputs, attention_x, attention_h = Decoder.decode_from_inputs( + decoder_input, + decoder_input_lengths, + mode=is_training, + memory=LR_outputs, + memory_sequence_length=decoder_input_lengths) + else: + init_decoder_input = tf.expand_dims( + tf.tile([[0.0]], [batch_size, hp.num_mels]), + axis=1) # [N, 1, hp.num_mels] + decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search( + init_decoder_input, + maximum_iterations=tf.shape(LR_outputs)[1], + mode=is_training, + memory=LR_outputs, + memory_sequence_length=tf.expand_dims( + tf.shape(LR_outputs)[1], axis=0)) + + if is_training: + mel_outputs_ = tf.reshape(decoder_outputs, + [batch_size, -1, hp.num_mels]) + else: + mel_outputs_ = tf.reshape( + decoder_outputs, + [batch_size, -1, hp.num_mels])[:, :ori_framenum, :] + mel_outputs = mel_outputs_ + + with tf.variable_scope('Postnet'): + Postnet_FSMN = FsmnEncoderV2( + filter_size=hp.postnet_filter_size, + fsmn_num_layers=hp.postnet_fsmn_num_layers, + dnn_num_layers=hp.postnet_dnn_num_layers, + num_memory_units=hp.postnet_num_memory_units, + ffn_inner_dim=hp.postnet_ffn_inner_dim, + dropout=hp.postnet_dropout, + shift=hp.postnet_shift, + position_encoder=None) + if is_training: + postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode( + mel_outputs, + sequence_length=output_lengths, + mode=is_training) + hidden_lstm_outputs, _ = tf.nn.dynamic_rnn( + LSTMBlockCell(hp.postnet_lstm_units), + postnet_fsmn_outputs, + sequence_length=output_lengths, + dtype=tf.float32) + else: + postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode( + mel_outputs, + sequence_length=[tf.shape(mel_outputs_)[1]], + mode=is_training) + hidden_lstm_outputs, _ = tf.nn.dynamic_rnn( + LSTMBlockCell(hp.postnet_lstm_units), + postnet_fsmn_outputs, + sequence_length=[tf.shape(mel_outputs_)[1]], + dtype=tf.float32) + + mel_residual_outputs = tf.layers.dense( + hidden_lstm_outputs, units=hp.num_mels) + mel_outputs += mel_residual_outputs + + self.inputs = inputs + self.inputs_speaker = inputs_speaker + self.inputs_emotion = inputs_emotion + self.input_lengths = input_lengths + self.durations = durations + self.output_lengths = output_lengths + self.mel_outputs_ = mel_outputs_ + self.mel_outputs = mel_outputs + self.mel_targets = mel_targets + self.duration_outputs = duration_outputs + self.duration_outputs_ = duration_outputs_ + self.duration_scales = duration_scales + self.pitch_contour_outputs = pitch_contour_outputs + self.pitch_contours = pitch_contours + self.pitch_scales = pitch_scales + self.energy_contour_outputs = energy_contour_outputs + self.energy_contours = energy_contours + self.energy_scales = energy_scales + self.uv_masks_ = uv_masks + + self.embedded_inputs_emotion = embedded_inputs_emotion + self.embedding_fsmn_outputs = embedded_inputs + 
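+        # add_loss() below consumes several of these attributes (mel_outputs_,
+        # mel_outputs, duration_outputs, the pitch/energy contour outputs,
+        # uv_masks_ and attention_h); the encoder, length-regulator and postnet
+        # tensors stored alongside them expose the intermediate pipeline stages
+        # on the model object.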
self.encoder_outputs = encoder_outputs + self.encoder_outputs_ = encoder_outputs_ + self.LR_outputs = LR_outputs + self.postnet_fsmn_outputs = postnet_fsmn_outputs + + self.pitch_embeddings = pitch_embeddings + self.energy_embeddings = energy_embeddings + + self.attns = attns + self.attention_x = attention_x + self.attention_h = attention_h + self.X_band_width = X_band_width + self.H_band_width = H_band_width + + def add_loss(self): + '''Adds loss to the model. Sets "loss" field. initialize must have been called.''' + with tf.variable_scope('loss') as _: + hp = self._hparams + mask = tf.sequence_mask( + self.output_lengths, + tf.shape(self.mel_targets)[1], + dtype=tf.float32) + valid_outputs = tf.reduce_sum(mask) + + mask_input = tf.sequence_mask( + self.input_lengths, + tf.shape(self.durations)[1], + dtype=tf.float32) + valid_inputs = tf.reduce_sum(mask_input) + + # mel loss + if self.uv_masks_ is not None: + valid_outputs_mask = tf.reduce_sum( + tf.expand_dims(mask, -1) * self.uv_masks_) + self.mel_loss_ = tf.reduce_sum( + tf.abs(self.mel_targets - self.mel_outputs_) + * tf.expand_dims(mask, -1) * self.uv_masks_) / ( + valid_outputs_mask * hp.num_mels) + self.mel_loss = tf.reduce_sum( + tf.abs(self.mel_targets - self.mel_outputs) + * tf.expand_dims(mask, -1) * self.uv_masks_) / ( + valid_outputs_mask * hp.num_mels) + else: + self.mel_loss_ = tf.reduce_sum( + tf.abs(self.mel_targets - self.mel_outputs_) + * tf.expand_dims(mask, -1)) / ( + valid_outputs * hp.num_mels) + self.mel_loss = tf.reduce_sum( + tf.abs(self.mel_targets - self.mel_outputs) + * tf.expand_dims(mask, -1)) / ( + valid_outputs * hp.num_mels) + + # duration loss + self.duration_loss = tf.reduce_sum( + tf.abs( + tf.log(tf.cast(self.durations, tf.float32) + 1) + - self.duration_outputs) * mask_input) / valid_inputs + + # pitch contour loss + self.pitch_contour_loss = tf.reduce_sum( + tf.abs(self.pitch_contours - self.pitch_contour_outputs) + * mask_input) / valid_inputs + + # energy contour loss + self.energy_contour_loss = tf.reduce_sum( + tf.abs(self.energy_contours - self.energy_contour_outputs) + * mask_input) / valid_inputs + + # final loss + self.loss = self.mel_loss_ + self.mel_loss + self.duration_loss \ + + self.pitch_contour_loss + self.energy_contour_loss + + # guided attention loss + self.guided_attention_loss = tf.constant(0.0) + if hp.guided_attention: + i0 = tf.constant(0) + loss0 = tf.constant(0.0) + + def c(i, _): + return tf.less(i, tf.shape(mel_targets)[0]) + + def loop_body(i, loss): + decoder_input_lengths = tf.cast( + self.output_lengths / hp.outputs_per_step, tf.int32) + input_len = decoder_input_lengths[i] + output_len = decoder_input_lengths[i] + input_w = tf.expand_dims( + tf.range(tf.cast(input_len, dtype=tf.float32)), + axis=1) / tf.cast( + input_len, dtype=tf.float32) # [T_in, 1] + output_w = tf.expand_dims( + tf.range(tf.cast(output_len, dtype=tf.float32)), + axis=0) / tf.cast( + output_len, dtype=tf.float32) # [1, T_out] + guided_attention_w = 1.0 - tf.exp( + -(1 / hp.guided_attention_2g_squared) + * tf.square(input_w - output_w)) # [T_in, T_out] + guided_attention_w = tf.expand_dims( + guided_attention_w, axis=0) # [1, T_in, T_out] + # [hp.decoder_num_heads, T_in, T_out] + guided_attention_w = tf.tile(guided_attention_w, + [hp.decoder_num_heads, 1, 1]) + loss_i = tf.constant(0.0) + for j in range(hp.decoder_num_layers): + loss_i += tf.reduce_mean( + self.attention_h[j][i, :, :input_len, :output_len] + * guided_attention_w) + + return [tf.add(i, 1), tf.add(loss, loss_i)] + + _, loss = 
tf.while_loop( + c, + loop_body, + loop_vars=[i0, loss0], + parallel_iterations=hp.batch_size) + self.guided_attention_loss = loss / hp.batch_size + self.loss += hp.guided_attention_loss_weight * self.guided_attention_loss + + def add_optimizer(self, global_step): + '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called. + + Args: + global_step: int32 scalar Tensor representing current global step in training + ''' + with tf.variable_scope('optimizer') as _: + hp = self._hparams + if hp.decay_learning_rate: + self.learning_rate = _learning_rate_decay( + hp.initial_learning_rate, global_step) + else: + self.learning_rate = tf.convert_to_tensor( + hp.initial_learning_rate) + optimizer = tf.train.AdamOptimizer(self.learning_rate, + hp.adam_beta1, hp.adam_beta2) + gradients, variables = zip(*optimizer.compute_gradients(self.loss)) + self.gradients = gradients + clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0) + + # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See: + # https://github.com/tensorflow/tensorflow/issues/1122 + with tf.control_dependencies( + tf.get_collection(tf.GraphKeys.UPDATE_OPS)): + self.optimize = optimizer.apply_gradients( + zip(clipped_gradients, variables), global_step=global_step) + + +def _learning_rate_decay(init_lr, global_step): + # Noam scheme from tensor2tensor: + warmup_steps = 4000.0 + step = tf.cast(global_step + 1, dtype=tf.float32) + return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5, + step**-0.5) diff --git a/modelscope/models/audio/tts/am/models/self_attention_decoder.py b/modelscope/models/audio/tts/am/models/self_attention_decoder.py new file mode 100755 index 00000000..4e64342c --- /dev/null +++ b/modelscope/models/audio/tts/am/models/self_attention_decoder.py @@ -0,0 +1,817 @@ +"""Define self-attention decoder.""" + +import sys + +import tensorflow as tf + +from . import compat, transformer +from .modules import decoder_prenet +from .position import SinusoidalPositionEncoder + + +class SelfAttentionDecoder(): + """Decoder using self-attention as described in + https://arxiv.org/abs/1706.03762. + """ + + def __init__(self, + num_layers, + num_units=512, + num_heads=8, + ffn_inner_dim=2048, + dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + prenet_units=256, + dense_units=128, + num_mels=80, + outputs_per_step=3, + X_band_width=None, + H_band_width=None, + position_encoder=SinusoidalPositionEncoder(), + self_attention_type='scaled_dot'): + """Initializes the parameters of the decoder. + + Args: + num_layers: The number of layers. + num_units: The number of hidden units. + num_heads: The number of heads in the multi-head attention. + ffn_inner_dim: The number of units of the inner linear transformation + in the feed forward layer. + dropout: The probability to drop units from the outputs. + attention_dropout: The probability to drop units from the attention. + relu_dropout: The probability to drop units from the ReLU activation in + the feed forward layer. + position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to + apply on inputs or ``None``. + self_attention_type: Type of self attention, "scaled_dot" or "average" (case + insensitive). + + Raises: + ValueError: if :obj:`self_attention_type` is invalid. 
+ """ + super(SelfAttentionDecoder, self).__init__() + self.num_layers = num_layers + self.num_units = num_units + self.num_heads = num_heads + self.ffn_inner_dim = ffn_inner_dim + self.dropout = dropout + self.attention_dropout = attention_dropout + self.relu_dropout = relu_dropout + self.position_encoder = position_encoder + self.self_attention_type = self_attention_type.lower() + if self.self_attention_type not in ('scaled_dot', 'average'): + raise ValueError('invalid attention type %s' + % self.self_attention_type) + if self.self_attention_type == 'average': + tf.logging.warning( + 'Support for average attention network is experimental ' + 'and may change in future versions.') + self.prenet_units = prenet_units + self.dense_units = dense_units + self.num_mels = num_mels + self.outputs_per_step = outputs_per_step + self.X_band_width = X_band_width + self.H_band_width = H_band_width + + @property + def output_size(self): + """Returns the decoder output size.""" + return self.num_units + + @property + def support_alignment_history(self): + return True + + @property + def support_multi_source(self): + return True + + def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1): + cache = {} + + for layer in range(self.num_layers): + proj_cache_shape = [ + batch_size, self.num_heads, 0, self.num_units // self.num_heads + ] + layer_cache = {} + layer_cache['memory'] = [{ + 'memory_keys': + tf.zeros(proj_cache_shape, dtype=dtype), + 'memory_values': + tf.zeros(proj_cache_shape, dtype=dtype) + } for _ in range(num_sources)] + if self.self_attention_type == 'scaled_dot': + layer_cache['self_keys'] = tf.zeros( + proj_cache_shape, dtype=dtype) + layer_cache['self_values'] = tf.zeros( + proj_cache_shape, dtype=dtype) + elif self.self_attention_type == 'average': + layer_cache['prev_g'] = tf.zeros( + [batch_size, 1, self.num_units], dtype=dtype) + cache['layer_{}'.format(layer)] = layer_cache + + return cache + + def _init_attn(self, dtype=tf.float32): + attn = [] + for layer in range(self.num_layers): + attn.append(tf.TensorArray(tf.float32, size=0, dynamic_size=True)) + return attn + + def _self_attention_stack(self, + inputs, + sequence_length=None, + mode=True, + cache=None, + memory=None, + memory_sequence_length=None, + step=None): + + # [N, T_out, self.dense_units] or [N, 1, self.dense_units] + prenet_outputs = decoder_prenet(inputs, self.prenet_units, + self.dense_units, mode) + if step is None: + decoder_inputs = tf.concat( + [memory, prenet_outputs], + axis=-1) # [N, T_out, memory_size + self.dense_units] + else: + decoder_inputs = tf.concat( + [memory[:, step:step + 1, :], prenet_outputs], + axis=-1) # [N, 1, memory_size + self.dense_units] + decoder_inputs = tf.layers.dense( + decoder_inputs, units=self.dense_units) + + inputs = decoder_inputs + inputs *= self.num_units**0.5 + if self.position_encoder is not None: + inputs = self.position_encoder( + inputs, position=step + 1 if step is not None else None) + + inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) + + decoder_mask = None + memory_mask = None + # last_attention = None + + X_band_width_tmp = -1 + H_band_width_tmp = -1 + if self.X_band_width is not None: + X_band_width_tmp = tf.cast( + tf.cond( + tf.less(tf.shape(memory)[1], self.X_band_width), + lambda: -1, lambda: self.X_band_width), + dtype=tf.int64) + if self.H_band_width is not None: + H_band_width_tmp = tf.cast( + tf.cond( + tf.less(tf.shape(memory)[1], self.H_band_width), + lambda: -1, lambda: self.H_band_width), + dtype=tf.int64) + + if 
self.self_attention_type == 'scaled_dot': + if sequence_length is not None: + decoder_mask = transformer.build_future_mask( + sequence_length, + num_heads=self.num_heads, + maximum_length=tf.shape(inputs)[1], + band=X_band_width_tmp) # [N, 1, T_out, T_out] + elif self.self_attention_type == 'average': + if cache is None: + if sequence_length is None: + sequence_length = tf.fill([tf.shape(inputs)[0]], + tf.shape(inputs)[1]) + decoder_mask = transformer.cumulative_average_mask( + sequence_length, + maximum_length=tf.shape(inputs)[1], + dtype=inputs.dtype) + + if memory is not None and not tf.contrib.framework.nest.is_sequence( + memory): + memory = (memory, ) + if memory_sequence_length is not None: + if not tf.contrib.framework.nest.is_sequence( + memory_sequence_length): + memory_sequence_length = (memory_sequence_length, ) + if step is None: + memory_mask = [ + transformer.build_history_mask( + length, + num_heads=self.num_heads, + maximum_length=tf.shape(m)[1], + band=H_band_width_tmp) + for m, length in zip(memory, memory_sequence_length) + ] + else: + memory_mask = [ + transformer.build_history_mask( + length, + num_heads=self.num_heads, + maximum_length=tf.shape(m)[1], + band=H_band_width_tmp)[:, :, step:step + 1, :] + for m, length in zip(memory, memory_sequence_length) + ] + + # last_attention = None + attns_x = [] + attns_h = [] + for layer in range(self.num_layers): + layer_name = 'layer_{}'.format(layer) + layer_cache = cache[layer_name] if cache is not None else None + with tf.variable_scope(layer_name): + if memory is not None: + for i, (mem, mask) in enumerate(zip(memory, memory_mask)): + memory_cache = None + if layer_cache is not None: + memory_cache = layer_cache['memory'][i] + scope_name = 'multi_head_{}'.format(i) + if i == 0: + scope_name = 'multi_head' + with tf.variable_scope(scope_name): + encoded, attn_x, attn_h = transformer.multi_head_attention_PNCA( + self.num_heads, + transformer.norm(inputs), + mem, + mode, + num_units=self.num_units, + mask=decoder_mask, + mask_h=mask, + cache=layer_cache, + cache_h=memory_cache, + dropout=self.attention_dropout, + return_attention=True, + layer_name=layer_name, + X_band_width=self.X_band_width) + attns_x.append(attn_x) + attns_h.append(attn_h) + context = transformer.drop_and_add( + inputs, encoded, mode, dropout=self.dropout) + + with tf.variable_scope('ffn'): + transformed = transformer.feed_forward_ori( + transformer.norm(context), + self.ffn_inner_dim, + mode, + dropout=self.relu_dropout) + transformed = transformer.drop_and_add( + context, transformed, mode, dropout=self.dropout) + + inputs = transformed + + outputs = transformer.norm(inputs) + outputs = tf.layers.dense( + outputs, units=self.num_mels * self.outputs_per_step) + return outputs, attns_x, attns_h + + def decode_from_inputs(self, + inputs, + sequence_length, + initial_state=None, + mode=True, + memory=None, + memory_sequence_length=None): + outputs, attention_x, attention_h = self._self_attention_stack( + inputs, + sequence_length=sequence_length, + mode=mode, + memory=memory, + memory_sequence_length=memory_sequence_length) + return outputs, attention_x, attention_h + + def step_fn(self, + mode, + batch_size, + initial_state=None, + memory=None, + memory_sequence_length=None, + dtype=tf.float32): + if memory is None: + num_sources = 0 + elif tf.contrib.framework.nest.is_sequence(memory): + num_sources = len(memory) + else: + num_sources = 1 + cache = self._init_cache( + batch_size, dtype=dtype, num_sources=num_sources) + attention_x = 
self._init_attn(dtype=dtype) + attention_h = self._init_attn(dtype=dtype) + + def _fn(step, inputs, cache): + outputs, attention_x, attention_h = self._self_attention_stack( + inputs, + mode=mode, + cache=cache, + memory=memory, + memory_sequence_length=memory_sequence_length, + step=step) + attention_x_tmp = [] + for layer in range(len(attention_h)): + attention_x_tmp_l = tf.zeros_like(attention_h[layer]) + if self.X_band_width is not None: + pred = tf.less(step, self.X_band_width + 1) + attention_x_tmp_l_1 = tf.cond(pred, # yapf:disable + lambda: attention_x_tmp_l[:, :, :, :step + 1] + attention_x[layer], + lambda: tf.concat([ + attention_x_tmp_l[:, :, :, + :step - self.X_band_width], + attention_x_tmp_l[:, :, :, + step - self.X_band_width:step + 1] + + attention_x[layer]], + axis=-1)) # yapf:disable + attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:] + attention_x_tmp.append( + tf.concat([attention_x_tmp_l_1, attention_x_tmp_l_2], + axis=-1)) + else: + attention_x_tmp_l_1 = attention_x_tmp_l[:, :, :, :step + 1] + attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:] + attention_x_tmp.append( + tf.concat([ + attention_x_tmp_l_1 + attention_x[layer], + attention_x_tmp_l_2 + ], axis=-1)) # yapf:disable + attention_x = attention_x_tmp + return outputs, cache, attention_x, attention_h + + return _fn, cache, attention_x, attention_h + + def dynamic_decode_and_search(self, init_decoder_input, maximum_iterations, + mode, memory, memory_sequence_length): + batch_size = tf.shape(init_decoder_input)[0] + step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn( + mode, + batch_size, + memory=memory, + memory_sequence_length=memory_sequence_length) + + outputs, attention_x, attention_h, cache = self.dynamic_decode( + step_fn, + init_decoder_input, + init_cache=init_cache, + init_attn_x=init_attn_x, + init_attn_h=init_attn_h, + maximum_iterations=maximum_iterations, + batch_size=batch_size) + return outputs, attention_x, attention_h + + def dynamic_decode_and_search_teacher_forcing(self, decoder_input, + maximum_iterations, mode, + memory, + memory_sequence_length): + batch_size = tf.shape(decoder_input)[0] + step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn( + mode, + batch_size, + memory=memory, + memory_sequence_length=memory_sequence_length) + + outputs, attention_x, attention_h, cache = self.dynamic_decode_teacher_forcing( + step_fn, + decoder_input, + init_cache=init_cache, + init_attn_x=init_attn_x, + init_attn_h=init_attn_h, + maximum_iterations=maximum_iterations, + batch_size=batch_size) + return outputs, attention_x, attention_h + + def dynamic_decode(self, + step_fn, + init_decoder_input, + init_cache=None, + init_attn_x=None, + init_attn_h=None, + maximum_iterations=None, + batch_size=None): + + def _cond(step, cache, inputs, outputs, attention_x, attention_h): # pylint: disable=unused-argument + return tf.less(step, maximum_iterations) + + def _body(step, cache, inputs, outputs, attention_x, attention_h): + # output: [1, 1, num_mels * r] + # attn: [1, 1, T_out] + output, cache, attn_x, attn_h = step_fn( + step, inputs, cache) # outputs, cache, attention, attns + for layer in range(len(attention_x)): + attention_x[layer] = attention_x[layer].write( + step, tf.cast(attn_x[layer], tf.float32)) + + for layer in range(len(attention_h)): + attention_h[layer] = attention_h[layer].write( + step, tf.cast(attn_h[layer], tf.float32)) + + outputs = outputs.write(step, tf.cast(output, tf.float32)) + return step + 1, cache, output[:, :, -self. 
+ num_mels:], outputs, attention_x, attention_h + + step = tf.constant(0, dtype=tf.int32) + outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True) + + _, cache, _, outputs, attention_x, attention_h = tf.while_loop( + _cond, + _body, + loop_vars=(step, init_cache, init_decoder_input, outputs, + init_attn_x, init_attn_h), + shape_invariants=(step.shape, + compat.nest.map_structure( + self._get_shape_invariants, init_cache), + compat.nest.map_structure( + self._get_shape_invariants, + init_decoder_input), tf.TensorShape(None), + compat.nest.map_structure( + self._get_shape_invariants, init_attn_x), + compat.nest.map_structure( + self._get_shape_invariants, init_attn_h)), + parallel_iterations=1, + back_prop=False, + maximum_iterations=maximum_iterations) + # element of outputs: [N, 1, num_mels * r] + outputs_stack = outputs.stack() # [T_out, N, 1, num_mels * r] + outputs_stack = tf.transpose( + outputs_stack, perm=[2, 1, 0, 3]) # [1, N, T_out, num_mels * r] + outputs_stack = tf.squeeze( + outputs_stack, axis=0) # [N, T_out, num_mels * r] + + attention_x_stack = [] + for layer in range(len(attention_x)): + attention_x_stack_tmp = attention_x[layer].stack( + ) # [T_out, N, H, 1, T_out] + attention_x_stack_tmp = tf.transpose( + attention_x_stack_tmp, perm=[3, 1, 2, 0, + 4]) # [1, N, H, T_out, T_out] + attention_x_stack_tmp = tf.squeeze( + attention_x_stack_tmp, axis=0) # [N, H, T_out, T_out] + attention_x_stack.append(attention_x_stack_tmp) + + attention_h_stack = [] + for layer in range(len(attention_h)): + attention_h_stack_tmp = attention_h[layer].stack( + ) # [T_out, N, H, 1, T_out] + attention_h_stack_tmp = tf.transpose( + attention_h_stack_tmp, perm=[3, 1, 2, 0, + 4]) # [1, N, H, T_out, T_out] + attention_h_stack_tmp = tf.squeeze( + attention_h_stack_tmp, axis=0) # [N, H, T_out, T_out] + attention_h_stack.append(attention_h_stack_tmp) + + return outputs_stack, attention_x_stack, attention_h_stack, cache + + def dynamic_decode_teacher_forcing(self, + step_fn, + decoder_input, + init_cache=None, + init_attn_x=None, + init_attn_h=None, + maximum_iterations=None, + batch_size=None): + + def _cond(step, cache, inputs, outputs, attention_x, attention_h): # pylint: disable=unused-argument + return tf.less(step, maximum_iterations) + + def _body(step, cache, inputs, outputs, attention_x, attention_h): + # output: [1, 1, num_mels * r] + # attn: [1, 1, T_out] + output, cache, attn_x, attn_h = step_fn( + step, inputs[:, step:step + 1, :], + cache) # outputs, cache, attention, attns + for layer in range(len(attention_x)): + attention_x[layer] = attention_x[layer].write( + step, tf.cast(attn_x[layer], tf.float32)) + + for layer in range(len(attention_h)): + attention_h[layer] = attention_h[layer].write( + step, tf.cast(attn_h[layer], tf.float32)) + outputs = outputs.write(step, tf.cast(output, tf.float32)) + return step + 1, cache, inputs, outputs, attention_x, attention_h + + step = tf.constant(0, dtype=tf.int32) + outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True) + + _, cache, _, outputs, attention_x, attention_h = tf.while_loop( + _cond, + _body, + loop_vars=(step, init_cache, decoder_input, outputs, init_attn_x, + init_attn_h), + shape_invariants=(step.shape, + compat.nest.map_structure( + self._get_shape_invariants, + init_cache), decoder_input.shape, + tf.TensorShape(None), + compat.nest.map_structure( + self._get_shape_invariants, init_attn_x), + compat.nest.map_structure( + self._get_shape_invariants, init_attn_h)), + parallel_iterations=1, + back_prop=False, + 
maximum_iterations=maximum_iterations) + # element of outputs: [N, 1, num_mels * r] + outputs_stack = outputs.stack() # [T_out, N, 1, num_mels * r] + outputs_stack = tf.transpose( + outputs_stack, perm=[2, 1, 0, 3]) # [1, N, T_out, num_mels * r] + outputs_stack = tf.squeeze( + outputs_stack, axis=0) # [N, T_out, num_mels * r] + + attention_x_stack = [] + for layer in range(len(attention_x)): + attention_x_stack_tmp = attention_x[layer].stack( + ) # [T_out, N, H, 1, T_out] + attention_x_stack_tmp = tf.transpose( + attention_x_stack_tmp, perm=[3, 1, 2, 0, + 4]) # [1, N, H, T_out, T_out] + attention_x_stack_tmp = tf.squeeze( + attention_x_stack_tmp, axis=0) # [N, H, T_out, T_out] + attention_x_stack.append(attention_x_stack_tmp) + + attention_h_stack = [] + for layer in range(len(attention_h)): + attention_h_stack_tmp = attention_h[layer].stack( + ) # [T_out, N, H, 1, T_out] + attention_h_stack_tmp = tf.transpose( + attention_h_stack_tmp, perm=[3, 1, 2, 0, + 4]) # [1, N, H, T_out, T_out] + attention_h_stack_tmp = tf.squeeze( + attention_h_stack_tmp, axis=0) # [N, H, T_out, T_out] + attention_h_stack.append(attention_h_stack_tmp) + + return outputs_stack, attention_x_stack, attention_h_stack, cache + + def _get_shape_invariants(self, tensor): + """Returns the shape of the tensor but sets middle dims to None.""" + if isinstance(tensor, tf.TensorArray): + shape = None + else: + shape = tensor.shape.as_list() + for i in range(1, len(shape) - 1): + shape[i] = None + return tf.TensorShape(shape) + + +class SelfAttentionDecoderOri(): + """Decoder using self-attention as described in + https://arxiv.org/abs/1706.03762. + """ + + def __init__(self, + num_layers, + num_units=512, + num_heads=8, + ffn_inner_dim=2048, + dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + position_encoder=SinusoidalPositionEncoder(), + self_attention_type='scaled_dot'): + """Initializes the parameters of the decoder. + + Args: + num_layers: The number of layers. + num_units: The number of hidden units. + num_heads: The number of heads in the multi-head attention. + ffn_inner_dim: The number of units of the inner linear transformation + in the feed forward layer. + dropout: The probability to drop units from the outputs. + attention_dropout: The probability to drop units from the attention. + relu_dropout: The probability to drop units from the ReLU activation in + the feed forward layer. + position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to + apply on inputs or ``None``. + self_attention_type: Type of self attention, "scaled_dot" or "average" (case + insensitive). + + Raises: + ValueError: if :obj:`self_attention_type` is invalid. 
+ """ + super(SelfAttentionDecoderOri, self).__init__() + self.num_layers = num_layers + self.num_units = num_units + self.num_heads = num_heads + self.ffn_inner_dim = ffn_inner_dim + self.dropout = dropout + self.attention_dropout = attention_dropout + self.relu_dropout = relu_dropout + self.position_encoder = position_encoder + self.self_attention_type = self_attention_type.lower() + if self.self_attention_type not in ('scaled_dot', 'average'): + raise ValueError('invalid attention type %s' + % self.self_attention_type) + if self.self_attention_type == 'average': + tf.logging.warning( + 'Support for average attention network is experimental ' + 'and may change in future versions.') + + @property + def output_size(self): + """Returns the decoder output size.""" + return self.num_units + + @property + def support_alignment_history(self): + return True + + @property + def support_multi_source(self): + return True + + def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1): + cache = {} + + for layer in range(self.num_layers): + proj_cache_shape = [ + batch_size, self.num_heads, 0, self.num_units // self.num_heads + ] + layer_cache = {} + layer_cache['memory'] = [{ + 'memory_keys': + tf.zeros(proj_cache_shape, dtype=dtype), + 'memory_values': + tf.zeros(proj_cache_shape, dtype=dtype) + } for _ in range(num_sources)] + if self.self_attention_type == 'scaled_dot': + layer_cache['self_keys'] = tf.zeros( + proj_cache_shape, dtype=dtype) + layer_cache['self_values'] = tf.zeros( + proj_cache_shape, dtype=dtype) + elif self.self_attention_type == 'average': + layer_cache['prev_g'] = tf.zeros( + [batch_size, 1, self.num_units], dtype=dtype) + cache['layer_{}'.format(layer)] = layer_cache + + return cache + + def _self_attention_stack(self, + inputs, + sequence_length=None, + mode=True, + cache=None, + memory=None, + memory_sequence_length=None, + step=None): + inputs *= self.num_units**0.5 + if self.position_encoder is not None: + inputs = self.position_encoder( + inputs, position=step + 1 if step is not None else None) + + inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) + + decoder_mask = None + memory_mask = None + last_attention = None + + if self.self_attention_type == 'scaled_dot': + if sequence_length is not None: + decoder_mask = transformer.build_future_mask( + sequence_length, + num_heads=self.num_heads, + maximum_length=tf.shape(inputs)[1]) + elif self.self_attention_type == 'average': + if cache is None: + if sequence_length is None: + sequence_length = tf.fill([tf.shape(inputs)[0]], + tf.shape(inputs)[1]) + decoder_mask = transformer.cumulative_average_mask( + sequence_length, + maximum_length=tf.shape(inputs)[1], + dtype=inputs.dtype) + + if memory is not None and not tf.contrib.framework.nest.is_sequence( + memory): + memory = (memory, ) + if memory_sequence_length is not None: + if not tf.contrib.framework.nest.is_sequence( + memory_sequence_length): + memory_sequence_length = (memory_sequence_length, ) + memory_mask = [ + transformer.build_sequence_mask( + length, + num_heads=self.num_heads, + maximum_length=tf.shape(m)[1]) + for m, length in zip(memory, memory_sequence_length) + ] + + for layer in range(self.num_layers): + layer_name = 'layer_{}'.format(layer) + layer_cache = cache[layer_name] if cache is not None else None + with tf.variable_scope(layer_name): + if self.self_attention_type == 'scaled_dot': + with tf.variable_scope('masked_multi_head'): + encoded = transformer.multi_head_attention( + self.num_heads, + transformer.norm(inputs), + None, 
+ mode, + num_units=self.num_units, + mask=decoder_mask, + cache=layer_cache, + dropout=self.attention_dropout) + last_context = transformer.drop_and_add( + inputs, encoded, mode, dropout=self.dropout) + elif self.self_attention_type == 'average': + with tf.variable_scope('average_attention'): + # Cumulative average. + x = transformer.norm(inputs) + y = transformer.cumulative_average( + x, + decoder_mask if cache is None else step, + cache=layer_cache) + # FFN. + y = transformer.feed_forward( + y, + self.ffn_inner_dim, + mode, + dropout=self.relu_dropout) + # Gating layer. + z = tf.layers.dense( + tf.concat([x, y], -1), self.num_units * 2) + i, f = tf.split(z, 2, axis=-1) + y = tf.sigmoid(i) * x + tf.sigmoid(f) * y + last_context = transformer.drop_and_add( + inputs, y, mode, dropout=self.dropout) + + if memory is not None: + for i, (mem, mask) in enumerate(zip(memory, memory_mask)): + memory_cache = layer_cache['memory'][i] if layer_cache is not None else None # yapf:disable + with tf.variable_scope('multi_head' if i + == 0 else 'multi_head_%d' % i): # yapf:disable + context, last_attention = transformer.multi_head_attention( + self.num_heads, + transformer.norm(last_context), + mem, + mode, + mask=mask, + cache=memory_cache, + dropout=self.attention_dropout, + return_attention=True) + last_context = transformer.drop_and_add( + last_context, + context, + mode, + dropout=self.dropout) + if i > 0: # Do not return attention in case of multi source. + last_attention = None + + with tf.variable_scope('ffn'): + transformed = transformer.feed_forward_ori( + transformer.norm(last_context), + self.ffn_inner_dim, + mode, + dropout=self.relu_dropout) + transformed = transformer.drop_and_add( + last_context, transformed, mode, dropout=self.dropout) + + inputs = transformed + + if last_attention is not None: + # The first head of the last layer is returned. + first_head_attention = last_attention[:, 0] + else: + first_head_attention = None + + outputs = transformer.norm(inputs) + return outputs, first_head_attention + + def decode_from_inputs(self, + inputs, + sequence_length, + initial_state=None, + mode=True, + memory=None, + memory_sequence_length=None): + outputs, attention = self._self_attention_stack( + inputs, + sequence_length=sequence_length, + mode=mode, + memory=memory, + memory_sequence_length=memory_sequence_length) + return outputs, None, attention + + def step_fn(self, + mode, + batch_size, + initial_state=None, + memory=None, + memory_sequence_length=None, + dtype=tf.float32): + if memory is None: + num_sources = 0 + elif tf.contrib.framework.nest.is_sequence(memory): + num_sources = len(memory) + else: + num_sources = 1 + cache = self._init_cache( + batch_size, dtype=dtype, num_sources=num_sources) + + def _fn(step, inputs, cache, mode): + inputs = tf.expand_dims(inputs, 1) + outputs, attention = self._self_attention_stack( + inputs, + mode=mode, + cache=cache, + memory=memory, + memory_sequence_length=memory_sequence_length, + step=step) + outputs = tf.squeeze(outputs, axis=1) + if attention is not None: + attention = tf.squeeze(attention, axis=1) + return outputs, cache, attention + + return _fn, cache diff --git a/modelscope/models/audio/tts/am/models/self_attention_encoder.py b/modelscope/models/audio/tts/am/models/self_attention_encoder.py new file mode 100755 index 00000000..ce4193dc --- /dev/null +++ b/modelscope/models/audio/tts/am/models/self_attention_encoder.py @@ -0,0 +1,182 @@ +"""Define the self-attention encoder.""" + +import tensorflow as tf + +from . 
import transformer +from .position import SinusoidalPositionEncoder + + +class SelfAttentionEncoder(): + """Encoder using self-attention as described in + https://arxiv.org/abs/1706.03762. + """ + + def __init__(self, + num_layers, + num_units=512, + num_heads=8, + ffn_inner_dim=2048, + dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + position_encoder=SinusoidalPositionEncoder()): + """Initializes the parameters of the encoder. + + Args: + num_layers: The number of layers. + num_units: The number of hidden units. + num_heads: The number of heads in the multi-head attention. + ffn_inner_dim: The number of units of the inner linear transformation + in the feed forward layer. + dropout: The probability to drop units from the outputs. + attention_dropout: The probability to drop units from the attention. + relu_dropout: The probability to drop units from the ReLU activation in + the feed forward layer. + position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to + apply on inputs or ``None``. + """ + super(SelfAttentionEncoder, self).__init__() + self.num_layers = num_layers + self.num_units = num_units + self.num_heads = num_heads + self.ffn_inner_dim = ffn_inner_dim + self.dropout = dropout + self.attention_dropout = attention_dropout + self.relu_dropout = relu_dropout + self.position_encoder = position_encoder + + def encode(self, inputs, sequence_length=None, mode=True): + inputs *= self.num_units**0.5 + if self.position_encoder is not None: + inputs = self.position_encoder(inputs) + + inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) + mask = transformer.build_sequence_mask( + sequence_length, + num_heads=self.num_heads, + maximum_length=tf.shape(inputs)[1]) + + mask_FF = tf.squeeze( + transformer.build_sequence_mask( + sequence_length, maximum_length=tf.shape(inputs)[1]), + axis=1) + + state = () + + attns = [] + for layer in range(self.num_layers): + with tf.variable_scope('layer_{}'.format(layer)): + with tf.variable_scope('multi_head'): + context, attn = transformer.multi_head_attention( + self.num_heads, + transformer.norm(inputs), + None, + mode, + num_units=self.num_units, + mask=mask, + dropout=self.attention_dropout, + return_attention=True) + attns.append(attn) + context = transformer.drop_and_add( + inputs, context, mode, dropout=self.dropout) + + with tf.variable_scope('ffn'): + transformed = transformer.feed_forward( + transformer.norm(context), + self.ffn_inner_dim, + mode, + dropout=self.relu_dropout, + mask=mask_FF) + transformed = transformer.drop_and_add( + context, transformed, mode, dropout=self.dropout) + + inputs = transformed + state += (tf.reduce_mean(inputs, axis=1), ) + + outputs = transformer.norm(inputs) + return (outputs, state, sequence_length, attns) + + +class SelfAttentionEncoderOri(): + """Encoder using self-attention as described in + https://arxiv.org/abs/1706.03762. + """ + + def __init__(self, + num_layers, + num_units=512, + num_heads=8, + ffn_inner_dim=2048, + dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + position_encoder=SinusoidalPositionEncoder()): + """Initializes the parameters of the encoder. + + Args: + num_layers: The number of layers. + num_units: The number of hidden units. + num_heads: The number of heads in the multi-head attention. + ffn_inner_dim: The number of units of the inner linear transformation + in the feed forward layer. + dropout: The probability to drop units from the outputs. + attention_dropout: The probability to drop units from the attention. 
+ relu_dropout: The probability to drop units from the ReLU activation in + the feed forward layer. + position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to + apply on inputs or ``None``. + """ + super(SelfAttentionEncoderOri, self).__init__() + self.num_layers = num_layers + self.num_units = num_units + self.num_heads = num_heads + self.ffn_inner_dim = ffn_inner_dim + self.dropout = dropout + self.attention_dropout = attention_dropout + self.relu_dropout = relu_dropout + self.position_encoder = position_encoder + + def encode(self, inputs, sequence_length=None, mode=True): + inputs *= self.num_units**0.5 + if self.position_encoder is not None: + inputs = self.position_encoder(inputs) + + inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) + mask = transformer.build_sequence_mask( + sequence_length, + num_heads=self.num_heads, + maximum_length=tf.shape(inputs)[1]) # [N, 1, 1, T_out] + + state = () + + attns = [] + for layer in range(self.num_layers): + with tf.variable_scope('layer_{}'.format(layer)): + with tf.variable_scope('multi_head'): + context, attn = transformer.multi_head_attention( + self.num_heads, + transformer.norm(inputs), + None, + mode, + num_units=self.num_units, + mask=mask, + dropout=self.attention_dropout, + return_attention=True) + attns.append(attn) + context = transformer.drop_and_add( + inputs, context, mode, dropout=self.dropout) + + with tf.variable_scope('ffn'): + transformed = transformer.feed_forward_ori( + transformer.norm(context), + self.ffn_inner_dim, + mode, + dropout=self.relu_dropout) + transformed = transformer.drop_and_add( + context, transformed, mode, dropout=self.dropout) + + inputs = transformed + state += (tf.reduce_mean(inputs, axis=1), ) + + outputs = transformer.norm(inputs) + return (outputs, state, sequence_length, attns) diff --git a/modelscope/models/audio/tts/am/models/transformer.py b/modelscope/models/audio/tts/am/models/transformer.py new file mode 100755 index 00000000..a9f0bedc --- /dev/null +++ b/modelscope/models/audio/tts/am/models/transformer.py @@ -0,0 +1,1157 @@ +"""Define layers related to the Google's Transformer model.""" + +import tensorflow as tf + +from . import compat, fsmn + + +def tile_sequence_length(sequence_length, num_heads): + """Tiles lengths :obj:`num_heads` times. + + Args: + sequence_length: The sequence length. + num_heads: The number of heads. + + Returns: + A ``tf.Tensor`` where each length is replicated :obj:`num_heads` times. + """ + sequence_length = tf.tile(sequence_length, [num_heads]) + sequence_length = tf.reshape(sequence_length, [num_heads, -1]) + sequence_length = tf.transpose(sequence_length, perm=[1, 0]) + sequence_length = tf.reshape(sequence_length, [-1]) + return sequence_length + + +def build_sequence_mask(sequence_length, + num_heads=None, + maximum_length=None, + dtype=tf.float32): + """Builds the dot product mask. + + Args: + sequence_length: The sequence length. + num_heads: The number of heads. + maximum_length: Optional size of the returned time dimension. Otherwise + it is the maximum of :obj:`sequence_length`. + dtype: The type of the mask tensor. + + Returns: + A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape + ``[batch_size, 1, 1, max_length]``. 
+ """ + mask = tf.sequence_mask( + sequence_length, maxlen=maximum_length, dtype=dtype) + mask = tf.expand_dims(mask, axis=1) + if num_heads is not None: + mask = tf.expand_dims(mask, axis=1) + return mask + + +def build_sequence_mask_window(sequence_length, + left_window_size=-1, + right_window_size=-1, + num_heads=None, + maximum_length=None, + dtype=tf.float32): + """Builds the dot product mask. + + Args: + sequence_length: The sequence length. + num_heads: The number of heads. + maximum_length: Optional size of the returned time dimension. Otherwise + it is the maximum of :obj:`sequence_length`. + dtype: The type of the mask tensor. + + Returns: + A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape + ``[batch_size, 1, 1, max_length]``. + """ + sequence_mask = tf.sequence_mask( + sequence_length, maxlen=maximum_length, dtype=dtype) + mask = _window_mask( + sequence_length, + left_window_size=left_window_size, + right_window_size=right_window_size, + maximum_length=maximum_length, + dtype=dtype) + mask *= tf.expand_dims(sequence_mask, axis=1) + if num_heads is not None: + mask = tf.expand_dims(mask, axis=1) + return mask + + +def _lower_triangle_mask(sequence_length, + maximum_length=None, + dtype=tf.float32, + band=-1): + batch_size = tf.shape(sequence_length)[0] + if maximum_length is None: + maximum_length = tf.reduce_max(sequence_length) + mask = tf.ones([batch_size, maximum_length, maximum_length], dtype=dtype) + mask = compat.tf_compat( + v2='linalg.band_part', v1='matrix_band_part')(mask, band, 0) + return mask + + +def _higher_triangle_mask(sequence_length, + maximum_length=None, + dtype=tf.float32, + band=-1): + batch_size = tf.shape(sequence_length)[0] + if maximum_length is None: + maximum_length = tf.reduce_max(sequence_length) + mask = tf.ones([batch_size, maximum_length, maximum_length], dtype=dtype) + mask = compat.tf_compat( + v2='linalg.band_part', v1='matrix_band_part')(mask, 0, band) + return mask + + +def _window_mask(sequence_length, + left_window_size=-1, + right_window_size=-1, + maximum_length=None, + dtype=tf.float32): + batch_size = tf.shape(sequence_length)[0] + if maximum_length is None: + maximum_length = tf.reduce_max(sequence_length) + mask = tf.ones([batch_size, maximum_length, maximum_length], dtype=dtype) + left_window_size = tf.minimum( + tf.cast(left_window_size, tf.int64), + tf.cast(maximum_length - 1, tf.int64)) + right_window_size = tf.minimum( + tf.cast(right_window_size, tf.int64), + tf.cast(maximum_length - 1, tf.int64)) + mask = tf.matrix_band_part(mask, left_window_size, right_window_size) + return mask + + +def build_future_mask(sequence_length, + num_heads=None, + maximum_length=None, + dtype=tf.float32, + band=-1): + """Builds the dot product mask for future positions. + + Args: + sequence_length: The sequence length. + num_heads: The number of heads. + maximum_length: Optional size of the returned time dimension. Otherwise + it is the maximum of :obj:`sequence_length`. + dtype: The type of the mask tensor. + + Returns: + A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape + ``[batch_size, 1, max_length, max_length]``. 
+ """ + sequence_mask = tf.sequence_mask( + sequence_length, maxlen=maximum_length, dtype=dtype) + mask = _lower_triangle_mask( + sequence_length, maximum_length=maximum_length, dtype=dtype, band=band) + mask *= tf.expand_dims(sequence_mask, axis=1) + if num_heads is not None: + mask = tf.expand_dims(mask, axis=1) + return mask + + +def build_history_mask(sequence_length, + num_heads=None, + maximum_length=None, + dtype=tf.float32, + band=-1): + """Builds the dot product mask for future positions. + + Args: + sequence_length: The sequence length. + num_heads: The number of heads. + maximum_length: Optional size of the returned time dimension. Otherwise + it is the maximum of :obj:`sequence_length`. + dtype: The type of the mask tensor. + + Returns: + A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape + ``[batch_size, 1, max_length, max_length]``. + """ + sequence_mask = tf.sequence_mask( + sequence_length, maxlen=maximum_length, dtype=dtype) + mask = _higher_triangle_mask( + sequence_length, maximum_length=maximum_length, dtype=dtype, band=band) + mask *= tf.expand_dims(sequence_mask, axis=1) + if num_heads is not None: + mask = tf.expand_dims(mask, axis=1) + return mask + + +def cumulative_average_mask(sequence_length, + maximum_length=None, + dtype=tf.float32): + """Builds the mask to compute the cumulative average as described in + https://arxiv.org/abs/1805.00631. + + Args: + sequence_length: The sequence length. + maximum_length: Optional size of the returned time dimension. Otherwise + it is the maximum of :obj:`sequence_length`. + dtype: The type of the mask tensor. + + Returns: + A ``tf.Tensor`` of type :obj:`dtype` and shape + ``[batch_size, max_length, max_length]``. + """ + sequence_mask = tf.sequence_mask( + sequence_length, maxlen=maximum_length, dtype=dtype) + mask = _lower_triangle_mask( + sequence_length, maximum_length=maximum_length, dtype=dtype) + mask *= tf.expand_dims(sequence_mask, axis=2) + weight = tf.range(1, tf.cast(tf.shape(mask)[1] + 1, dtype), dtype=dtype) + mask /= tf.expand_dims(weight, 1) + return mask + + +def cumulative_average(inputs, mask_or_step, cache=None): + """Computes the cumulative average as described in + https://arxiv.org/abs/1805.00631. + + Args: + inputs: The sequence to average. A tensor of shape :math:`[B, T, D]`. + mask_or_step: If :obj:`cache` is set, this is assumed to be the current step + of the dynamic decoding. Otherwise, it is the mask matrix used to compute + the cumulative average. + cache: A dictionnary containing the cumulative average of the previous step. + + Returns: + The cumulative average, a tensor of the same shape and type as :obj:`inputs`. + """ + if cache is not None: + step = tf.cast(mask_or_step, inputs.dtype) + aa = (inputs + step * cache['prev_g']) / (step + 1.0) + cache['prev_g'] = aa + return aa + else: + mask = mask_or_step + return tf.matmul(mask, inputs) + + +def fused_projection(inputs, num_units, num_outputs=1): + """Projects the same input into multiple output spaces. + + Args: + inputs: The inputs to project. + num_units: The number of output units of each space. + num_outputs: The number of output spaces. + + Returns: + :obj:`num_outputs` ``tf.Tensor`` of depth :obj:`num_units`. + """ + return tf.split( + tf.layers.conv1d(inputs, num_units * num_outputs, 1), + num_outputs, + axis=2) + + +def split_heads(inputs, num_heads): + """Splits a tensor in depth. + + Args: + inputs: A ``tf.Tensor`` of shape :math:`[B, T, D]`. + num_heads: The number of heads :math:`H`. 
+ + Returns: + A ``tf.Tensor`` of shape :math:`[B, H, T, D / H]`. + """ + static_shape = inputs.get_shape().as_list() + depth = static_shape[-1] + outputs = tf.reshape(inputs, [ + tf.shape(inputs)[0], + tf.shape(inputs)[1], num_heads, depth // num_heads + ]) + outputs = tf.transpose(outputs, perm=[0, 2, 1, 3]) + return outputs + + +def combine_heads(inputs): + """Concatenates heads. + + Args: + inputs: A ``tf.Tensor`` of shape :math:`[B, H, T, D]`. + + Returns: + A ``tf.Tensor`` of shape :math:`[B, T, D * H]`. + """ + static_shape = inputs.get_shape().as_list() + depth = static_shape[-1] + num_heads = static_shape[1] + outputs = tf.transpose(inputs, perm=[0, 2, 1, 3]) + outputs = tf.reshape( + outputs, + [tf.shape(outputs)[0], + tf.shape(outputs)[1], depth * num_heads]) + return outputs + + +def dot_product_attention(queries, keys, values, mode, mask=None, dropout=0.0): + """Computes the dot product attention. + + Args: + queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. + keys: The sequence use to calculate attention scores. A tensor of shape + :math:`[B, T_2, ...]`. + values: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. + mode: A ``tf.estimator.ModeKeys`` mode. + mask: A ``tf.Tensor`` applied to the dot product. + dropout: The probability to drop units from the inputs. + + Returns: + A tuple ``(context vector, attention vector)``. + """ + dot = tf.matmul(queries, keys, transpose_b=True) + + if mask is not None: + dot = tf.cast( + tf.cast(dot, tf.float32) * mask + ((1.0 - mask) * tf.float32.min), + dot.dtype) + + softmax = tf.nn.softmax(tf.cast(dot, tf.float32)) + attn = tf.cast(softmax, dot.dtype) + drop_attn = tf.layers.dropout(attn, rate=dropout, training=mode) + + context = tf.matmul(drop_attn, values) + + return context, attn + + +def dot_product_attention_wpa(num_heads, + queries, + keys, + values, + mode, + attention_left_window=-1, + attention_right_window=0, + mask=None, + max_id_cache=None, + mono=False, + peak_delay=-1, + dropout=0.0): + """ + Computes the dot product attention. + Args: + queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. + keys: The sequence use to calculate attention scores. A tensor of shape + :math:`[B, T_2, ...]`. + values: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. + mode: A ``tf.estimator.ModeKeys`` mode. + mask: A ``tf.Tensor`` applied to the dot product. + dropout: The probability to drop units from the inputs. + + Returns: + A tuple ``(context vector, attention vector)``. + """ + # Dot product between queries and keys. 
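+    # After the dot product below, the scores are further restricted to a
+    # window around each query's attention peak:
+    #   1. the scores are masked as in plain dot-product attention;
+    #   2. the per-query peak is taken with argmax over the key axis;
+    #   3. if peak_delay > 0 and a cache is present, the new peak may not lie
+    #      more than peak_delay frames past the previous step's peak; with
+    #      mono=True the peaks are constrained to move forward monotonically;
+    #   4. only [max_id - attention_left_window, max_id + attention_right_window]
+    #      is kept, everything else is pushed to tf.float32.min before the
+    #      softmax and the final context matmul.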
+ dot = tf.matmul(queries, keys, transpose_b=True) + depth = tf.shape(dot)[-1] + if mask is not None: + dot = tf.cast( + tf.cast(dot, tf.float32) * mask + ((1.0 - mask) * tf.float32.min), + dot.dtype) + # wpa + max_id = tf.math.argmax(input=dot, axis=-1) + # peak delay + if peak_delay > 0: + if max_id_cache is not None: + M = tf.cast(max_id_cache['pre_max_id'], dtype=max_id.dtype) + inputs_len = tf.math.minimum( + M + peak_delay, tf.cast(depth - 1, dtype=max_id.dtype)) + delay_mask = tf.sequence_mask( + inputs_len, maxlen=depth, dtype=tf.float32) + dot = tf.cast( + tf.cast(dot, tf.float32) * delay_mask + + ((1.0 - delay_mask) * tf.float32.min), dot.dtype) # yapf:disable + max_id = tf.math.argmax(input=dot, axis=-1) + # mono + if mono: + if max_id_cache is None: + d = tf.shape(max_id)[-1] + tmp_max_id = tf.reshape(max_id, [-1, num_heads, d]) + tmp_max_id = tf.slice( + tmp_max_id, [0, 0, 0], + [tf.shape(tmp_max_id)[0], + tf.shape(tmp_max_id)[1], d - 1]) + zeros = tf.zeros( + shape=(tf.shape(tmp_max_id)[0], tf.shape(tmp_max_id)[1], 1), + dtype=max_id.dtype) + tmp_max_id = tf.concat([zeros, tmp_max_id], axis=-1) + mask1 = tf.sequence_mask( + tmp_max_id, maxlen=depth, dtype=tf.float32) + dot = tf.cast( + tf.cast(dot, tf.float32) + * (1.0 - mask1) + mask1 * tf.float32.min, dot.dtype) # yapf:disable + max_id = tf.math.argmax(input=dot, axis=-1) + else: + # eval + tmp_max_id = tf.reshape(max_id, [-1, num_heads, 1]) + max_id_cache['pre_max_id'] = tmp_max_id + # right_mask + right_offset = tf.constant(attention_right_window, dtype=max_id.dtype) + right_len = tf.math.minimum(max_id + right_offset, + tf.cast(depth - 1, dtype=max_id.dtype)) + right_mask = tf.sequence_mask(right_len, maxlen=depth, dtype=tf.float32) + dot = tf.cast( + tf.cast(dot, tf.float32) * right_mask + + ((1.0 - right_mask) * tf.float32.min), dot.dtype) # yapf:disable + # left_mask + if attention_left_window > 0: + left_offset = tf.constant(attention_left_window, dtype=max_id.dtype) + left_len = tf.math.maximum(max_id - left_offset, + tf.cast(0, dtype=max_id.dtype)) + left_mask = tf.sequence_mask(left_len, maxlen=depth, dtype=tf.float32) + dot = tf.cast( + tf.cast(dot, tf.float32) * (1.0 - left_mask) + + (left_mask * tf.float32.min), dot.dtype) # yapf:disable + # Compute attention weights. + attn = tf.cast(tf.nn.softmax(tf.cast(dot, tf.float32)), dot.dtype) + drop_attn = tf.layers.dropout(attn, rate=dropout, training=mode) + + # Compute attention context. + context = tf.matmul(drop_attn, values) + + return context, attn + + +def multi_head_attention(num_heads, + queries, + memory, + mode, + num_units=None, + mask=None, + cache=None, + dropout=0.0, + return_attention=False): + """Computes the multi-head attention as described in + https://arxiv.org/abs/1706.03762. + + Args: + num_heads: The number of attention heads. + queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. + memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. + If ``None``, computes self-attention. + mode: A ``tf.estimator.ModeKeys`` mode. + num_units: The number of hidden units. If not set, it is set to the input + dimension. + mask: A ``tf.Tensor`` applied to the dot product. + cache: A dictionary containing pre-projected keys and values. + dropout: The probability to drop units from the inputs. + return_attention: Return the attention head probabilities in addition to the + context. + + Returns: + The concatenated attention context of each head and the attention + probabilities (if :obj:`return_attention` is set). 
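+
+    Example (an illustrative sketch only; assumes ``queries`` has shape
+    ``[batch, time, depth]`` and ``mode`` is a ``tf.estimator.ModeKeys``
+    value)::
+
+        # Self-attention with 8 heads; pass a memory tensor instead of None
+        # for encoder-decoder attention, and a mask via ``mask=`` if needed.
+        context = multi_head_attention(8, queries, None, mode, dropout=0.1)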
+ """ + num_units = num_units or queries.get_shape().as_list()[-1] + + if num_units % num_heads != 0: + raise ValueError('Multi head attention requires that num_units is a' + ' multiple of {}'.format(num_heads)) + + if memory is None: + queries, keys, values = fused_projection( + queries, num_units, num_outputs=3) + + keys = split_heads(keys, num_heads) + values = split_heads(values, num_heads) + + if cache is not None: + keys = tf.concat([cache['self_keys'], keys], axis=2) + values = tf.concat([cache['self_values'], values], axis=2) + cache['self_keys'] = keys + cache['self_values'] = values + else: + queries = tf.layers.conv1d(queries, num_units, 1) + + if cache is not None: + + def _project_and_split(): + k, v = fused_projection(memory, num_units, num_outputs=2) + return split_heads(k, num_heads), split_heads(v, num_heads) + + keys, values = tf.cond( + tf.equal(tf.shape(cache['memory_keys'])[2], 0), + true_fn=_project_and_split, + false_fn=lambda: + (cache['memory_keys'], cache['memory_values'])) + cache['memory_keys'] = keys + cache['memory_values'] = values + else: + keys, values = fused_projection(memory, num_units, num_outputs=2) + keys = split_heads(keys, num_heads) + values = split_heads(values, num_heads) + + queries = split_heads(queries, num_heads) + queries *= (num_units // num_heads)**-0.5 + + heads, attn = dot_product_attention( + queries, keys, values, mode, mask=mask, dropout=dropout) + + # Concatenate all heads output. + combined = combine_heads(heads) + outputs = tf.layers.conv1d(combined, num_units, 1) + + if not return_attention: + return outputs + return outputs, attn + + +def multi_head_attention_PNCA(num_heads, + queries, + memory, + mode, + num_units=None, + mask=None, + mask_h=None, + cache=None, + cache_h=None, + dropout=0.0, + return_attention=False, + X_band_width=None, + layer_name='multi_head'): + """Computes the multi-head attention as described in + https://arxiv.org/abs/1706.03762. + + Args: + num_heads: The number of attention heads. + queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. + memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. + If ``None``, computes self-attention. + mode: A ``tf.estimator.ModeKeys`` mode. + num_units: The number of hidden units. If not set, it is set to the input + dimension. + mask: A ``tf.Tensor`` applied to the dot product. + cache: A dictionary containing pre-projected keys and values. + dropout: The probability to drop units from the inputs. + return_attention: Return the attention head probabilities in addition to the + context. + + Returns: + The concatenated attention context of each head and the attention + probabilities (if :obj:`return_attention` is set). 
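+
+    Note:
+      ``mask_h`` and ``cache_h`` play the same role as ``mask`` and ``cache``
+      but for the second attention branch over :obj:`memory`, and
+      ``X_band_width``, when set, limits how many of the most recent
+      self-attention keys/values are kept in ``cache``.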
+ """ + num_units = num_units or queries.get_shape().as_list()[-1] + + if num_units % num_heads != 0: + raise ValueError('Multi head attention requires that num_units is a' + ' multiple of {}'.format(num_heads)) + + # X + queries, keys, values = fused_projection(queries, num_units, num_outputs=3) + + keys = split_heads(keys, num_heads) + values = split_heads(values, num_heads) + + if cache is not None: + keys = tf.concat([cache['self_keys'], keys], axis=2) + values = tf.concat([cache['self_values'], values], axis=2) + if X_band_width is not None: + keys_band = tf.cond( + tf.less(X_band_width, 0), lambda: keys, lambda: tf.cond( + tf.less(tf.shape(keys)[2], X_band_width), lambda: keys, + lambda: keys[:, :, -X_band_width:, :]) + ) # not support X_band_width == 0 + values_band = tf.cond( + tf.less(X_band_width, 0), lambda: values, lambda: tf.cond( + tf.less(tf.shape(values)[2], X_band_width), lambda: values, + lambda: values[:, :, -X_band_width:, :])) + cache['self_keys'] = keys_band + cache['self_values'] = values_band + else: + cache['self_keys'] = keys + cache['self_values'] = values + + queries = split_heads(queries, num_heads) + queries *= (num_units // num_heads)**-0.5 + + heads, attn = dot_product_attention( + queries, keys, values, mode, mask=mask, dropout=dropout) + + # Concatenate all heads output. + combined = combine_heads(heads) + outputs = tf.layers.conv1d(combined, num_units, 1) + + # H + if cache_h is not None: + + def _project_and_split(): + k, v = fused_projection(memory, num_units, num_outputs=2) + return split_heads(k, num_heads), split_heads(v, num_heads) + + keys_h, values_h = tf.cond( + tf.equal(tf.shape(cache_h['memory_keys'])[2], 0), + true_fn=_project_and_split, + false_fn=lambda: + (cache_h['memory_keys'], cache_h['memory_values'])) + cache_h['memory_keys'] = keys_h + cache_h['memory_values'] = values_h + else: + keys_h, values_h = fused_projection(memory, num_units, num_outputs=2) + keys_h = split_heads(keys_h, num_heads) + values_h = split_heads(values_h, num_heads) + + heads_h, attn_h = dot_product_attention( + queries, keys_h, values_h, mode, mask=mask_h, dropout=dropout) + + # Concatenate all heads output. + combined_h = combine_heads(heads_h) + outputs_h = tf.layers.conv1d(combined_h, num_units, 1) + + # ADD + outputs = outputs + outputs_h + + # RETURN + return outputs, attn, attn_h + + +def multi_head_attention_memory(num_heads, + queries, + memory, + mode, + num_memory=None, + num_units=None, + mask=None, + cache=None, + dropout=0.0, + return_attention=False): + """Computes the multi-head attention as described in + https://arxiv.org/abs/1706.03762. + + Args: + num_heads: The number of attention heads. + queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. + memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. + If ``None``, computes self-attention. + mode: A ``tf.estimator.ModeKeys`` mode. + num_units: The number of hidden units. If not set, it is set to the input + dimension. + mask: A ``tf.Tensor`` applied to the dot product. + cache: A dictionary containing pre-projected keys and values. + dropout: The probability to drop units from the inputs. + return_attention: Return the attention head probabilities in addition to the + context. + + Returns: + The concatenated attention context of each head and the attention + probabilities (if :obj:`return_attention` is set). 
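+
+    Note:
+      ``num_memory``, when set, prepends that many learned persistent memory
+      slots (``key_m``/``value_m``) to the projected keys and values before
+      the attention is computed.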
+ """ + num_units = num_units or queries.get_shape().as_list()[-1] + + if num_units % num_heads != 0: + raise ValueError('Multi head attention requires that num_units is a' + ' multiple of {}'.format(num_heads)) + + # PERSISTENT MEMORY + # key memory + if num_memory is not None: + key_m = tf.get_variable( + 'key_m', + shape=[num_memory, num_units], + initializer=tf.glorot_uniform_initializer(), + dtype=tf.float32) + # value memory + value_m = tf.get_variable( + 'value_m', + shape=[num_memory, num_units], + initializer=tf.glorot_uniform_initializer(), + dtype=tf.float32) + if memory is None: + queries, keys, values = fused_projection( + queries, num_units, num_outputs=3) + + # concat memory + if num_memory is not None: + key_m_expand = tf.tile( + tf.expand_dims(key_m, 0), [tf.shape(keys)[0], 1, 1]) + value_m_expand = tf.tile( + tf.expand_dims(value_m, 0), [tf.shape(values)[0], 1, 1]) + keys = tf.concat([key_m_expand, keys], axis=1) + values = tf.concat([value_m_expand, values], axis=1) + + keys = split_heads(keys, num_heads) + values = split_heads(values, num_heads) + + if cache is not None: + keys = tf.concat([cache['self_keys'], keys], axis=2) + values = tf.concat([cache['self_values'], values], axis=2) + cache['self_keys'] = keys + cache['self_values'] = values + else: + queries = tf.layers.conv1d(queries, num_units, 1) + + if cache is not None: + + def _project_and_split(): + k, v = fused_projection(memory, num_units, num_outputs=2) + return split_heads(k, num_heads), split_heads(v, num_heads) + + keys, values = tf.cond( + tf.equal(tf.shape(cache['memory_keys'])[2], 0), + true_fn=_project_and_split, + false_fn=lambda: + (cache['memory_keys'], cache['memory_values'])) + cache['memory_keys'] = keys + cache['memory_values'] = values + else: + keys, values = fused_projection(memory, num_units, num_outputs=2) + keys = split_heads(keys, num_heads) + values = split_heads(values, num_heads) + + queries = split_heads(queries, num_heads) + queries *= (num_units // num_heads)**-0.5 + + heads, attn = dot_product_attention( + queries, keys, values, mode, mask=mask, dropout=dropout) + + # Concatenate all heads output. + combined = combine_heads(heads) + outputs = tf.layers.conv1d(combined, num_units, 1) + + if not return_attention: + return outputs + return outputs, attn + + +def Ci_Cd_Memory(num_heads, + queries, + mode, + filter_size=None, + num_memory=None, + num_units=None, + fsmn_mask=None, + san_mask=None, + cache=None, + shift=None, + dropout=0.0, + return_attention=False): + """ + Args: + num_heads: The number of attention heads. + queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. + memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. + If ``None``, computes self-attention. + mode: A ``tf.estimator.ModeKeys`` mode. + num_units: The number of hidden units. If not set, it is set to the input + dimension. + mask: A ``tf.Tensor`` applied to the dot product. + cache: A dictionary containing pre-projected keys and values. + dropout: The probability to drop units from the inputs. + return_attention: Return the attention head probabilities in addition to the + context. + + Returns: + The concatenated attention context of each head and the attention + probabilities (if :obj:`return_attention` is set). 
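+
+    Note:
+      This variant augments self-attention with an FSMN memory block:
+      ``filter_size`` is the FSMN filter size, ``num_memory`` adds persistent
+      memory slots to the keys/values, ``fsmn_mask`` and ``san_mask`` are the
+      masks for the FSMN block and the self-attention dot product
+      respectively, and ``shift`` selects the encoder-side ``MemoryBlockV2``
+      (decoders leave it ``None`` and use ``UniMemoryBlock``).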
+ """ + num_units = num_units or queries.get_shape().as_list()[-1] + + if num_units % num_heads != 0: + raise ValueError('Multi head attention requires that num_units is a' + ' multiple of {}'.format(num_heads)) + # PERSISTENT MEMORY + if num_memory is not None: + key_m = tf.get_variable( + 'key_m', + shape=[num_memory, num_units], + initializer=tf.glorot_uniform_initializer(), + dtype=tf.float32) + value_m = tf.get_variable( + 'value_m', + shape=[num_memory, num_units], + initializer=tf.glorot_uniform_initializer(), + dtype=tf.float32) + + queries, keys, values = fused_projection(queries, num_units, num_outputs=3) + # fsmn memory block + if shift is not None: + # encoder + fsmn_memory = fsmn.MemoryBlockV2( + values, + filter_size, + mode, + shift=shift, + mask=fsmn_mask, + dropout=dropout) + else: + # decoder + fsmn_memory = fsmn.UniMemoryBlock( + values, + filter_size, + mode, + cache=cache, + mask=fsmn_mask, + dropout=dropout) + + # concat persistent memory + if num_memory is not None: + key_m_expand = tf.tile( + tf.expand_dims(key_m, 0), [tf.shape(keys)[0], 1, 1]) + value_m_expand = tf.tile( + tf.expand_dims(value_m, 0), [tf.shape(values)[0], 1, 1]) + keys = tf.concat([key_m_expand, keys], axis=1) + values = tf.concat([value_m_expand, values], axis=1) + + keys = split_heads(keys, num_heads) + values = split_heads(values, num_heads) + + if cache is not None: + keys = tf.concat([cache['self_keys'], keys], axis=2) + values = tf.concat([cache['self_values'], values], axis=2) + cache['self_keys'] = keys + cache['self_values'] = values + + queries = split_heads(queries, num_heads) + queries *= (num_units // num_heads)**-0.5 + + heads, attn = dot_product_attention( + queries, keys, values, mode, mask=san_mask, dropout=dropout) + + # Concatenate all heads output. + combined = combine_heads(heads) + outputs = tf.layers.conv1d(combined, num_units, 1) + outputs = outputs + fsmn_memory + + if not return_attention: + return outputs + return outputs, attn + + +def multi_head_attention_wpa(num_heads, + queries, + memory, + mode, + attention_left_window=-1, + attention_right_window=0, + num_units=None, + mask=None, + cache=None, + max_id_cache=None, + dropout=0.0, + mono=False, + peak_delay=-1, + return_attention=False): + """Computes the multi-head attention as described in + https://arxiv.org/abs/1706.03762. + + Args: + num_heads: The number of attention heads. + queries: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. + memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. + If ``None``, computes self-attention. + mode: A ``tf.estimator.ModeKeys`` mode. + num_units: The number of hidden units. If not set, it is set to the input + dimension. + mask: A ``tf.Tensor`` applied to the dot product. + cache: A dictionary containing pre-projected keys and values. + dropout: The probability to drop units from the inputs. + return_attention: Return the attention head probabilities in addition to the + context. + + Returns: + The concatenated attention context of each head and the attention + probabilities (if :obj:`return_attention` is set). 
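+
+    Note:
+      The extra arguments are forwarded to ``dot_product_attention_wpa``:
+      ``attention_left_window``/``attention_right_window`` bound the attended
+      region around the attention peak, ``max_id_cache`` caches the previous
+      peak position, ``mono`` constrains the peak to move monotonically and
+      ``peak_delay`` limits how far it may advance beyond the cached peak.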
+ """ + num_units = num_units or queries.get_shape().as_list()[-1] + + if num_units % num_heads != 0: + raise ValueError('Multi head attention requires that num_units is a' + ' multiple of {}'.format(num_heads)) + + if memory is None: + queries, keys, values = fused_projection( + queries, num_units, num_outputs=3) + + keys = split_heads(keys, num_heads) + values = split_heads(values, num_heads) + + if cache is not None: + keys = tf.concat([cache['self_keys'], keys], axis=2) + values = tf.concat([cache['self_values'], values], axis=2) + cache['self_keys'] = keys + cache['self_values'] = values + else: + queries = tf.layers.conv1d(queries, num_units, 1) + + if cache is not None: + + def _project_and_split(): + k, v = fused_projection(memory, num_units, num_outputs=2) + return split_heads(k, num_heads), split_heads(v, num_heads) + + keys, values = tf.cond( + tf.equal(tf.shape(cache['memory_keys'])[2], 0), + true_fn=_project_and_split, + false_fn=lambda: + (cache['memory_keys'], cache['memory_values'])) + cache['memory_keys'] = keys + cache['memory_values'] = values + else: + keys, values = fused_projection(memory, num_units, num_outputs=2) + keys = split_heads(keys, num_heads) + values = split_heads(values, num_heads) + + queries = split_heads(queries, num_heads) + queries *= (num_units // num_heads)**-0.5 + + heads, attn = dot_product_attention_wpa( + num_heads, + queries, + keys, + values, + mode, + attention_left_window=attention_left_window, + attention_right_window=attention_right_window, + mask=mask, + max_id_cache=max_id_cache, + mono=mono, + peak_delay=peak_delay, + dropout=dropout) + + # Concatenate all heads output. + combined = combine_heads(heads) + outputs = tf.layers.conv1d(combined, num_units, 1) + + if not return_attention: + return outputs + return outputs, attn + + +def feed_forward(x, inner_dim, mode, dropout=0.0, mask=None): + """Implements the Transformer's "Feed Forward" layer. + + .. math:: + + ffn(x) = max(0, x*W_1 + b_1)*W_2 + b_2 + + Args: + x: The input. + inner_dim: The number of units of the inner linear transformation. + mode: A ``tf.estimator.ModeKeys`` mode. + dropout: The probability to drop units from the inner transformation. + + Returns: + The transformed input. + """ + input_dim = x.get_shape().as_list()[-1] + + if mask is not None: + x = x * tf.expand_dims(mask, -1) + + inner = tf.layers.conv1d( + x, inner_dim, 3, padding='same', activation=tf.nn.relu) + + if mask is not None: + inner = inner * tf.expand_dims(mask, -1) + inner = tf.layers.dropout(inner, rate=dropout, training=mode) + outer = tf.layers.conv1d(inner, input_dim, 1) + + return outer + + +def feed_forward_ori(x, inner_dim, mode, dropout=0.0): + """Implements the Transformer's "Feed Forward" layer. + + .. math:: + + ffn(x) = max(0, x*W_1 + b_1)*W_2 + b_2 + + Args: + x: The input. + inner_dim: The number of units of the inner linear transformation. + mode: A ``tf.estimator.ModeKeys`` mode. + dropout: The probability to drop units from the inner transformation. + + Returns: + The transformed input. + """ + input_dim = x.get_shape().as_list()[-1] + + inner = tf.layers.conv1d(x, inner_dim, 1, activation=tf.nn.relu) + inner = tf.layers.dropout(inner, rate=dropout, training=mode) + outer = tf.layers.conv1d(inner, input_dim, 1) + + return outer + + +def norm(inputs): + """Layer normalizes :obj:`inputs`.""" + return tf.contrib.layers.layer_norm(inputs, begin_norm_axis=-1) + + +def drop_and_add(inputs, outputs, mode, dropout=0.1): + """Drops units in the outputs and adds the previous values. 
+ + Args: + inputs: The input of the previous layer. + outputs: The output of the previous layer. + mode: A ``tf.estimator.ModeKeys`` mode. + dropout: The probability to drop units in :obj:`outputs`. + + Returns: + The residual and normalized output. + """ + outputs = tf.layers.dropout(outputs, rate=dropout, training=mode) + + input_dim = inputs.get_shape().as_list()[-1] + output_dim = outputs.get_shape().as_list()[-1] + + if input_dim == output_dim: + outputs += inputs + return outputs + + +class FeedForwardNetwork(tf.keras.layers.Layer): + """Implements the Transformer's "Feed Forward" layer. + + .. math:: + + ffn(x) = max(0, x*W_1 + b_1)*W_2 + b_2 + + Note: + Object-oriented implementation for TensorFlow 2.0. + """ + + def __init__(self, + inner_dim, + output_dim, + dropout=0.1, + activation=tf.nn.relu, + **kwargs): + """Initializes this layer. + + Args: + inner_dim: The number of units of the inner linear transformation. + output_dim: The number of units of the ouput linear transformation. + dropout: The probability to drop units from the activation output. + activation: The activation function to apply between the two linear + transformations. + kwargs: Additional layer arguments. + """ + super(FeedForwardNetwork, self).__init__(**kwargs) + self.inner = tf.keras.layers.Dense( + inner_dim, activation=activation, name='inner') + self.outer = tf.keras.layers.Dense(output_dim, name='outer') + self.dropout = dropout + + def call(self, inputs, training=None): # pylint: disable=arguments-differ + """Runs the layer.""" + inner = self.inner(inputs) + inner = tf.layers.dropout(inner, self.dropout, training=training) + return self.outer(inner) + + +class MultiHeadAttention(tf.keras.layers.Layer): + """Computes the multi-head attention as described in + https://arxiv.org/abs/1706.03762. + + Note: + Object-oriented implementation for TensorFlow 2.0. + """ + + def __init__(self, + num_heads, + num_units, + dropout=0.1, + return_attention=False, + **kwargs): + """Initializes this layers. + + Args: + num_heads: The number of attention heads. + num_units: The number of hidden units. + dropout: The probability to drop units from the inputs. + return_attention: If ``True``, also return the attention weights of the + first head. + kwargs: Additional layer arguments. + """ + super(MultiHeadAttention, self).__init__(**kwargs) + if num_units % num_heads != 0: + raise ValueError( + 'Multi head attention requires that num_units is a' + ' multiple of %s' % num_heads) + self.num_heads = num_heads + self.num_units = num_units + self.linear_queries = tf.keras.layers.Dense( + num_units, name='linear_queries') + self.linear_keys = tf.keras.layers.Dense(num_units, name='linear_keys') + self.linear_values = tf.keras.layers.Dense( + num_units, name='linear_values') + self.linear_output = tf.keras.layers.Dense( + num_units, name='linear_output') + self.dropout = dropout + self.return_attention = return_attention + + def call(self, inputs, memory=None, mask=None, cache=None, training=None): # pylint: disable=arguments-differ + """Runs the layer. + + Args: + inputs: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. + memory: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. + If ``None``, computes self-attention. + mask: A ``tf.Tensor`` applied to the dot product. + cache: A dictionary containing pre-projected keys and values. + training: Run in training mode. 
+ + Returns: + A tuple with the attention context, the updated cache and the attention + probabilities of the first head (if :obj:`return_attention` is ``True``). + """ + + def _compute_kv(x): + keys = self.linear_keys(x) + keys = split_heads(keys, self.num_heads) + values = self.linear_values(x) + values = split_heads(values, self.num_heads) + return keys, values + + # Compute queries. + queries = self.linear_queries(inputs) + queries = split_heads(queries, self.num_heads) + queries *= (self.num_units // self.num_heads)**-0.5 + + # Compute keys and values. + if memory is None: + keys, values = _compute_kv(inputs) + if cache: + keys = tf.concat([cache[0], keys], axis=2) + values = tf.concat([cache[1], values], axis=2) + else: + if cache: + if not self.linear_keys.built: + # Ensure that the variable names are not impacted by the tf.cond name + # scope if the layers have not already been built. + with tf.name_scope(self.linear_keys.name): + self.linear_keys.build(memory.shape) + with tf.name_scope(self.linear_values.name): + self.linear_values.build(memory.shape) + keys, values = tf.cond( + tf.equal(tf.shape(cache[0])[2], 0), + true_fn=lambda: _compute_kv(memory), + false_fn=lambda: cache) + else: + keys, values = _compute_kv(memory) + + cache = (keys, values) + + # Dot product attention. + dot = tf.matmul(queries, keys, transpose_b=True) + if mask is not None: + mask = tf.expand_dims(tf.cast(mask, tf.float32), + 1) # Broadcast on heads dimension. + dot = tf.cast( + tf.cast(dot, tf.float32) * mask + + ((1.0 - mask) * tf.float32.min), dot.dtype) # yapf:disable + attn = tf.cast(tf.nn.softmax(tf.cast(dot, tf.float32)), dot.dtype) + drop_attn = tf.layers.dropout(attn, self.dropout, training=training) + heads = tf.matmul(drop_attn, values) + + # Concatenate all heads output. 
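+        # combine_heads is the inverse of split_heads: heads go from
+        # [B, H, T, D/H] back to [B, T, D] before the output projection mixes
+        # information across heads.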
+ combined = combine_heads(heads) + outputs = self.linear_output(combined) + if self.return_attention: + return outputs, cache, attn + return outputs, cache diff --git a/modelscope/models/audio/tts/am/sambert_hifi_16k.py b/modelscope/models/audio/tts/am/sambert_hifi_16k.py new file mode 100644 index 00000000..2db9abc6 --- /dev/null +++ b/modelscope/models/audio/tts/am/sambert_hifi_16k.py @@ -0,0 +1,255 @@ +import io +import os +from typing import Any, Dict, Optional, Union + +import numpy as np +import tensorflow as tf +from sklearn.preprocessing import MultiLabelBinarizer + +from modelscope.models.base import Model +from modelscope.models.builder import MODELS +from modelscope.utils.constant import ModelFile, Tasks +from .models import create_model +from .text.symbols import load_symbols +from .text.symbols_dict import SymbolsDict + +__all__ = ['SambertNetHifi16k'] + + +def multi_label_symbol_to_sequence(my_classes, my_symbol): + one_hot = MultiLabelBinarizer(my_classes) + tokens = my_symbol.strip().split(' ') + sequences = [] + for token in tokens: + sequences.append(tuple(token.split('&'))) + # sequences.append(tuple(['~'])) # sequence length minus 1 to ignore EOS ~ + return one_hot.fit_transform(sequences) + + +@MODELS.register_module(Tasks.text_to_speech, module_name=r'sambert_hifi_16k') +class SambertNetHifi16k(Model): + + def __init__(self, + model_dir, + pitch_control_str='', + duration_control_str='', + energy_control_str='', + *args, + **kwargs): + tf.reset_default_graph() + local_ckpt_path = os.path.join(ModelFile.TF_CHECKPOINT_FOLDER, 'ckpt') + self._ckpt_path = os.path.join(model_dir, local_ckpt_path) + self._dict_path = os.path.join(model_dir, 'dicts') + self._hparams = tf.contrib.training.HParams(**kwargs) + values = self._hparams.values() + hp = [' {}:{}'.format(name, values[name]) for name in sorted(values)] + print('Hyperparameters:\n' + '\n'.join(hp)) + super().__init__(self._ckpt_path, *args, **kwargs) + model_name = 'robutrans' + self._lfeat_type_list = self._hparams.lfeat_type_list.strip().split( + ',') + sy, tone, syllable_flag, word_segment, emo_category, speaker = load_symbols( + self._dict_path) + self._sy = sy + self._tone = tone + self._syllable_flag = syllable_flag + self._word_segment = word_segment + self._emo_category = emo_category + self._speaker = speaker + self._inputs_dim = dict() + for lfeat_type in self._lfeat_type_list: + if lfeat_type == 'sy': + self._inputs_dim[lfeat_type] = len(sy) + elif lfeat_type == 'tone': + self._inputs_dim[lfeat_type] = len(tone) + elif lfeat_type == 'syllable_flag': + self._inputs_dim[lfeat_type] = len(syllable_flag) + elif lfeat_type == 'word_segment': + self._inputs_dim[lfeat_type] = len(word_segment) + elif lfeat_type == 'emo_category': + self._inputs_dim[lfeat_type] = len(emo_category) + elif lfeat_type == 'speaker': + self._inputs_dim[lfeat_type] = len(speaker) + + self._symbols_dict = SymbolsDict(sy, tone, syllable_flag, word_segment, + emo_category, speaker, + self._inputs_dim, + self._lfeat_type_list) + dim_inputs = sum(self._inputs_dim.values( + )) - self._inputs_dim['speaker'] - self._inputs_dim['emo_category'] + inputs = tf.placeholder(tf.float32, [1, None, dim_inputs], 'inputs') + inputs_emotion = tf.placeholder( + tf.float32, [1, None, self._inputs_dim['emo_category']], + 'inputs_emotion') + inputs_speaker = tf.placeholder(tf.float32, + [1, None, self._inputs_dim['speaker']], + 'inputs_speaker') + + input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths') + pitch_contours_scale = 
tf.placeholder(tf.float32, [1, None], + 'pitch_contours_scale') + energy_contours_scale = tf.placeholder(tf.float32, [1, None], + 'energy_contours_scale') + duration_scale = tf.placeholder(tf.float32, [1, None], + 'duration_scale') + + with tf.variable_scope('model') as _: + self._model = create_model(model_name, self._hparams) + self._model.initialize( + inputs, + inputs_emotion, + inputs_speaker, + input_lengths, + duration_scales=duration_scale, + pitch_scales=pitch_contours_scale, + energy_scales=energy_contours_scale) + self._mel_spec = self._model.mel_outputs[0] + self._duration_outputs = self._model.duration_outputs[0] + self._duration_outputs_ = self._model.duration_outputs_[0] + self._pitch_contour_outputs = self._model.pitch_contour_outputs[0] + self._energy_contour_outputs = self._model.energy_contour_outputs[ + 0] + self._embedded_inputs_emotion = self._model.embedded_inputs_emotion[ + 0] + self._embedding_fsmn_outputs = self._model.embedding_fsmn_outputs[ + 0] + self._encoder_outputs = self._model.encoder_outputs[0] + self._pitch_embeddings = self._model.pitch_embeddings[0] + self._energy_embeddings = self._model.energy_embeddings[0] + self._LR_outputs = self._model.LR_outputs[0] + self._postnet_fsmn_outputs = self._model.postnet_fsmn_outputs[0] + self._attention_h = self._model.attention_h + self._attention_x = self._model.attention_x + + print('Loading checkpoint: %s' % self._ckpt_path) + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + self._session = tf.Session(config=config) + self._session.run(tf.global_variables_initializer()) + + saver = tf.train.Saver() + saver.restore(self._session, self._ckpt_path) + + duration_cfg_lst = [] + if len(duration_control_str) != 0: + for item in duration_control_str.strip().split('|'): + percent, scale = item.lstrip('(').rstrip(')').split(',') + duration_cfg_lst.append((float(percent), float(scale))) + + self._duration_cfg_lst = duration_cfg_lst + + pitch_contours_cfg_lst = [] + if len(pitch_control_str) != 0: + for item in pitch_control_str.strip().split('|'): + percent, scale = item.lstrip('(').rstrip(')').split(',') + pitch_contours_cfg_lst.append( + (float(percent), float(scale))) + + self._pitch_contours_cfg_lst = pitch_contours_cfg_lst + + energy_contours_cfg_lst = [] + if len(energy_control_str) != 0: + for item in energy_control_str.strip().split('|'): + percent, scale = item.lstrip('(').rstrip(')').split(',') + energy_contours_cfg_lst.append( + (float(percent), float(scale))) + + self._energy_contours_cfg_lst = energy_contours_cfg_lst + + def forward(self, text): + cleaner_names = [x.strip() for x in self._hparams.cleaners.split(',')] + + lfeat_symbol = text.strip().split(' ') + lfeat_symbol_separate = [''] * int(len(self._lfeat_type_list)) + for this_lfeat_symbol in lfeat_symbol: + this_lfeat_symbol = this_lfeat_symbol.strip('{').strip('}').split( + '$') + if len(this_lfeat_symbol) != len(self._lfeat_type_list): + raise Exception( + 'Length of this_lfeat_symbol in training data' + + ' is not equal to the length of lfeat_type_list, ' + + str(len(this_lfeat_symbol)) + ' VS. 
' + + str(len(self._lfeat_type_list))) + index = 0 + while index < len(lfeat_symbol_separate): + lfeat_symbol_separate[index] = lfeat_symbol_separate[ + index] + this_lfeat_symbol[index] + ' ' + index = index + 1 + + index = 0 + lfeat_type = self._lfeat_type_list[index] + sequence = self._symbols_dict.symbol_to_sequence( + lfeat_symbol_separate[index].strip(), lfeat_type, cleaner_names) + sequence_array = np.asarray( + sequence[:-1], + dtype=np.int32) # sequence length minus 1 to ignore EOS ~ + inputs = np.eye( + self._inputs_dim[lfeat_type], dtype=np.float32)[sequence_array] + index = index + 1 + while index < len(self._lfeat_type_list) - 2: + lfeat_type = self._lfeat_type_list[index] + sequence = self._symbols_dict.symbol_to_sequence( + lfeat_symbol_separate[index].strip(), lfeat_type, + cleaner_names) + sequence_array = np.asarray( + sequence[:-1], + dtype=np.int32) # sequence length minus 1 to ignore EOS ~ + inputs_temp = np.eye( + self._inputs_dim[lfeat_type], dtype=np.float32)[sequence_array] + inputs = np.concatenate((inputs, inputs_temp), axis=1) + index = index + 1 + seq = inputs + + lfeat_type = 'emo_category' + inputs_emotion = multi_label_symbol_to_sequence( + self._emo_category, lfeat_symbol_separate[index].strip()) + # inputs_emotion = inputs_emotion * 1.5 + index = index + 1 + + lfeat_type = 'speaker' + inputs_speaker = multi_label_symbol_to_sequence( + self._speaker, lfeat_symbol_separate[index].strip()) + + duration_scale = np.ones((len(seq), ), dtype=np.float32) + start_idx = 0 + for (percent, scale) in self._duration_cfg_lst: + duration_scale[start_idx:start_idx + + int(percent * len(seq))] = scale + start_idx += int(percent * len(seq)) + + pitch_contours_scale = np.ones((len(seq), ), dtype=np.float32) + start_idx = 0 + for (percent, scale) in self._pitch_contours_cfg_lst: + pitch_contours_scale[start_idx:start_idx + + int(percent * len(seq))] = scale + start_idx += int(percent * len(seq)) + + energy_contours_scale = np.ones((len(seq), ), dtype=np.float32) + start_idx = 0 + for (percent, scale) in self._energy_contours_cfg_lst: + energy_contours_scale[start_idx:start_idx + + int(percent * len(seq))] = scale + start_idx += int(percent * len(seq)) + + feed_dict = { + self._model.inputs: [np.asarray(seq, dtype=np.float32)], + self._model.inputs_emotion: + [np.asarray(inputs_emotion, dtype=np.float32)], + self._model.inputs_speaker: + [np.asarray(inputs_speaker, dtype=np.float32)], + self._model.input_lengths: + np.asarray([len(seq)], dtype=np.int32), + self._model.duration_scales: [duration_scale], + self._model.pitch_scales: [pitch_contours_scale], + self._model.energy_scales: [energy_contours_scale] + } + + result = self._session.run([ + self._mel_spec, self._duration_outputs, self._duration_outputs_, + self._pitch_contour_outputs, self._embedded_inputs_emotion, + self._embedding_fsmn_outputs, self._encoder_outputs, + self._pitch_embeddings, self._LR_outputs, + self._postnet_fsmn_outputs, self._energy_contour_outputs, + self._energy_embeddings, self._attention_x, self._attention_h + ], feed_dict=feed_dict) # yapf:disable + return result[0] diff --git a/modelscope/models/audio/tts/am/text/__init__.py b/modelscope/models/audio/tts/am/text/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/modelscope/models/audio/tts/am/text/cleaners.py b/modelscope/models/audio/tts/am/text/cleaners.py new file mode 100755 index 00000000..19d838d1 --- /dev/null +++ b/modelscope/models/audio/tts/am/text/cleaners.py @@ -0,0 +1,89 @@ +''' +Cleaners are transformations that 
run over the input text at both training and eval time. + +Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" +hyperparameter. Some cleaners are English-specific. You'll typically want to use: + 1. "english_cleaners" for English text + 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using + the Unidecode library (https://pypi.python.org/pypi/Unidecode) + 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update + the symbols in symbols.py to match your data). +''' + +import re + +from unidecode import unidecode + +from .numbers import normalize_numbers + +# Regular expression matching whitespace: +_whitespace_re = re.compile(r'\s+') + +# List of (regular expression, replacement) pairs for abbreviations: +_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) + for x in [ + ('mrs', 'misess'), + ('mr', 'mister'), + ('dr', 'doctor'), + ('st', 'saint'), + ('co', 'company'), + ('jr', 'junior'), + ('maj', 'major'), + ('gen', 'general'), + ('drs', 'doctors'), + ('rev', 'reverend'), + ('lt', 'lieutenant'), + ('hon', 'honorable'), + ('sgt', 'sergeant'), + ('capt', 'captain'), + ('esq', 'esquire'), + ('ltd', 'limited'), + ('col', 'colonel'), + ('ft', 'fort'), ]] # yapf:disable + + +def expand_abbreviations(text): + for regex, replacement in _abbreviations: + text = re.sub(regex, replacement, text) + return text + + +def expand_numbers(text): + return normalize_numbers(text) + + +def lowercase(text): + return text.lower() + + +def collapse_whitespace(text): + return re.sub(_whitespace_re, ' ', text) + + +def convert_to_ascii(text): + return unidecode(text) + + +def basic_cleaners(text): + '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' + text = lowercase(text) + text = collapse_whitespace(text) + return text + + +def transliteration_cleaners(text): + '''Pipeline for non-English text that transliterates to ASCII.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = collapse_whitespace(text) + return text + + +def english_cleaners(text): + '''Pipeline for English text, including number and abbreviation expansion.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = expand_numbers(text) + text = expand_abbreviations(text) + text = collapse_whitespace(text) + return text diff --git a/modelscope/models/audio/tts/am/text/cmudict.py b/modelscope/models/audio/tts/am/text/cmudict.py new file mode 100755 index 00000000..b4da4be9 --- /dev/null +++ b/modelscope/models/audio/tts/am/text/cmudict.py @@ -0,0 +1,64 @@ +import re + +valid_symbols = [ + 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', + 'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', + 'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', + 'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', + 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', + 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', + 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', + 'Y', 'Z', 'ZH' +] + +_valid_symbol_set = set(valid_symbols) + + +class CMUDict: + '''Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' + + def __init__(self, file_or_path, keep_ambiguous=True): + if isinstance(file_or_path, str): + with open(file_or_path, encoding='latin-1') as f: + entries = _parse_cmudict(f) + else: + entries = _parse_cmudict(file_or_path) + if not keep_ambiguous: + entries = { + word: pron + for word, pron in entries.items() if len(pron) == 1 + } + self._entries = entries + + def __len__(self): + return len(self._entries) + + def lookup(self, word): + '''Returns list of ARPAbet pronunciations of the given word.''' + return self._entries.get(word.upper()) + + +_alt_re = re.compile(r'\([0-9]+\)') + + +def _parse_cmudict(file): + cmudict = {} + for line in file: + if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): + parts = line.split(' ') + word = re.sub(_alt_re, '', parts[0]) + pronunciation = _get_pronunciation(parts[1]) + if pronunciation: + if word in cmudict: + cmudict[word].append(pronunciation) + else: + cmudict[word] = [pronunciation] + return cmudict + + +def _get_pronunciation(s): + parts = s.strip().split(' ') + for part in parts: + if part not in _valid_symbol_set: + return None + return ' '.join(parts) diff --git a/modelscope/models/audio/tts/am/text/numbers.py b/modelscope/models/audio/tts/am/text/numbers.py new file mode 100755 index 00000000..d9453fee --- /dev/null +++ b/modelscope/models/audio/tts/am/text/numbers.py @@ -0,0 +1,70 @@ +import re + +import inflect + +_inflect = inflect.engine() +_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') +_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') +_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') +_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') +_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') +_number_re = re.compile(r'[0-9]+') + + +def _remove_commas(m): + return m.group(1).replace(',', '') + + +def _expand_decimal_point(m): + return m.group(1).replace('.', ' point ') + + +def _expand_dollars(m): + match = m.group(1) + parts = match.split('.') + if len(parts) > 2: + return match + ' dollars' # Unexpected format + dollars = int(parts[0]) if parts[0] else 0 + cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 + if dollars and cents: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) + elif dollars: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + return '%s %s' % (dollars, dollar_unit) + elif cents: + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s' % (cents, cent_unit) + else: + return 'zero dollars' + + +def _expand_ordinal(m): + return _inflect.number_to_words(m.group(0)) + + +def _expand_number(m): + num = int(m.group(0)) + if num > 1000 and num < 3000: + if num == 2000: + return 'two thousand' + elif num > 2000 and num < 2010: + return 'two thousand ' + _inflect.number_to_words(num % 100) + elif num % 100 == 0: + return _inflect.number_to_words(num // 100) + ' hundred' + else: + return _inflect.number_to_words( + num, andword='', zero='oh', group=2).replace(', ', ' ') + else: + return _inflect.number_to_words(num, andword='') + + +def normalize_numbers(text): + text = re.sub(_comma_number_re, _remove_commas, text) + text = re.sub(_pounds_re, r'\1 pounds', text) + text = re.sub(_dollars_re, _expand_dollars, text) + text = re.sub(_decimal_number_re, _expand_decimal_point, text) + text = re.sub(_ordinal_re, _expand_ordinal, text) + text = re.sub(_number_re, _expand_number, text) + return text diff --git 
a/modelscope/models/audio/tts/am/text/symbols.py b/modelscope/models/audio/tts/am/text/symbols.py new file mode 100644 index 00000000..a7715cca --- /dev/null +++ b/modelscope/models/audio/tts/am/text/symbols.py @@ -0,0 +1,95 @@ +''' +Defines the set of symbols used in text input to the model. + +The default is a set of ASCII characters that works well for English or text that has been run +through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. +''' +import codecs +import os + +_pad = '_' +_eos = '~' +_mask = '@[MASK]' + + +def load_symbols(dict_path): + _characters = '' + _ch_symbols = [] + sy_dict_name = 'sy_dict.txt' + sy_dict_path = os.path.join(dict_path, sy_dict_name) + f = codecs.open(sy_dict_path, 'r') + for line in f: + line = line.strip('\r\n') + _ch_symbols.append(line) + + _arpabet = ['@' + s for s in _ch_symbols] + + # Export all symbols: + sy = list(_characters) + _arpabet + [_pad, _eos, _mask] + + _characters = '' + + _ch_tones = [] + tone_dict_name = 'tone_dict.txt' + tone_dict_path = os.path.join(dict_path, tone_dict_name) + f = codecs.open(tone_dict_path, 'r') + for line in f: + line = line.strip('\r\n') + _ch_tones.append(line) + + # Export all tones: + tone = list(_characters) + _ch_tones + [_pad, _eos, _mask] + + _characters = '' + + _ch_syllable_flags = [] + syllable_flag_name = 'syllable_flag_dict.txt' + syllable_flag_path = os.path.join(dict_path, syllable_flag_name) + f = codecs.open(syllable_flag_path, 'r') + for line in f: + line = line.strip('\r\n') + _ch_syllable_flags.append(line) + + # Export all syllable_flags: + syllable_flag = list(_characters) + _ch_syllable_flags + [ + _pad, _eos, _mask + ] + + _characters = '' + + _ch_word_segments = [] + word_segment_name = 'word_segment_dict.txt' + word_segment_path = os.path.join(dict_path, word_segment_name) + f = codecs.open(word_segment_path, 'r') + for line in f: + line = line.strip('\r\n') + _ch_word_segments.append(line) + + # Export all syllable_flags: + word_segment = list(_characters) + _ch_word_segments + [_pad, _eos, _mask] + + _characters = '' + + _ch_emo_types = [] + emo_category_name = 'emo_category_dict.txt' + emo_category_path = os.path.join(dict_path, emo_category_name) + f = codecs.open(emo_category_path, 'r') + for line in f: + line = line.strip('\r\n') + _ch_emo_types.append(line) + + emo_category = list(_characters) + _ch_emo_types + [_pad, _eos, _mask] + + _characters = '' + + _ch_speakers = [] + speaker_name = 'speaker_dict.txt' + speaker_path = os.path.join(dict_path, speaker_name) + f = codecs.open(speaker_path, 'r') + for line in f: + line = line.strip('\r\n') + _ch_speakers.append(line) + + # Export all syllable_flags: + speaker = list(_characters) + _ch_speakers + [_pad, _eos, _mask] + return sy, tone, syllable_flag, word_segment, emo_category, speaker diff --git a/modelscope/models/audio/tts/am/text/symbols_dict.py b/modelscope/models/audio/tts/am/text/symbols_dict.py new file mode 100644 index 00000000..e8f7ed19 --- /dev/null +++ b/modelscope/models/audio/tts/am/text/symbols_dict.py @@ -0,0 +1,200 @@ +import re +import sys + +from .cleaners import (basic_cleaners, english_cleaners, + transliteration_cleaners) + + +class SymbolsDict: + + def __init__(self, sy, tone, syllable_flag, word_segment, emo_category, + speaker, inputs_dim, lfeat_type_list): + self._inputs_dim = inputs_dim + self._lfeat_type_list = lfeat_type_list + self._sy_to_id = {s: i for i, s in enumerate(sy)} + self._id_to_sy = {i: s for i, s in enumerate(sy)} + self._tone_to_id = {s: 
i for i, s in enumerate(tone)} + self._id_to_tone = {i: s for i, s in enumerate(tone)} + self._syllable_flag_to_id = {s: i for i, s in enumerate(syllable_flag)} + self._id_to_syllable_flag = {i: s for i, s in enumerate(syllable_flag)} + self._word_segment_to_id = {s: i for i, s in enumerate(word_segment)} + self._id_to_word_segment = {i: s for i, s in enumerate(word_segment)} + self._emo_category_to_id = {s: i for i, s in enumerate(emo_category)} + self._id_to_emo_category = {i: s for i, s in enumerate(emo_category)} + self._speaker_to_id = {s: i for i, s in enumerate(speaker)} + self._id_to_speaker = {i: s for i, s in enumerate(speaker)} + print('_sy_to_id: ') + print(self._sy_to_id) + print('_tone_to_id: ') + print(self._tone_to_id) + print('_syllable_flag_to_id: ') + print(self._syllable_flag_to_id) + print('_word_segment_to_id: ') + print(self._word_segment_to_id) + print('_emo_category_to_id: ') + print(self._emo_category_to_id) + print('_speaker_to_id: ') + print(self._speaker_to_id) + self._curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') + self._cleaners = { + basic_cleaners.__name__: basic_cleaners, + transliteration_cleaners.__name__: transliteration_cleaners, + english_cleaners.__name__: english_cleaners + } + + def _clean_text(self, text, cleaner_names): + for name in cleaner_names: + cleaner = self._cleaners.get(name) + if not cleaner: + raise Exception('Unknown cleaner: %s' % name) + text = cleaner(text) + return text + + def _sy_to_sequence(self, sy): + return [self._sy_to_id[s] for s in sy if self._should_keep_sy(s)] + + def _arpabet_to_sequence(self, text): + return self._sy_to_sequence(['@' + s for s in text.split()]) + + def _should_keep_sy(self, s): + return s in self._sy_to_id and s != '_' and s != '~' + + def symbol_to_sequence(self, this_lfeat_symbol, lfeat_type, cleaner_names): + sequence = [] + if lfeat_type == 'sy': + this_lfeat_symbol = this_lfeat_symbol.strip().split(' ') + this_lfeat_symbol_format = '' + index = 0 + while index < len(this_lfeat_symbol): + this_lfeat_symbol_format = this_lfeat_symbol_format + '{' + this_lfeat_symbol[ + index] + '}' + ' ' + index = index + 1 + sequence = self.text_to_sequence(this_lfeat_symbol_format, + cleaner_names) + elif lfeat_type == 'tone': + sequence = self.tone_to_sequence(this_lfeat_symbol) + elif lfeat_type == 'syllable_flag': + sequence = self.syllable_flag_to_sequence(this_lfeat_symbol) + elif lfeat_type == 'word_segment': + sequence = self.word_segment_to_sequence(this_lfeat_symbol) + elif lfeat_type == 'emo_category': + sequence = self.emo_category_to_sequence(this_lfeat_symbol) + elif lfeat_type == 'speaker': + sequence = self.speaker_to_sequence(this_lfeat_symbol) + else: + raise Exception('Unknown lfeat type: %s' % lfeat_type) + + return sequence + + def text_to_sequence(self, text, cleaner_names): + '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. + + The text can optionally have ARPAbet sequences enclosed in curly braces embedded + in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 
+ + Args: + text: string to convert to a sequence + cleaner_names: names of the cleaner functions to run the text through + + Returns: + List of integers corresponding to the symbols in the text + ''' + sequence = [] + + # Check for curly braces and treat their contents as ARPAbet: + while len(text): + m = self._curly_re.match(text) + if not m: + sequence += self._sy_to_sequence( + self._clean_text(text, cleaner_names)) + break + sequence += self._sy_to_sequence( + self._clean_text(m.group(1), cleaner_names)) + sequence += self._arpabet_to_sequence(m.group(2)) + text = m.group(3) + + # Append EOS token + sequence.append(self._sy_to_id['~']) + return sequence + + def tone_to_sequence(self, tone): + tones = tone.strip().split(' ') + sequence = [] + for this_tone in tones: + sequence.append(self._tone_to_id[this_tone]) + sequence.append(self._tone_to_id['~']) + return sequence + + def syllable_flag_to_sequence(self, syllable_flag): + syllable_flags = syllable_flag.strip().split(' ') + sequence = [] + for this_syllable_flag in syllable_flags: + sequence.append(self._syllable_flag_to_id[this_syllable_flag]) + sequence.append(self._syllable_flag_to_id['~']) + return sequence + + def word_segment_to_sequence(self, word_segment): + word_segments = word_segment.strip().split(' ') + sequence = [] + for this_word_segment in word_segments: + sequence.append(self._word_segment_to_id[this_word_segment]) + sequence.append(self._word_segment_to_id['~']) + return sequence + + def emo_category_to_sequence(self, emo_type): + emo_categories = emo_type.strip().split(' ') + sequence = [] + for this_category in emo_categories: + sequence.append(self._emo_category_to_id[this_category]) + sequence.append(self._emo_category_to_id['~']) + return sequence + + def speaker_to_sequence(self, speaker): + speakers = speaker.strip().split(' ') + sequence = [] + for this_speaker in speakers: + sequence.append(self._speaker_to_id[this_speaker]) + sequence.append(self._speaker_to_id['~']) + return sequence + + def sequence_to_symbol(self, sequence): + result = '' + pre_lfeat_dim = 0 + for lfeat_type in self._lfeat_type_list: + current_one_hot_sequence = sequence[:, pre_lfeat_dim:pre_lfeat_dim + + self._inputs_dim[lfeat_type]] + current_sequence = current_one_hot_sequence.argmax(1) + length = current_sequence.shape[0] + + index = 0 + while index < length: + this_sequence = current_sequence[index] + s = '' + if lfeat_type == 'sy': + s = self._id_to_sy[this_sequence] + if len(s) > 1 and s[0] == '@': + s = s[1:] + elif lfeat_type == 'tone': + s = self._id_to_tone[this_sequence] + elif lfeat_type == 'syllable_flag': + s = self._id_to_syllable_flag[this_sequence] + elif lfeat_type == 'word_segment': + s = self._id_to_word_segment[this_sequence] + elif lfeat_type == 'emo_category': + s = self._id_to_emo_category[this_sequence] + elif lfeat_type == 'speaker': + s = self._id_to_speaker[this_sequence] + else: + raise Exception('Unknown lfeat type: %s' % lfeat_type) + + if index == 0: + result = result + lfeat_type + ': ' + + result = result + '{' + s + '}' + + if index == length - 1: + result = result + '; ' + + index = index + 1 + pre_lfeat_dim = pre_lfeat_dim + self._inputs_dim[lfeat_type] + return result diff --git a/modelscope/models/audio/tts/frontend/__init__.py b/modelscope/models/audio/tts/frontend/__init__.py new file mode 100644 index 00000000..d7b1015d --- /dev/null +++ b/modelscope/models/audio/tts/frontend/__init__.py @@ -0,0 +1 @@ +from .generic_text_to_speech_frontend import * # noqa F403 diff --git 
a/modelscope/models/audio/tts/frontend/generic_text_to_speech_frontend.py b/modelscope/models/audio/tts/frontend/generic_text_to_speech_frontend.py new file mode 100644 index 00000000..c6aabf75 --- /dev/null +++ b/modelscope/models/audio/tts/frontend/generic_text_to_speech_frontend.py @@ -0,0 +1,39 @@ +import os +import zipfile +from typing import Any, Dict, List + +from modelscope.models.base import Model +from modelscope.models.builder import MODELS +from modelscope.utils.audio.tts_exceptions import ( + TtsFrontendInitializeFailedException, + TtsFrontendLanguageTypeInvalidException) +from modelscope.utils.constant import Tasks + +__all__ = ['GenericTtsFrontend'] + + +@MODELS.register_module( + Tasks.text_to_speech, module_name=r'generic_tts_frontend') +class GenericTtsFrontend(Model): + + def __init__(self, model_dir='.', lang_type='pinyin', *args, **kwargs): + super().__init__(model_dir, *args, **kwargs) + import ttsfrd + + frontend = ttsfrd.TtsFrontendEngine() + zip_file = os.path.join(model_dir, 'resource.zip') + self._res_path = os.path.join(model_dir, 'resource') + with zipfile.ZipFile(zip_file, 'r') as zip_ref: + zip_ref.extractall(model_dir) + if not frontend.initialize(self._res_path): + raise TtsFrontendInitializeFailedException( + 'resource invalid: {}'.format(self._res_path)) + if not frontend.set_lang_type(lang_type): + raise TtsFrontendLanguageTypeInvalidException( + 'language type invalid: {}, valid is pinyin and chenmix'. + format(lang_type)) + self._frontend = frontend + + def forward(self, data: str) -> Dict[str, List]: + result = self._frontend.gen_tacotron_symbols(data) + return {'texts': [s for s in result.splitlines() if s != '']} diff --git a/modelscope/models/audio/tts/vocoder/__init__.py b/modelscope/models/audio/tts/vocoder/__init__.py new file mode 100644 index 00000000..94f257f8 --- /dev/null +++ b/modelscope/models/audio/tts/vocoder/__init__.py @@ -0,0 +1 @@ +from .hifigan16k import * # noqa F403 diff --git a/modelscope/models/audio/tts/vocoder/hifigan16k.py b/modelscope/models/audio/tts/vocoder/hifigan16k.py new file mode 100644 index 00000000..0d917dbe --- /dev/null +++ b/modelscope/models/audio/tts/vocoder/hifigan16k.py @@ -0,0 +1,73 @@ +from __future__ import (absolute_import, division, print_function, + unicode_literals) +import argparse +import glob +import os +import time + +import json +import numpy as np +import torch +from scipy.io.wavfile import write + +from modelscope.models.base import Model +from modelscope.models.builder import MODELS +from modelscope.utils.audio.tts_exceptions import \ + TtsVocoderMelspecShapeMismatchException +from modelscope.utils.constant import ModelFile, Tasks +from .models import Generator + +__all__ = ['Hifigan16k', 'AttrDict'] +MAX_WAV_VALUE = 32768.0 + + +def load_checkpoint(filepath, device): + assert os.path.isfile(filepath) + print("Loading '{}'".format(filepath)) + checkpoint_dict = torch.load(filepath, map_location=device) + print('Complete.') + return checkpoint_dict + + +class AttrDict(dict): + + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +@MODELS.register_module(Tasks.text_to_speech, module_name=r'hifigan16k') +class Hifigan16k(Model): + + def __init__(self, model_dir, *args, **kwargs): + self._ckpt_path = os.path.join(model_dir, + ModelFile.TORCH_MODEL_BIN_FILE) + self._config = AttrDict(**kwargs) + + super().__init__(self._ckpt_path, *args, **kwargs) + if torch.cuda.is_available(): + torch.manual_seed(self._config.seed) + self._device = 
torch.device('cuda') + else: + self._device = torch.device('cpu') + self._generator = Generator(self._config).to(self._device) + state_dict_g = load_checkpoint(self._ckpt_path, self._device) + self._generator.load_state_dict(state_dict_g['generator']) + self._generator.eval() + self._generator.remove_weight_norm() + + def forward(self, melspec): + dim0 = list(melspec.shape)[-1] + if dim0 != 80: + raise TtsVocoderMelspecShapeMismatchException( + 'input melspec mismatch 0 dim require 80 but {}'.format(dim0)) + with torch.no_grad(): + x = melspec.T + x = torch.FloatTensor(x).to(self._device) + if len(x.shape) == 2: + x = x.unsqueeze(0) + y_g_hat = self._generator(x) + audio = y_g_hat.squeeze() + audio = audio * MAX_WAV_VALUE + audio = audio.cpu().numpy().astype('int16') + return audio diff --git a/modelscope/models/audio/tts/vocoder/models/__init__.py b/modelscope/models/audio/tts/vocoder/models/__init__.py new file mode 100644 index 00000000..b00eec9b --- /dev/null +++ b/modelscope/models/audio/tts/vocoder/models/__init__.py @@ -0,0 +1 @@ +from .models import Generator diff --git a/modelscope/models/audio/tts/vocoder/models/models.py b/modelscope/models/audio/tts/vocoder/models/models.py new file mode 100755 index 00000000..c46a9204 --- /dev/null +++ b/modelscope/models/audio/tts/vocoder/models/models.py @@ -0,0 +1,516 @@ +from distutils.version import LooseVersion + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d +from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm + +from .utils import get_padding, init_weights + +is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion('1.7') + + +def stft(x, fft_size, hop_size, win_length, window): + """Perform STFT and convert to magnitude spectrogram. + + Args: + x (Tensor): Input signal tensor (B, T). + fft_size (int): FFT size. + hop_size (int): Hop size. + win_length (int): Window length. + window (str): Window function type. + + Returns: + Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). + + """ + if is_pytorch_17plus: + x_stft = torch.stft( + x, fft_size, hop_size, win_length, window, return_complex=False) + else: + x_stft = torch.stft(x, fft_size, hop_size, win_length, window) + real = x_stft[..., 0] + imag = x_stft[..., 1] + + # NOTE(kan-bayashi): clamp is needed to avoid nan or inf + return torch.sqrt(torch.clamp(real**2 + imag**2, min=1e-7)).transpose(2, 1) + + +LRELU_SLOPE = 0.1 + + +def get_padding_casual(kernel_size, dilation=1): + return int(kernel_size * dilation - dilation) + + +class Conv1dCasual(torch.nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + padding_mode='zeros'): + super(Conv1dCasual, self).__init__() + self.pad = padding + self.conv1d = weight_norm( + Conv1d( + in_channels, + out_channels, + kernel_size, + stride, + padding=0, + dilation=dilation, + groups=groups, + bias=bias, + padding_mode=padding_mode)) + self.conv1d.apply(init_weights) + + def forward(self, x): # bdt + # described starting from the last dimension and moving forward. 
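+        # Causal (left-only) padding: self.pad = kernel_size * dilation
+        # - dilation zeros are prepended along the time axis (the last dim of
+        # the [B, D, T] input), so every output frame depends only on the
+        # current and earlier input frames.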
+ x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), 'constant') + x = self.conv1d(x) + return x + + def remove_weight_norm(self): + remove_weight_norm(self.conv1d) + + +class ConvTranspose1dCausal(torch.nn.Module): + """CausalConvTranspose1d module with customized initialization.""" + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding=0): + """Initialize CausalConvTranspose1d module.""" + super(ConvTranspose1dCausal, self).__init__() + self.deconv = weight_norm( + ConvTranspose1d(in_channels, out_channels, kernel_size, stride)) + self.stride = stride + self.deconv.apply(init_weights) + self.pad = kernel_size - stride + + def forward(self, x): + """Calculate forward propagation. + Args: + x (Tensor): Input tensor (B, in_channels, T_in). + Returns: + Tensor: Output tensor (B, out_channels, T_out). + """ + # x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), "constant") + return self.deconv(x)[:, :, :-self.pad] + + def remove_weight_norm(self): + remove_weight_norm(self.deconv) + + +class ResBlock1(torch.nn.Module): + + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.h = h + self.convs1 = nn.ModuleList([ + Conv1dCasual( + channels, + channels, + kernel_size, + 1, + dilation=dilation[i], + padding=get_padding_casual(kernel_size, dilation[i])) + for i in range(len(dilation)) + ]) + + self.convs2 = nn.ModuleList([ + Conv1dCasual( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding_casual(kernel_size, 1)) + for i in range(len(dilation)) + ]) + + def forward(self, x): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + xt = c2(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for layer in self.convs1: + layer.remove_weight_norm() + for layer in self.convs2: + layer.remove_weight_norm() + + +class Generator(torch.nn.Module): + + def __init__(self, h): + super(Generator, self).__init__() + self.h = h + self.num_kernels = len(h.resblock_kernel_sizes) + self.num_upsamples = len(h.upsample_rates) + print('num_kernels={}, num_upsamples={}'.format( + self.num_kernels, self.num_upsamples)) + self.conv_pre = Conv1dCasual( + 80, h.upsample_initial_channel, 7, 1, padding=7 - 1) + resblock = ResBlock1 if h.resblock == '1' else ResBlock2 + + self.ups = nn.ModuleList() + self.repeat_ups = nn.ModuleList() + for i, (u, k) in enumerate( + zip(h.upsample_rates, h.upsample_kernel_sizes)): + upsample = nn.Sequential( + nn.Upsample(mode='nearest', scale_factor=u), + nn.LeakyReLU(LRELU_SLOPE), + Conv1dCasual( + h.upsample_initial_channel // (2**i), + h.upsample_initial_channel // (2**(i + 1)), + kernel_size=7, + stride=1, + padding=7 - 1)) + self.repeat_ups.append(upsample) + self.ups.append( + ConvTranspose1dCausal( + h.upsample_initial_channel // (2**i), + h.upsample_initial_channel // (2**(i + 1)), + k, + u, + padding=(k - u) // 2)) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = h.upsample_initial_channel // (2**(i + 1)) + for j, (k, d) in enumerate( + zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): + self.resblocks.append(resblock(h, ch, k, d)) + + self.conv_post = Conv1dCasual(ch, 1, 7, 1, padding=7 - 1) + + def forward(self, x): + x = self.conv_pre(x) + for i in range(self.num_upsamples): + x = torch.sin(x) + x + # transconv + x1 = F.leaky_relu(x, LRELU_SLOPE) + x1 = self.ups[i](x1) + # repeat + x2 = self.repeat_ups[i](x) + x = x1 + x2 + xs = None + for j in 
range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + return x + + def remove_weight_norm(self): + print('Removing weight norm...') + for layer in self.ups: + layer.remove_weight_norm() + for layer in self.repeat_ups: + layer[-1].remove_weight_norm() + for layer in self.resblocks: + layer.remove_weight_norm() + self.conv_pre.remove_weight_norm() + self.conv_post.remove_weight_norm() + + +class DiscriminatorP(torch.nn.Module): + + def __init__(self, + period, + kernel_size=5, + stride=3, + use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + norm_f = weight_norm if use_spectral_norm is False else spectral_norm + self.convs = nn.ModuleList([ + norm_f( + Conv2d( + 1, + 32, (kernel_size, 1), (stride, 1), + padding=(get_padding(5, 1), 0))), + norm_f( + Conv2d( + 32, + 128, (kernel_size, 1), (stride, 1), + padding=(get_padding(5, 1), 0))), + norm_f( + Conv2d( + 128, + 512, (kernel_size, 1), (stride, 1), + padding=(get_padding(5, 1), 0))), + norm_f( + Conv2d( + 512, + 1024, (kernel_size, 1), (stride, 1), + padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), + ]) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), 'reflect') + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for layer in self.convs: + x = layer(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + + def __init__(self): + super(MultiPeriodDiscriminator, self).__init__() + self.discriminators = nn.ModuleList([ + DiscriminatorP(2), + DiscriminatorP(3), + DiscriminatorP(5), + DiscriminatorP(7), + DiscriminatorP(11), + ]) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm is False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv1d(1, 128, 15, 1, padding=7)), + norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), + norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)), + norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ]) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + for layer in self.convs: + x = layer(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiScaleDiscriminator(torch.nn.Module): + + def __init__(self): + super(MultiScaleDiscriminator, self).__init__() + self.discriminators = nn.ModuleList([ + 
DiscriminatorS(use_spectral_norm=True), + DiscriminatorS(), + DiscriminatorS(), + ]) + from pytorch_wavelets import DWT1DForward + self.meanpools = nn.ModuleList( + [DWT1DForward(wave='db3', J=1), + DWT1DForward(wave='db3', J=1)]) + self.convs = nn.ModuleList([ + weight_norm(Conv1d(2, 1, 15, 1, padding=7)), + weight_norm(Conv1d(2, 1, 15, 1, padding=7)) + ]) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + if i != 0: + yl, yh = self.meanpools[i - 1](y) + y = torch.cat([yl, yh[0]], dim=1) + y = self.convs[i - 1](y) + y = F.leaky_relu(y, LRELU_SLOPE) + + yl_hat, yh_hat = self.meanpools[i - 1](y_hat) + y_hat = torch.cat([yl_hat, yh_hat[0]], dim=1) + y_hat = self.convs[i - 1](y_hat) + y_hat = F.leaky_relu(y_hat, LRELU_SLOPE) + + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorSTFT(torch.nn.Module): + + def __init__(self, + kernel_size=11, + stride=2, + use_spectral_norm=False, + fft_size=1024, + shift_size=120, + win_length=600, + window='hann_window'): + super(DiscriminatorSTFT, self).__init__() + self.fft_size = fft_size + self.shift_size = shift_size + self.win_length = win_length + norm_f = weight_norm if use_spectral_norm is False else spectral_norm + self.convs = nn.ModuleList([ + norm_f( + Conv2d( + fft_size // 2 + 1, + 32, (15, 1), (1, 1), + padding=(get_padding(15, 1), 0))), + norm_f( + Conv2d( + 32, + 32, (kernel_size, 1), (stride, 1), + padding=(get_padding(9, 1), 0))), + norm_f( + Conv2d( + 32, + 32, (kernel_size, 1), (stride, 1), + padding=(get_padding(9, 1), 0))), + norm_f( + Conv2d( + 32, + 32, (kernel_size, 1), (stride, 1), + padding=(get_padding(9, 1), 0))), + norm_f(Conv2d(32, 32, (5, 1), (1, 1), padding=(2, 0))), + ]) + self.conv_post = norm_f(Conv2d(32, 1, (3, 1), (1, 1), padding=(1, 0))) + self.register_buffer('window', getattr(torch, window)(win_length)) + + def forward(self, wav): + wav = torch.squeeze(wav, 1) + x_mag = stft(wav, self.fft_size, self.shift_size, self.win_length, + self.window) + x = torch.transpose(x_mag, 2, 1).unsqueeze(-1) + fmap = [] + for layer in self.convs: + x = layer(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = x.squeeze(-1) + + return x, fmap + + +class MultiSTFTDiscriminator(torch.nn.Module): + + def __init__( + self, + fft_sizes=[1024, 2048, 512], + hop_sizes=[120, 240, 50], + win_lengths=[600, 1200, 240], + window='hann_window', + ): + super(MultiSTFTDiscriminator, self).__init__() + self.discriminators = nn.ModuleList() + for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): + self.discriminators += [ + DiscriminatorSTFT(fft_size=fs, shift_size=ss, win_length=wl) + ] + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +def feature_loss(fmap_r, fmap_g): + loss = 0 + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + loss += torch.mean(torch.abs(rl - gl)) + + return loss * 2 + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs): + loss = 0 + r_losses = [] + g_losses = [] + for dr, dg in zip(disc_real_outputs, 
disc_generated_outputs):
+        r_loss = torch.mean((1 - dr)**2)
+        g_loss = torch.mean(dg**2)
+        loss += (r_loss + g_loss)
+        r_losses.append(r_loss.item())
+        g_losses.append(g_loss.item())
+
+    return loss, r_losses, g_losses
+
+
+def generator_loss(disc_outputs):
+    loss = 0
+    gen_losses = []
+    for dg in disc_outputs:
+        temp_loss = torch.mean((1 - dg)**2)
+        gen_losses.append(temp_loss)
+        loss += temp_loss
+
+    return loss, gen_losses
diff --git a/modelscope/models/audio/tts/vocoder/models/utils.py b/modelscope/models/audio/tts/vocoder/models/utils.py
new file mode 100755
index 00000000..03e1ef8c
--- /dev/null
+++ b/modelscope/models/audio/tts/vocoder/models/utils.py
@@ -0,0 +1,59 @@
+import glob
+import os
+
+import matplotlib
+import matplotlib.pylab as plt
+import torch
+from torch.nn.utils import weight_norm
+
+matplotlib.use('Agg')
+
+
+def plot_spectrogram(spectrogram):
+    fig, ax = plt.subplots(figsize=(10, 2))
+    im = ax.imshow(
+        spectrogram, aspect='auto', origin='lower', interpolation='none')
+    plt.colorbar(im, ax=ax)
+
+    fig.canvas.draw()
+    plt.close()
+
+    return fig
+
+
+def init_weights(m, mean=0.0, std=0.01):
+    classname = m.__class__.__name__
+    if classname.find('Conv') != -1:
+        m.weight.data.normal_(mean, std)
+
+
+def apply_weight_norm(m):
+    classname = m.__class__.__name__
+    if classname.find('Conv') != -1:
+        weight_norm(m)
+
+
+def get_padding(kernel_size, dilation=1):
+    return int((kernel_size * dilation - dilation) / 2)
+
+
+def load_checkpoint(filepath, device):
+    assert os.path.isfile(filepath)
+    print("Loading '{}'".format(filepath))
+    checkpoint_dict = torch.load(filepath, map_location=device)
+    print('Complete.')
+    return checkpoint_dict
+
+
+def save_checkpoint(filepath, obj):
+    print('Saving checkpoint to {}'.format(filepath))
+    torch.save(obj, filepath)
+    print('Complete.')
+
+
+def scan_checkpoint(cp_dir, prefix):
+    pattern = os.path.join(cp_dir, prefix + '????????')
+    cp_list = glob.glob(pattern)
+    if len(cp_list) == 0:
+        return None
+    return sorted(cp_list)[-1]
diff --git a/modelscope/models/base.py b/modelscope/models/base.py
index 88b1e3b0..ab0d22cc 100644
--- a/modelscope/models/base.py
+++ b/modelscope/models/base.py
@@ -62,4 +62,6 @@ class Model(ABC):
         if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'):
             model_cfg.type = model_cfg.model_type
         model_cfg.model_dir = local_model_dir
+        for k, v in kwargs.items():
+            setattr(model_cfg, k, v)
         return build_model(model_cfg, task_name)
diff --git a/modelscope/models/multi_model/__init__.py b/modelscope/models/multi_model/__init__.py
new file mode 100644
index 00000000..02e8d6ab
--- /dev/null
+++ b/modelscope/models/multi_model/__init__.py
@@ -0,0 +1 @@
+from .image_captioning_model import OfaForImageCaptioning
diff --git a/modelscope/models/multi_model/image_captioning_model.py b/modelscope/models/multi_model/image_captioning_model.py
new file mode 100644
index 00000000..fad0663e
--- /dev/null
+++ b/modelscope/models/multi_model/image_captioning_model.py
@@ -0,0 +1,80 @@
+import os.path as osp
+from typing import Any, Dict
+
+from PIL import Image
+
+from modelscope.utils.constant import ModelFile, Tasks
+from ..base import Model
+from ..builder import MODELS
+
+__all__ = ['OfaForImageCaptioning']
+
+
+@MODELS.register_module(
+    Tasks.image_captioning, module_name=r'ofa-image-captioning')
+class OfaForImageCaptioning(Model):
+
+    def __init__(self, model_dir, *args, **kwargs):
+        super().__init__(model_dir=model_dir, *args, **kwargs)
+        ckpt_name = ModelFile.TORCH_MODEL_FILE
+        local_model = osp.join(model_dir,
ckpt_name) + bpe_dir = model_dir + # turn on cuda if GPU is available + from fairseq import checkpoint_utils, tasks, utils + from ofa.tasks.mm_tasks import CaptionTask + from ofa.utils.eval_utils import eval_caption + self.eval_caption = eval_caption + + tasks.register_task('caption', CaptionTask) + use_cuda = kwargs['use_cuda'] if 'use_cuda' in kwargs else False + use_fp16 = kwargs[ + 'use_fp16'] if 'use_fp16' in kwargs and use_cuda else False + overrides = { + 'bpe_dir': bpe_dir, + 'eval_cider': False, + 'beam': 5, + 'max_len_b': 16, + 'no_repeat_ngram_size': 3, + 'seed': 7 + } + models, cfg, task = checkpoint_utils.load_model_ensemble_and_task( + utils.split_paths(local_model), arg_overrides=overrides) + + # Move models to GPU + for model in models: + model.eval() + if use_cuda: + model.cuda() + if use_fp16: + model.half() + model.prepare_for_inference_(cfg) + self.models = models + # Initialize generator + self.generator = task.build_generator(models, cfg.generation) + + # Initialize transform + from torchvision import transforms + mean = [0.5, 0.5, 0.5] + std = [0.5, 0.5, 0.5] + + self.patch_resize_transform = transforms.Compose([ + lambda image: image.convert('RGB'), + transforms.Resize( + (cfg.task.patch_image_size, cfg.task.patch_image_size), + interpolation=Image.BICUBIC), + transforms.ToTensor(), + transforms.Normalize(mean=mean, std=std), + ]) + self.task = task + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + results, _ = self.eval_caption(self.task, self.generator, self.models, + input) + return { + 'image_id': results[0]['image_id'], + 'caption': results[0]['caption'] + } + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + # What should we do here ? + return inputs diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index 5801533b..801832ad 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -1,4 +1,5 @@ from .masked_language_model import * # noqa F403 -from .sentence_similarity_model import * # noqa F403 -from .sequence_classification_model import * # noqa F403 -from .text_generation_model import * # noqa F403 +from .bert_for_sequence_classification import * # noqa F403 +from .palm_for_text_generation import * # noqa F403 +from .sbert_for_sentence_similarity import * # noqa F403 +from .sbert_for_token_classification import * # noqa F403 diff --git a/modelscope/models/nlp/sequence_classification_model.py b/modelscope/models/nlp/bert_for_sequence_classification.py similarity index 100% rename from modelscope/models/nlp/sequence_classification_model.py rename to modelscope/models/nlp/bert_for_sequence_classification.py diff --git a/modelscope/models/nlp/palm_for_text_generation.py b/modelscope/models/nlp/palm_for_text_generation.py new file mode 100644 index 00000000..e5799feb --- /dev/null +++ b/modelscope/models/nlp/palm_for_text_generation.py @@ -0,0 +1,43 @@ +from typing import Dict + +from modelscope.utils.constant import Tasks +from ..base import Model, Tensor +from ..builder import MODELS + +__all__ = ['PalmForTextGeneration'] + + +@MODELS.register_module(Tasks.text_generation, module_name=r'palm2.0') +class PalmForTextGeneration(Model): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the text generation model from the `model_dir` path. + + Args: + model_dir (str): the model path. + model_cls (Optional[Any], optional): model loader, if None, use the + default loader to load model weights, by default None. 
+ """ + super().__init__(model_dir, *args, **kwargs) + self.model_dir = model_dir + + from sofa.models.palm_v2 import PalmForConditionalGeneration, Translator + model = PalmForConditionalGeneration.from_pretrained(model_dir) + self.tokenizer = model.tokenizer + self.generator = Translator(model) + + def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: + """return the result by the model + + Args: + input (Dict[str, Tensor]): the preprocessed data + + Returns: + Dict[str, Tensor]: results + Example: + { + 'predictions': Tensor([[1377, 4959, 2785, 6392...])]), # tokens need to be decode by tokenizer + } + """ + + return self.generator(**input) diff --git a/modelscope/models/nlp/sentence_similarity_model.py b/modelscope/models/nlp/sbert_for_sentence_similarity.py similarity index 100% rename from modelscope/models/nlp/sentence_similarity_model.py rename to modelscope/models/nlp/sbert_for_sentence_similarity.py diff --git a/modelscope/models/nlp/sbert_for_token_classification.py b/modelscope/models/nlp/sbert_for_token_classification.py new file mode 100644 index 00000000..b918dc37 --- /dev/null +++ b/modelscope/models/nlp/sbert_for_token_classification.py @@ -0,0 +1,56 @@ +from typing import Any, Dict, Union + +import numpy as np +import torch +from sofa import SbertConfig, SbertForTokenClassification + +from modelscope.utils.constant import Tasks +from ..base import Model, Tensor +from ..builder import MODELS + +__all__ = ['StructBertForTokenClassification'] + + +@MODELS.register_module( + Tasks.word_segmentation, + module_name=r'structbert-chinese-word-segmentation') +class StructBertForTokenClassification(Model): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the word segmentation model from the `model_dir` path. + + Args: + model_dir (str): the model path. + model_cls (Optional[Any], optional): model loader, if None, use the + default loader to load model weights, by default None. + """ + super().__init__(model_dir, *args, **kwargs) + self.model_dir = model_dir + self.model = SbertForTokenClassification.from_pretrained( + self.model_dir) + self.config = SbertConfig.from_pretrained(self.model_dir) + + def forward(self, input: Dict[str, + Any]) -> Dict[str, Union[str, np.ndarray]]: + """return the result by the model + + Args: + input (Dict[str, Any]): the preprocessed data + + Returns: + Dict[str, Union[str,np.ndarray]]: results + Example: + { + 'predictions': array([1,4]), # lable 0-negative 1-positive + 'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value + 'text': str(今天), + } + """ + input_ids = torch.tensor(input['input_ids']).unsqueeze(0) + output = self.model(input_ids) + logits = output.logits + pred = torch.argmax(logits[0], dim=-1) + pred = pred.numpy() + + rst = {'predictions': pred, 'logits': logits, 'text': input['text']} + return rst diff --git a/modelscope/models/nlp/text_generation_model.py b/modelscope/models/nlp/text_generation_model.py deleted file mode 100644 index ebefc8d1..00000000 --- a/modelscope/models/nlp/text_generation_model.py +++ /dev/null @@ -1,52 +0,0 @@ -from typing import Any, Dict - -from modelscope.utils.constant import Tasks -from ..base import Model, Tensor -from ..builder import MODELS - -__all__ = ['PalmForTextGenerationModel'] - - -@MODELS.register_module(Tasks.text_generation, module_name=r'palm') -class PalmForTextGenerationModel(Model): - - def __init__(self, model_dir: str, *args, **kwargs): - """initialize the text generation model from the `model_dir` path. 
- - Args: - model_dir (str): the model path. - model_cls (Optional[Any], optional): model loader, if None, use the - default loader to load model weights, by default None. - """ - from sofa import PalmTokenizer - - super().__init__(model_dir, *args, **kwargs) - self.model_dir = model_dir - - from sofa.models.palm import PalmForConditionalGeneration, TextGenerator - tokenizer = kwargs.pop('tokenizer', - PalmTokenizer.from_pretrained(model_dir)) - model = PalmForConditionalGeneration.from_pretrained(model_dir) - self.generator = TextGenerator(model, tokenizer) - - def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: - """return the result by the model - - Args: - input (Dict[str, Any]): the preprocessed data - - Returns: - Dict[str, np.ndarray]: results - Example: - { - 'predictions': array([1]), # lable 0-negative 1-positive - 'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32), - 'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value - } - """ - - encoder_inputs = [ - input['input_ids'], input['token_type_ids'], - input['attention_mask'] - ] - return self.generator(encoder_inputs) diff --git a/modelscope/pipelines/__init__.py b/modelscope/pipelines/__init__.py index d47ce8cf..14865872 100644 --- a/modelscope/pipelines/__init__.py +++ b/modelscope/pipelines/__init__.py @@ -1,4 +1,4 @@ -from .audio import * # noqa F403 +from .audio import LinearAECPipeline from .base import Pipeline from .builder import pipeline from .cv import * # noqa F403 diff --git a/modelscope/pipelines/audio/__init__.py b/modelscope/pipelines/audio/__init__.py index e69de29b..20c7710a 100644 --- a/modelscope/pipelines/audio/__init__.py +++ b/modelscope/pipelines/audio/__init__.py @@ -0,0 +1,2 @@ +from .linear_aec_pipeline import LinearAECPipeline +from .text_to_speech_pipeline import * # noqa F403 diff --git a/modelscope/pipelines/audio/linear_aec_pipeline.py b/modelscope/pipelines/audio/linear_aec_pipeline.py new file mode 100644 index 00000000..528d8d47 --- /dev/null +++ b/modelscope/pipelines/audio/linear_aec_pipeline.py @@ -0,0 +1,160 @@ +import importlib +import os +from typing import Any, Dict + +import numpy as np +import scipy.io.wavfile as wav +import torch +import yaml + +from modelscope.preprocessors.audio import LinearAECAndFbank +from modelscope.utils.constant import ModelFile, Tasks +from ..base import Pipeline +from ..builder import PIPELINES + +FEATURE_MVN = 'feature.DEY.mvn.txt' + +CONFIG_YAML = 'dey_mini.yaml' + + +def initialize_config(module_cfg): + r"""According to config items, load specific module dynamically with params. + 1. Load the module corresponding to the "module" param. + 2. Call function (or instantiate class) corresponding to the "main" param. + 3. Send the param (in "args") into the function (or class) when calling ( or instantiating). + + Args: + module_cfg (dict): config items, eg: + { + "module": "models.model", + "main": "Model", + "args": {...} + } + + Returns: + the module loaded. + """ + module = importlib.import_module(module_cfg['module']) + return getattr(module, module_cfg['main'])(**module_cfg['args']) + + +@PIPELINES.register_module( + Tasks.speech_signal_process, module_name=r'speech_dfsmn_aec_psm_16k') +class LinearAECPipeline(Pipeline): + r"""AEC Inference Pipeline only support 16000 sample rate. 
+ + When invoke the class with pipeline.__call__(), you should provide two params: + Dict[str, Any] + the path of wav files,eg:{ + "nearend_mic": "/your/data/near_end_mic_audio.wav", + "farend_speech": "/your/data/far_end_speech_audio.wav"} + output_path (str, optional): "/your/output/audio_after_aec.wav" + the file path to write generate audio. + """ + + def __init__(self, model): + r""" + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model) + self.use_cuda = torch.cuda.is_available() + with open( + os.path.join(self.model, CONFIG_YAML), encoding='utf-8') as f: + self.config = yaml.full_load(f.read()) + self.config['io']['mvn'] = os.path.join(self.model, FEATURE_MVN) + self._init_model() + self.preprocessor = LinearAECAndFbank(self.config['io']) + + n_fft = self.config['loss']['args']['n_fft'] + hop_length = self.config['loss']['args']['hop_length'] + winlen = n_fft + window = torch.hamming_window(winlen, periodic=False) + + def stft(x): + return torch.stft( + x, + n_fft, + hop_length, + winlen, + center=False, + window=window.to(x.device), + return_complex=False) + + def istft(x, slen): + return torch.istft( + x, + n_fft, + hop_length, + winlen, + window=window.to(x.device), + center=False, + length=slen) + + self.stft = stft + self.istft = istft + + def _init_model(self): + checkpoint = torch.load( + os.path.join(self.model, ModelFile.TORCH_MODEL_BIN_FILE), + map_location='cpu') + self.model = initialize_config(self.config['nnet']) + if self.use_cuda: + self.model = self.model.cuda() + self.model.load_state_dict(checkpoint) + + def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + r"""The AEC process. + + Args: + inputs: dict={'feature': Tensor, 'base': Tensor} + 'feature' feature of input audio. + 'base' the base audio to mask. + + Returns: + dict: + { + 'output_pcm': generated audio array + } + """ + output_data = self._process(inputs['feature'], inputs['base']) + return {'output_pcm': output_data} + + def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: + r"""The post process. Will save audio to file, if the output_path is given. 
+ + Args: + inputs: dict: + { + 'output_pcm': generated audio array + } + kwargs: accept 'output_path' which is the path to write generated audio + + Returns: + dict: + { + 'output_pcm': generated audio array + } + """ + if 'output_path' in kwargs.keys(): + wav.write(kwargs['output_path'], self.preprocessor.SAMPLE_RATE, + inputs['output_pcm'].astype(np.int16)) + inputs['output_pcm'] = inputs['output_pcm'] / 32768.0 + return inputs + + def _process(self, fbanks, mixture): + if self.use_cuda: + fbanks = fbanks.cuda() + mixture = mixture.cuda() + if self.model.vad: + with torch.no_grad(): + masks, vad = self.model(fbanks.unsqueeze(0)) + masks = masks.permute([2, 1, 0]) + else: + with torch.no_grad(): + masks = self.model(fbanks.unsqueeze(0)) + masks = masks.permute([2, 1, 0]) + spectrum = self.stft(mixture) + masked_spec = spectrum * masks + masked_sig = self.istft(masked_spec, len(mixture)).cpu().numpy() + return masked_sig diff --git a/modelscope/pipelines/audio/text_to_speech_pipeline.py b/modelscope/pipelines/audio/text_to_speech_pipeline.py new file mode 100644 index 00000000..ecd9daac --- /dev/null +++ b/modelscope/pipelines/audio/text_to_speech_pipeline.py @@ -0,0 +1,46 @@ +import time +from typing import Any, Dict, List + +import numpy as np + +from modelscope.models import Model +from modelscope.models.audio.tts.am import SambertNetHifi16k +from modelscope.models.audio.tts.vocoder import Hifigan16k +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import TextToTacotronSymbols, build_preprocessor +from modelscope.utils.constant import Fields, Tasks + +__all__ = ['TextToSpeechSambertHifigan16kPipeline'] + + +@PIPELINES.register_module( + Tasks.text_to_speech, module_name=r'tts-sambert-hifigan-16k') +class TextToSpeechSambertHifigan16kPipeline(Pipeline): + + def __init__(self, + config_file: str = None, + model: List[Model] = None, + preprocessor: TextToTacotronSymbols = None, + **kwargs): + super().__init__( + config_file=config_file, + model=model, + preprocessor=preprocessor, + **kwargs) + assert len(model) == 2, 'model number should be 2' + self._am = model[0] + self._vocoder = model[1] + self._preprocessor = preprocessor + + def forward(self, inputs: Dict[str, Any]) -> Dict[str, np.ndarray]: + texts = inputs['texts'] + audio_total = np.empty((0), dtype='int16') + for line in texts: + line = line.strip().split('\t') + audio = self._vocoder.forward(self._am.forward(line[1])) + audio_total = np.append(audio_total, audio, axis=0) + return {'output': audio_total} + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index dc7b6aa6..30203750 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -13,18 +13,23 @@ PIPELINES = Registry('pipelines') DEFAULT_MODEL_FOR_PIPELINE = { # TaskName: (pipeline_module_name, model_repo) + Tasks.word_segmentation: + ('structbert-chinese-word-segmentation', + 'damo/nlp_structbert_word-segmentation_chinese-base'), Tasks.sentence_similarity: ('sbert-base-chinese-sentence-similarity', 'damo/nlp_structbert_sentence-similarity_chinese-base'), - Tasks.image_matting: ('image-matting', 'damo/cv_unet_image-matting_damo'), + Tasks.image_matting: ('image-matting', 'damo/cv_unet_image-matting'), Tasks.text_classification: ('bert-sentiment-analysis', 'damo/bert-base-sst2'), - Tasks.text_generation: ('palm', 'damo/nlp_palm_text-generation_chinese'), 
- Tasks.image_captioning: ('ofa', None), + Tasks.text_generation: ('palm2.0', + 'damo/nlp_palm2.0_text-generation_chinese-base'), + Tasks.image_captioning: ('ofa', 'damo/ofa_image-caption_coco_large_en'), Tasks.image_generation: ('person-image-cartoon', 'damo/cv_unet_person-image-cartoon_compound-models'), - Tasks.fill_mask: ('sbert', 'damo/nlp_structbert_fill-mask_chinese-large'), + Tasks.ocr_detection: ('ocr-detection', + 'damo/cv_resnet18_ocr-detection-line-level_damo'), Tasks.fill_mask: ('veco', 'damo/nlp_veco_fill-mask_large') } diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index 79c85c19..767c90d7 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -1,2 +1,3 @@ from .image_cartoon_pipeline import ImageCartoonPipeline from .image_matting_pipeline import ImageMattingPipeline +from .ocr_detection_pipeline import OCRDetectionPipeline diff --git a/modelscope/pipelines/cv/ocr_detection_pipeline.py b/modelscope/pipelines/cv/ocr_detection_pipeline.py new file mode 100644 index 00000000..9728e441 --- /dev/null +++ b/modelscope/pipelines/cv/ocr_detection_pipeline.py @@ -0,0 +1,167 @@ +import math +import os +import os.path as osp +import sys +from typing import Any, Dict, List, Tuple, Union + +import cv2 +import numpy as np +import PIL +import tensorflow as tf +import tf_slim as slim + +from modelscope.pipelines.base import Input +from modelscope.preprocessors import load_image +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger +from ..base import Pipeline +from ..builder import PIPELINES +from .ocr_utils import model_resnet_mutex_v4_linewithchar, ops, utils + +if tf.__version__ >= '2.0': + tf = tf.compat.v1 +tf.compat.v1.disable_eager_execution() + +logger = get_logger() + +# constant +RBOX_DIM = 5 +OFFSET_DIM = 6 +WORD_POLYGON_DIM = 8 +OFFSET_VARIANCE = [0.1, 0.1, 0.1, 0.1, 0.1, 0.1] + +FLAGS = tf.app.flags.FLAGS +tf.app.flags.DEFINE_float('node_threshold', 0.4, + 'Confidence threshold for nodes') +tf.app.flags.DEFINE_float('link_threshold', 0.6, + 'Confidence threshold for links') + + +@PIPELINES.register_module( + Tasks.ocr_detection, module_name=Tasks.ocr_detection) +class OCRDetectionPipeline(Pipeline): + + def __init__(self, model: str): + super().__init__(model=model) + model_path = osp.join( + osp.join(self.model, ModelFile.TF_CHECKPOINT_FOLDER), + 'checkpoint-80000') + + config = tf.ConfigProto(allow_soft_placement=True) + config.gpu_options.allow_growth = True + self._session = tf.Session(config=config) + global_step = tf.get_variable( + 'global_step', [], + initializer=tf.constant_initializer(0), + dtype=tf.int64, + trainable=False) + variable_averages = tf.train.ExponentialMovingAverage( + 0.997, global_step) + self.input_images = tf.placeholder( + tf.float32, shape=[1, 1024, 1024, 3], name='input_images') + self.output = {} + + # detector + detector = model_resnet_mutex_v4_linewithchar.SegLinkDetector() + all_maps = detector.build_model(self.input_images, is_training=False) + + # decode local predictions + all_nodes, all_links, all_reg = [], [], [] + for i, maps in enumerate(all_maps): + cls_maps, lnk_maps, reg_maps = maps[0], maps[1], maps[2] + reg_maps = tf.multiply(reg_maps, OFFSET_VARIANCE) + + cls_prob = tf.nn.softmax(tf.reshape(cls_maps, [-1, 2])) + + lnk_prob_pos = tf.nn.softmax(tf.reshape(lnk_maps, [-1, 4])[:, :2]) + lnk_prob_mut = tf.nn.softmax(tf.reshape(lnk_maps, [-1, 4])[:, 2:]) + lnk_prob = tf.concat([lnk_prob_pos, lnk_prob_mut], 
axis=1)
+            all_nodes.append(cls_prob)
+            all_links.append(lnk_prob)
+            all_reg.append(reg_maps)
+
+        # decode segments and links
+        image_size = tf.shape(self.input_images)[1:3]
+        segments, group_indices, segment_counts, _ = ops.decode_segments_links_python(
+            image_size,
+            all_nodes,
+            all_links,
+            all_reg,
+            anchor_sizes=list(detector.anchor_sizes))
+
+        # combine segments
+        combined_rboxes, combined_counts = ops.combine_segments_python(
+            segments, group_indices, segment_counts)
+        self.output['combined_rboxes'] = combined_rboxes
+        self.output['combined_counts'] = combined_counts
+
+        with self._session.as_default() as sess:
+            logger.info(f'loading model from {model_path}')
+            # load model
+            model_loader = tf.train.Saver(
+                variable_averages.variables_to_restore())
+            model_loader.restore(sess, model_path)
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        if isinstance(input, str):
+            img = np.array(load_image(input))
+        elif isinstance(input, PIL.Image.Image):
+            img = np.array(input.convert('RGB'))
+        elif isinstance(input, np.ndarray):
+            if len(input.shape) == 2:
+                input = cv2.cvtColor(input, cv2.COLOR_GRAY2BGR)
+            img = input[:, :, ::-1]  # in rgb order
+        else:
+            raise TypeError(f'input should be either str, PIL.Image,'
+                            f' np.array, but got {type(input)}')
+        h, w, c = img.shape
+        img_pad = np.zeros((max(h, w), max(h, w), 3), dtype=np.float32)
+        img_pad[:h, :w, :] = img
+
+        resize_size = 1024
+        img_pad_resize = cv2.resize(img_pad, (resize_size, resize_size))
+        img_pad_resize = cv2.cvtColor(img_pad_resize, cv2.COLOR_RGB2BGR)
+        img_pad_resize = img_pad_resize - np.array([123.68, 116.78, 103.94],
+                                                   dtype=np.float32)
+
+        resize_size = tf.stack([resize_size, resize_size])
+        orig_size = tf.stack([max(h, w), max(h, w)])
+        self.output['orig_size'] = orig_size
+        self.output['resize_size'] = resize_size
+
+        result = {'img': np.expand_dims(img_pad_resize, axis=0)}
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        with self._session.as_default():
+            feed_dict = {self.input_images: input['img']}
+            sess_outputs = self._session.run(self.output, feed_dict=feed_dict)
+            return sess_outputs
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        rboxes = inputs['combined_rboxes'][0]
+        count = inputs['combined_counts'][0]
+        rboxes = rboxes[:count, :]
+
+        # convert rboxes to polygons and find their coordinates on the original image
+        orig_h, orig_w = inputs['orig_size']
+        resize_h, resize_w = inputs['resize_size']
+        polygons = utils.rboxes_to_polygons(rboxes)
+        scale_y = float(orig_h) / float(resize_h)
+        scale_x = float(orig_w) / float(resize_w)
+
+        # confine polygons inside image
+        polygons[:, ::2] = np.maximum(
+            0, np.minimum(polygons[:, ::2] * scale_x, orig_w - 1))
+        polygons[:, 1::2] = np.maximum(
+            0, np.minimum(polygons[:, 1::2] * scale_y, orig_h - 1))
+        polygons = np.round(polygons).astype(np.int32)
+
+        # nms
+        dt_n9 = [o + [utils.cal_width(o)] for o in polygons.tolist()]
+        dt_nms = utils.nms_python(dt_n9)
+        dt_polygons = np.array([o[:8] for o in dt_nms])
+
+        result = {'det_polygons': dt_polygons}
+        return result
diff --git a/modelscope/pipelines/cv/ocr_utils/__init__.py b/modelscope/pipelines/cv/ocr_utils/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py b/modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py
new file mode 100644
index 00000000..50b8ba02
--- /dev/null
+++ b/modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py
@@ 
-0,0 +1,158 @@ +import tensorflow as tf +import tf_slim as slim + +from . import ops, resnet18_v1, resnet_utils + +if tf.__version__ >= '2.0': + tf = tf.compat.v1 + +# constants +OFFSET_DIM = 6 + +N_LOCAL_LINKS = 8 +N_CROSS_LINKS = 4 +N_SEG_CLASSES = 2 +N_LNK_CLASSES = 4 + +POS_LABEL = 1 +NEG_LABEL = 0 + + +class SegLinkDetector(): + + def __init__(self): + self.anchor_sizes = [6., 11.84210526, 23.68421053, 45., 90., 150.] + + def _detection_classifier(self, + maps, + ksize, + weight_decay, + cross_links=False, + scope=None): + + with tf.variable_scope(scope): + seg_depth = N_SEG_CLASSES + if cross_links: + lnk_depth = N_LNK_CLASSES * (N_LOCAL_LINKS + N_CROSS_LINKS) + else: + lnk_depth = N_LNK_CLASSES * N_LOCAL_LINKS + reg_depth = OFFSET_DIM + map_depth = maps.get_shape()[3] + inter_maps, inter_relu = ops.conv2d( + maps, map_depth, 256, 1, 1, 'SAME', scope='conv_inter') + + dir_maps, dir_relu = ops.conv2d( + inter_relu, 256, 2, ksize, 1, 'SAME', scope='conv_dir') + cen_maps, cen_relu = ops.conv2d( + inter_relu, 256, 2, ksize, 1, 'SAME', scope='conv_cen') + pol_maps, pol_relu = ops.conv2d( + inter_relu, 256, 8, ksize, 1, 'SAME', scope='conv_pol') + concat_relu = tf.concat([dir_relu, cen_relu, pol_relu], axis=-1) + _, lnk_embedding = ops.conv_relu( + concat_relu, 12, 256, 1, 1, scope='lnk_embedding') + lnk_maps, lnk_relu = ops.conv2d( + inter_relu + lnk_embedding, + 256, + lnk_depth, + ksize, + 1, + 'SAME', + scope='conv_lnk') + + char_seg_maps, char_seg_relu = ops.conv2d( + inter_relu, + 256, + seg_depth, + ksize, + 1, + 'SAME', + scope='conv_char_cls') + char_reg_maps, char_reg_relu = ops.conv2d( + inter_relu, + 256, + reg_depth, + ksize, + 1, + 'SAME', + scope='conv_char_reg') + concat_char_relu = tf.concat([char_seg_relu, char_reg_relu], + axis=-1) + _, char_embedding = ops.conv_relu( + concat_char_relu, 8, 256, 1, 1, scope='conv_char_embedding') + seg_maps, seg_relu = ops.conv2d( + inter_relu + char_embedding, + 256, + seg_depth, + ksize, + 1, + 'SAME', + scope='conv_cls') + reg_maps, reg_relu = ops.conv2d( + inter_relu + char_embedding, + 256, + reg_depth, + ksize, + 1, + 'SAME', + scope='conv_reg') + + return seg_relu, lnk_relu, reg_relu + + def _build_cnn(self, images, weight_decay, is_training): + with slim.arg_scope( + resnet18_v1.resnet_arg_scope(weight_decay=weight_decay)): + logits, end_points = resnet18_v1.resnet_v1_18( + images, is_training=is_training, scope='resnet_v1_18') + + outputs = { + 'conv3_3': end_points['pool1'], + 'conv4_3': end_points['pool2'], + 'fc7': end_points['pool3'], + 'conv8_2': end_points['pool4'], + 'conv9_2': end_points['pool5'], + 'conv10_2': end_points['pool6'], + } + return outputs + + def build_model(self, images, is_training=True, scope=None): + + weight_decay = 5e-4 # FLAGS.weight_decay + cnn_outputs = self._build_cnn(images, weight_decay, is_training) + det_0 = self._detection_classifier( + cnn_outputs['conv3_3'], + 3, + weight_decay, + cross_links=False, + scope='dete_0') + det_1 = self._detection_classifier( + cnn_outputs['conv4_3'], + 3, + weight_decay, + cross_links=True, + scope='dete_1') + det_2 = self._detection_classifier( + cnn_outputs['fc7'], + 3, + weight_decay, + cross_links=True, + scope='dete_2') + det_3 = self._detection_classifier( + cnn_outputs['conv8_2'], + 3, + weight_decay, + cross_links=True, + scope='dete_3') + det_4 = self._detection_classifier( + cnn_outputs['conv9_2'], + 3, + weight_decay, + cross_links=True, + scope='dete_4') + det_5 = self._detection_classifier( + cnn_outputs['conv10_2'], + 3, + weight_decay, + 
cross_links=True, + scope='dete_5') + outputs = [det_0, det_1, det_2, det_3, det_4, det_5] + return outputs diff --git a/modelscope/pipelines/cv/ocr_utils/ops.py b/modelscope/pipelines/cv/ocr_utils/ops.py new file mode 100644 index 00000000..2bc8a8bf --- /dev/null +++ b/modelscope/pipelines/cv/ocr_utils/ops.py @@ -0,0 +1,1098 @@ +import math +import os +import shutil +import uuid + +import cv2 +import numpy as np +import tensorflow as tf + +from . import utils + +if tf.__version__ >= '2.0': + tf = tf.compat.v1 + +FLAGS = tf.app.flags.FLAGS +tf.app.flags.DEFINE_string('weight_init_method', 'xavier', + 'Weight initialization method') + +# constants +OFFSET_DIM = 6 +RBOX_DIM = 5 + +N_LOCAL_LINKS = 8 +N_CROSS_LINKS = 4 +N_SEG_CLASSES = 2 +N_LNK_CLASSES = 4 + +MATCH_STATUS_POS = 1 +MATCH_STATUS_NEG = -1 +MATCH_STATUS_IGNORE = 0 +MUT_LABEL = 3 +POS_LABEL = 1 +NEG_LABEL = 0 + +N_DET_LAYERS = 6 + + +def load_oplib(lib_name): + """ + Load TensorFlow operator library. + """ + # use absolute path so that ops.py can be called from other directory + lib_path = os.path.join( + os.path.dirname(os.path.realpath(__file__)), + 'lib{0}.so'.format(lib_name)) + # duplicate library with a random new name so that + # a running program will not be interrupted when the original library is updated + lib_copy_path = '/tmp/lib{0}_{1}.so'.format( + str(uuid.uuid4())[:8], LIB_NAME) + shutil.copyfile(lib_path, lib_copy_path) + oplib = tf.load_op_library(lib_copy_path) + return oplib + + +def _nn_variable(name, shape, init_method, collection=None, **kwargs): + """ + Create or reuse a variable + ARGS + name: variable name + shape: variable shape + init_method: 'zero', 'kaiming', 'xavier', or (mean, std) + collection: if not none, add variable to this collection + kwargs: extra paramters passed to tf.get_variable + RETURN + var: a new or existing variable + """ + if init_method == 'zero': + initializer = tf.constant_initializer(0.0) + elif init_method == 'kaiming': + if len(shape) == 4: # convolutional filters + kh, kw, n_in = shape[:3] + init_std = math.sqrt(2.0 / (kh * kw * n_in)) + elif len(shape) == 2: # linear weights + n_in, n_out = shape + init_std = math.sqrt(1.0 / n_out) + else: + raise 'Unsupported shape' + initializer = tf.truncated_normal_initializer(0.0, init_std) + elif init_method == 'xavier': + if len(shape) == 4: + initializer = tf.keras.initializers.glorot_normal() + else: + initializer = tf.keras.initializers.glorot_normal() + elif isinstance(init_method, tuple): + assert (len(init_method) == 2) + initializer = tf.truncated_normal_initializer(init_method[0], + init_method[1]) + else: + raise 'Unsupported weight initialization method: ' + init_method + + var = tf.get_variable(name, shape=shape, initializer=initializer, **kwargs) + if collection is not None: + tf.add_to_collection(collection, var) + + return var + + +def conv2d(x, + n_in, + n_out, + ksize, + stride=1, + padding='SAME', + weight_init=None, + bias=True, + relu=False, + scope=None, + **kwargs): + weight_init = weight_init or FLAGS.weight_init_method + trainable = kwargs.get('trainable', True) + # input_dim = n_in + if (padding == 'SAME'): + in_height = x.get_shape()[1] + in_width = x.get_shape()[2] + if (in_height % stride == 0): + pad_along_height = max(ksize - stride, 0) + else: + pad_along_height = max(ksize - (in_height % stride), 0) + if (in_width % stride == 0): + pad_along_width = max(ksize - stride, 0) + else: + pad_along_width = max(ksize - (in_width % stride), 0) + pad_bottom = pad_along_height // 2 + pad_top = pad_along_height - 
pad_bottom + pad_right = pad_along_width // 2 + pad_left = pad_along_width - pad_right + paddings = tf.constant([[0, 0], [pad_top, pad_bottom], + [pad_left, pad_right], [0, 0]]) + input_padded = tf.pad(x, paddings, 'CONSTANT') + else: + input_padded = x + + with tf.variable_scope(scope or 'conv2d'): + # convolution + kernel = _nn_variable( + 'weight', [ksize, ksize, n_in, n_out], + weight_init, + collection='weights' if trainable else None, + **kwargs) + yc = tf.nn.conv2d( + input_padded, kernel, [1, stride, stride, 1], padding='VALID') + # add bias + if bias is True: + bias = _nn_variable( + 'bias', [n_out], + 'zero', + collection='biases' if trainable else None, + **kwargs) + yb = tf.nn.bias_add(yc, bias) + # apply ReLU + y = yb + if relu is True: + y = tf.nn.relu(yb) + return yb, y + + +def group_conv2d_relu(x, + n_in, + n_out, + ksize, + stride=1, + group=4, + padding='SAME', + weight_init=None, + bias=True, + relu=False, + name='group_conv2d', + **kwargs): + group_axis = len(x.get_shape()) - 1 + splits = tf.split(x, [int(n_in / group)] * group, group_axis) + + conv_list = [] + for i in range(group): + conv_split, relu_split = conv2d( + splits[i], + n_in / group, + n_out / group, + ksize=ksize, + stride=stride, + padding=padding, + weight_init=weight_init, + bias=bias, + relu=relu, + scope='%s_%d' % (name, i)) + conv_list.append(conv_split) + conv = tf.concat(values=conv_list, axis=group_axis, name=name + '_concat') + relu = tf.nn.relu(conv) + return conv, relu + + +def group_conv2d_bn_relu(x, + n_in, + n_out, + ksize, + stride=1, + group=4, + padding='SAME', + weight_init=None, + bias=True, + relu=False, + name='group_conv2d', + **kwargs): + group_axis = len(x.get_shape()) - 1 + splits = tf.split(x, [int(n_in / group)] * group, group_axis) + + conv_list = [] + for i in range(group): + conv_split, relu_split = conv2d( + splits[i], + n_in / group, + n_out / group, + ksize=ksize, + stride=stride, + padding=padding, + weight_init=weight_init, + bias=bias, + relu=relu, + scope='%s_%d' % (name, i)) + conv_list.append(conv_split) + conv = tf.concat(values=conv_list, axis=group_axis, name=name + '_concat') + with tf.variable_scope(name + '_bn'): + bn = tf.layers.batch_normalization( + conv, momentum=0.9, epsilon=1e-5, scale=True, training=True) + relu = tf.nn.relu(bn) + return conv, relu + + +def next_conv(x, + n_in, + n_out, + ksize, + stride=1, + group=4, + padding='SAME', + weight_init=None, + bias=True, + relu=False, + name='next_conv2d', + **kwargs): + conv_a, relu_a = conv_relu( + x, + n_in, + n_in / 2, + ksize=1, + stride=1, + padding=padding, + weight_init=weight_init, + bias=bias, + relu=relu, + scope=name + '_a', + **kwargs) + + conv_b, relu_b = group_conv2d_relu( + relu_a, + n_in / 2, + n_out / 2, + ksize=ksize, + stride=stride, + group=group, + padding=padding, + weight_init=weight_init, + bias=bias, + relu=relu, + name=name + '_b', + **kwargs) + + conv_c, relu_c = conv_relu( + relu_b, + n_out / 2, + n_out, + ksize=1, + stride=1, + padding=padding, + weight_init=weight_init, + bias=bias, + relu=relu, + scope=name + '_c', + **kwargs) + + return conv_c, relu_c + + +def next_conv_bn(x, + n_in, + n_out, + ksize, + stride=1, + group=4, + padding='SAME', + weight_init=None, + bias=True, + relu=False, + name='next_conv2d', + **kwargs): + conv_a, relu_a = conv_bn_relu( + x, + n_in, + n_in / 2, + ksize=1, + stride=1, + padding=padding, + weight_init=weight_init, + bias=bias, + relu=relu, + scope=name + '_a', + **kwargs) + + conv_b, relu_b = group_conv2d_bn_relu( + relu_a, + n_in / 2, + 
n_out / 2, + ksize=ksize, + stride=stride, + group=group, + padding=padding, + weight_init=weight_init, + bias=bias, + relu=relu, + name=name + '_b', + **kwargs) + + conv_c, relu_c = conv_bn_relu( + relu_b, + n_out / 2, + n_out, + ksize=1, + stride=1, + padding=padding, + weight_init=weight_init, + bias=bias, + relu=relu, + scope=name + '_c', + **kwargs) + + return conv_c, relu_c + + +def conv2d_ori(x, + n_in, + n_out, + ksize, + stride=1, + padding='SAME', + weight_init=None, + bias=True, + relu=False, + scope=None, + **kwargs): + weight_init = weight_init or FLAGS.weight_init_method + trainable = kwargs.get('trainable', True) + + with tf.variable_scope(scope or 'conv2d'): + # convolution + kernel = _nn_variable( + 'weight', [ksize, ksize, n_in, n_out], + weight_init, + collection='weights' if trainable else None, + **kwargs) + y = tf.nn.conv2d(x, kernel, [1, stride, stride, 1], padding=padding) + # add bias + if bias is True: + bias = _nn_variable( + 'bias', [n_out], + 'zero', + collection='biases' if trainable else None, + **kwargs) + y = tf.nn.bias_add(y, bias) + # apply ReLU + if relu is True: + y = tf.nn.relu(y) + return y + + +def conv_relu(*args, **kwargs): + kwargs['relu'] = True + if 'scope' not in kwargs: + kwargs['scope'] = 'conv_relu' + return conv2d(*args, **kwargs) + + +def conv_bn_relu(*args, **kwargs): + kwargs['relu'] = True + if 'scope' not in kwargs: + kwargs['scope'] = 'conv_relu' + conv, relu = conv2d(*args, **kwargs) + with tf.variable_scope(kwargs['scope'] + '_bn'): + bn = tf.layers.batch_normalization( + conv, momentum=0.9, epsilon=1e-5, scale=True, training=True) + bn_relu = tf.nn.relu(bn) + return bn, bn_relu + + +def conv_relu_ori(*args, **kwargs): + kwargs['relu'] = True + if 'scope' not in kwargs: + kwargs['scope'] = 'conv_relu' + return conv2d_ori(*args, **kwargs) + + +def atrous_conv2d(x, + n_in, + n_out, + ksize, + dilation, + padding='SAME', + weight_init=None, + bias=True, + relu=False, + scope=None, + **kwargs): + weight_init = weight_init or FLAGS.weight_init_method + trainable = kwargs.get('trainable', True) + with tf.variable_scope(scope or 'atrous_conv2d'): + # atrous convolution + kernel = _nn_variable( + 'weight', [ksize, ksize, n_in, n_out], + weight_init, + collection='weights' if trainable else None, + **kwargs) + y = tf.nn.atrous_conv2d(x, kernel, dilation, padding=padding) + # add bias + if bias is True: + bias = _nn_variable( + 'bias', [n_out], + 'zero', + collection='biases' if trainable else None, + **kwargs) + y = tf.nn.bias_add(y, bias) + # apply ReLU + if relu is True: + y = tf.nn.relu(y) + return y + + +def avg_pool(x, ksize, stride, padding='SAME', scope=None): + with tf.variable_scope(scope or 'avg_pool'): + y = tf.nn.avg_pool(x, [1, ksize, ksize, 1], [1, stride, stride, 1], + padding) + return y + + +def max_pool(x, ksize, stride, padding='SAME', scope=None): + with tf.variable_scope(scope or 'max_pool'): + y = tf.nn.max_pool(x, [1, ksize, ksize, 1], [1, stride, stride, 1], + padding) + return y + + +def score_loss(gt_labels, match_scores, n_classes): + """ + Classification loss + ARGS + gt_labels: int32 [n] + match_scores: [n, n_classes] + RETURN + loss + """ + embeddings = tf.one_hot(tf.cast(gt_labels, tf.int64), n_classes, 1.0, 0.0) + losses = tf.nn.softmax_cross_entropy_with_logits(match_scores, embeddings) + return tf.reduce_sum(losses) + + +def smooth_l1_loss(offsets, gt_offsets, scope=None): + """ + Smooth L1 loss between offsets and encoded_gt + ARGS + offsets: [m?, 5], predicted offsets for one example + gt_offsets: [m?, 5], 
correponding groundtruth offsets + RETURN + loss: scalar + """ + with tf.variable_scope(scope or 'smooth_l1_loss'): + gt_offsets = tf.stop_gradient(gt_offsets) + diff = tf.abs(offsets - gt_offsets) + lesser_mask = tf.cast(tf.less(diff, 1.0), tf.float32) + larger_mask = 1.0 - lesser_mask + losses1 = (0.5 * tf.square(diff)) * lesser_mask + losses2 = (diff - 0.5) * larger_mask + return tf.reduce_sum(losses1 + losses2, 1) + + +def polygon_to_rboxe(polygon): + x1 = polygon[0] + y1 = polygon[1] + x2 = polygon[2] + y2 = polygon[3] + x3 = polygon[4] + y3 = polygon[5] + x4 = polygon[6] + y4 = polygon[7] + c_x = (x1 + x2 + x3 + x4) / 4 + c_y = (y1 + y2 + y3 + y4) / 4 + w1 = point_dist(x1, y1, x2, y2) + w2 = point_dist(x3, y3, x4, y4) + h1 = point_line_dist(c_x, c_y, x1, y1, x2, y2) + h2 = point_line_dist(c_x, c_y, x3, y3, x4, y4) + h = h1 + h2 + w = (w1 + w2) / 2 + theta1 = np.arctan2(y2 - y1, x2 - x1) + theta2 = np.arctan2(y3 - y4, x3 - x4) + theta = (theta1 + theta2) / 2 + return np.array([c_x, c_y, w, h, theta]) + + +def point_dist(x1, y1, x2, y2): + return np.sqrt((x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1)) + + +def point_line_dist(px, py, x1, y1, x2, y2): + eps = 1e-6 + dx = x2 - x1 + dy = y2 - y1 + div = np.sqrt(dx * dx + dy * dy) + eps + dist = np.abs(px * dy - py * dx + x2 * y1 - y2 * x1) / div + return dist + + +def get_combined_polygon(rboxes, resize_size): + image_w = resize_size[1] + image_h = resize_size[0] + img = np.zeros((image_h, image_w, 3), np.uint8) + for i in range(rboxes.shape[0]): + segment = np.reshape( + np.array(utils.rboxes_to_polygons(rboxes)[i, :], np.int32), + (-1, 1, 2)) + cv2.drawContours(img, [segment], 0, (255, 255, 255), -1) + img2gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + ret, thresh = cv2.threshold(img2gray, 127, 255, cv2.THRESH_BINARY) + im2, contours, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, + cv2.CHAIN_APPROX_SIMPLE) + if len(contours) > 0: + cnt = contours[0] + max_area = cv2.contourArea(cnt) + # get max_area + for cont in contours: + if cv2.contourArea(cont) > max_area: + cnt = cont + max_area = cv2.contourArea(cont) + rect = cv2.minAreaRect(cnt) + combined_polygon = np.array(cv2.boxPoints(rect)).reshape(-1) + else: + combined_polygon = np.array([0, 0, 0, 0, 0, 0, 0, 0]) + + return combined_polygon + + +def combine_segs(segs): + segs = np.asarray(segs) + assert segs.ndim == 2, 'invalid segs ndim' + assert segs.shape[-1] == 6, 'invalid segs shape' + + if len(segs) == 1: + cx = segs[0, 0] + cy = segs[0, 1] + w = segs[0, 2] + h = segs[0, 3] + theta_sin = segs[0, 4] + theta_cos = segs[0, 5] + theta = np.arctan2(theta_sin, theta_cos) + return np.array([cx, cy, w, h, theta]) + + # find the best straight line fitting all center points: y = kx + b + cxs = segs[:, 0] + cys = segs[:, 1] + + theta_coss = segs[:, 4] + theta_sins = segs[:, 5] + + bar_theta = np.arctan2(theta_sins.sum(), theta_coss.sum()) + k = np.tan(bar_theta) + b = np.mean(cys - k * cxs) + + proj_xs = (k * cys + cxs - k * b) / (k**2 + 1) + proj_ys = (k * k * cys + k * cxs + b) / (k**2 + 1) + proj_points = np.stack((proj_xs, proj_ys), -1) + + # find the max distance + max_dist = -1 + idx1 = -1 + idx2 = -1 + + for i in range(len(proj_points)): + point1 = proj_points[i, :] + for j in range(i + 1, len(proj_points)): + point2 = proj_points[j, :] + dist = np.sqrt(np.sum((point1 - point2)**2)) + if dist > max_dist: + idx1 = i + idx2 = j + max_dist = dist + assert idx1 >= 0 and idx2 >= 0 + # the bbox: bcx, bcy, bw, bh, average_theta + seg1 = segs[idx1, :] + seg2 = segs[idx2, :] + bcx, bcy = 
(seg1[:2] + seg2[:2]) / 2.0 + bh = np.mean(segs[:, 3]) + bw = max_dist + (seg1[2] + seg2[2]) / 2.0 + return bcx, bcy, bw, bh, bar_theta + + +def combine_segments_batch(segments_batch, group_indices_batch, + segment_counts_batch): + batch_size = 1 + combined_rboxes_batch = [] + combined_counts_batch = [] + for image_id in range(batch_size): + group_count = segment_counts_batch[image_id] + segments = segments_batch[image_id, :, :] + group_indices = group_indices_batch[image_id, :] + combined_rboxes = [] + for i in range(group_count): + segments_group = segments[np.where(group_indices == i)[0], :] + if segments_group.shape[0] > 0: + combined_rbox = combine_segs(segments_group) + combined_rboxes.append(combined_rbox) + combined_rboxes_batch.append(combined_rboxes) + combined_counts_batch.append(len(combined_rboxes)) + + max_count = np.max(combined_counts_batch) + for image_id in range(batch_size): + if not combined_counts_batch[image_id] == max_count: + combined_rboxes_pad = (max_count - combined_counts_batch[image_id] + ) * [RBOX_DIM * [0.0]] + combined_rboxes_batch[image_id] = np.vstack( + (combined_rboxes_batch[image_id], + np.array(combined_rboxes_pad))) + + return np.asarray(combined_rboxes_batch, + np.float32), np.asarray(combined_counts_batch, np.int32) + + +# combine_segments rewrite in python version +def combine_segments_python(segments, group_indices, segment_counts): + combined_rboxes, combined_counts = tf.py_func( + combine_segments_batch, [segments, group_indices, segment_counts], + [tf.float32, tf.int32]) + return combined_rboxes, combined_counts + + +# decode_segments_links rewrite in python version +def get_coord(offsets, map_size, offsets_defaults): + if offsets < offsets_defaults[1][0]: + l_idx = 0 + x = offsets % map_size[0][1] + y = offsets // map_size[0][1] + elif offsets < offsets_defaults[2][0]: + l_idx = 1 + x = (offsets - offsets_defaults[1][0]) % map_size[1][1] + y = (offsets - offsets_defaults[1][0]) // map_size[1][1] + elif offsets < offsets_defaults[3][0]: + l_idx = 2 + x = (offsets - offsets_defaults[2][0]) % map_size[2][1] + y = (offsets - offsets_defaults[2][0]) // map_size[2][1] + elif offsets < offsets_defaults[4][0]: + l_idx = 3 + x = (offsets - offsets_defaults[3][0]) % map_size[3][1] + y = (offsets - offsets_defaults[3][0]) // map_size[3][1] + elif offsets < offsets_defaults[5][0]: + l_idx = 4 + x = (offsets - offsets_defaults[4][0]) % map_size[4][1] + y = (offsets - offsets_defaults[4][0]) // map_size[4][1] + else: + l_idx = 5 + x = (offsets - offsets_defaults[5][0]) % map_size[5][1] + y = (offsets - offsets_defaults[5][0]) // map_size[5][1] + + return l_idx, x, y + + +def get_coord_link(offsets, map_size, offsets_defaults): + if offsets < offsets_defaults[1][1]: + offsets_node = offsets // N_LOCAL_LINKS + link_idx = offsets % N_LOCAL_LINKS + else: + offsets_node = (offsets - offsets_defaults[1][1]) // ( + N_LOCAL_LINKS + N_CROSS_LINKS) + offsets_defaults[1][0] + link_idx = (offsets - offsets_defaults[1][1]) % ( + N_LOCAL_LINKS + N_CROSS_LINKS) + l_idx, x, y = get_coord(offsets_node, map_size, offsets_defaults) + return l_idx, x, y, link_idx + + +def is_valid_coord(l_idx, x, y, map_size): + w = map_size[l_idx][1] + h = map_size[l_idx][0] + return x >= 0 and x < w and y >= 0 and y < h + + +def get_neighbours(l_idx, x, y, map_size, offsets_defaults): + if l_idx == 0: + coord = [(0, x - 1, y - 1), (0, x, y - 1), (0, x + 1, y - 1), + (0, x - 1, y), (0, x + 1, y), (0, x - 1, y + 1), + (0, x, y + 1), (0, x + 1, y + 1)] + else: + coord = [(l_idx, x - 1, y - 1), 
+ (l_idx, x, y - 1), (l_idx, x + 1, y - 1), (l_idx, x - 1, y), + (l_idx, x + 1, y), (l_idx, x - 1, y + 1), (l_idx, x, y + 1), + (l_idx, x + 1, y + 1), (l_idx - 1, 2 * x, 2 * y), + (l_idx - 1, 2 * x + 1, 2 * y), (l_idx - 1, 2 * x, 2 * y + 1), + (l_idx - 1, 2 * x + 1, 2 * y + 1)] + neighbours_offsets = [] + link_idx = 0 + for nl_idx, nx, ny in coord: + if is_valid_coord(nl_idx, nx, ny, map_size): + neighbours_offset_node = offsets_defaults[nl_idx][ + 0] + map_size[nl_idx][1] * ny + nx + if l_idx == 0: + neighbours_offset_link = offsets_defaults[l_idx][1] + ( + map_size[l_idx][1] * y + x) * N_LOCAL_LINKS + link_idx + else: + off_tmp = (map_size[l_idx][1] * y + x) * ( + N_LOCAL_LINKS + N_CROSS_LINKS) + neighbours_offset_link = offsets_defaults[l_idx][ + 1] + off_tmp + link_idx + neighbours_offsets.append( + [neighbours_offset_node, neighbours_offset_link, link_idx]) + link_idx += 1 + # [node_offsets, link_offsets, link_idx(0-7/11)] + return neighbours_offsets + + +def decode_segments_links_python(image_size, all_nodes, all_links, all_reg, + anchor_sizes): + batch_size = 1 # FLAGS.test_batch_size + # offsets = 12285 #768 + all_nodes_flat = tf.concat( + [tf.reshape(o, [batch_size, -1, N_SEG_CLASSES]) for o in all_nodes], + axis=1) + all_links_flat = tf.concat( + [tf.reshape(o, [batch_size, -1, N_LNK_CLASSES]) for o in all_links], + axis=1) + all_reg_flat = tf.concat( + [tf.reshape(o, [batch_size, -1, OFFSET_DIM]) for o in all_reg], axis=1) + segments, group_indices, segment_counts, group_indices_all = tf.py_func( + decode_batch, [ + all_nodes_flat, all_links_flat, all_reg_flat, image_size, + tf.constant(anchor_sizes) + ], [tf.float32, tf.int32, tf.int32, tf.int32]) + return segments, group_indices, segment_counts, group_indices_all + + +def decode_segments_links_train(image_size, all_nodes, all_links, all_reg, + anchor_sizes): + batch_size = FLAGS.train_batch_size + # offsets = 12285 #768 + all_nodes_flat = tf.concat( + [tf.reshape(o, [batch_size, -1, N_SEG_CLASSES]) for o in all_nodes], + axis=1) + all_links_flat = tf.concat( + [tf.reshape(o, [batch_size, -1, N_LNK_CLASSES]) for o in all_links], + axis=1) + all_reg_flat = tf.concat( + [tf.reshape(o, [batch_size, -1, OFFSET_DIM]) for o in all_reg], axis=1) + segments, group_indices, segment_counts, group_indices_all = tf.py_func( + decode_batch, [ + all_nodes_flat, all_links_flat, all_reg_flat, image_size, + tf.constant(anchor_sizes) + ], [tf.float32, tf.int32, tf.int32, tf.int32]) + return segments, group_indices, segment_counts, group_indices_all + + +def decode_batch(all_nodes, all_links, all_reg, image_size, anchor_sizes): + batch_size = all_nodes.shape[0] + batch_segments = [] + batch_group_indices = [] + batch_segments_counts = [] + batch_group_indices_all = [] + for image_id in range(batch_size): + image_node_scores = all_nodes[image_id, :, :] + image_link_scores = all_links[image_id, :, :] + image_reg = all_reg[image_id, :, :] + image_segments, image_group_indices, image_segments_counts, image_group_indices_all = decode_image( + image_node_scores, image_link_scores, image_reg, image_size, + anchor_sizes) + batch_segments.append(image_segments) + batch_group_indices.append(image_group_indices) + batch_segments_counts.append(image_segments_counts) + batch_group_indices_all.append(image_group_indices_all) + max_count = np.max(batch_segments_counts) + for image_id in range(batch_size): + if not batch_segments_counts[image_id] == max_count: + batch_segments_pad = (max_count - batch_segments_counts[image_id] + ) * [OFFSET_DIM * [0.0]] + 
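+            # Pad this image's segment array with all-zero rows (its group
+            # indices are padded with -1 just below) so that every image in
+            # the batch ends up with max_count rows.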
batch_segments[image_id] = np.vstack( + (batch_segments[image_id], np.array(batch_segments_pad))) + batch_group_indices[image_id] = np.hstack( + (batch_group_indices[image_id], + np.array( + (max_count - batch_segments_counts[image_id]) * [-1]))) + return np.asarray(batch_segments, np.float32), np.asarray( + batch_group_indices, + np.int32), np.asarray(batch_segments_counts, + np.int32), np.asarray(batch_group_indices_all, + np.int32) + + +def decode_image(image_node_scores, image_link_scores, image_reg, image_size, + anchor_sizes): + map_size = [] + offsets_defaults = [] + offsets_default_node = 0 + offsets_default_link = 0 + for i in range(N_DET_LAYERS): + offsets_defaults.append([offsets_default_node, offsets_default_link]) + map_size.append(image_size // (2**(2 + i))) + offsets_default_node += map_size[i][0] * map_size[i][1] + if i == 0: + offsets_default_link += map_size[i][0] * map_size[i][ + 1] * N_LOCAL_LINKS + else: + offsets_default_link += map_size[i][0] * map_size[i][1] * ( + N_LOCAL_LINKS + N_CROSS_LINKS) + + image_group_indices_all = decode_image_by_join(image_node_scores, + image_link_scores, + FLAGS.node_threshold, + FLAGS.link_threshold, + map_size, offsets_defaults) + image_group_indices_all -= 1 + image_group_indices = image_group_indices_all[np.where( + image_group_indices_all >= 0)[0]] + image_segments_counts = len(image_group_indices) + # convert image_reg to segments with scores(OFFSET_DIM+1) + image_segments = np.zeros((image_segments_counts, OFFSET_DIM), + dtype=np.float32) + for i, offsets in enumerate(np.where(image_group_indices_all >= 0)[0]): + encoded_cx = image_reg[offsets, 0] + encoded_cy = image_reg[offsets, 1] + encoded_width = image_reg[offsets, 2] + encoded_height = image_reg[offsets, 3] + encoded_theta_cos = image_reg[offsets, 4] + encoded_theta_sin = image_reg[offsets, 5] + + l_idx, x, y = get_coord(offsets, map_size, offsets_defaults) + rs = anchor_sizes[l_idx] + eps = 1e-6 + image_segments[i, 0] = encoded_cx * rs + (2**(2 + l_idx)) * (x + 0.5) + image_segments[i, 1] = encoded_cy * rs + (2**(2 + l_idx)) * (y + 0.5) + image_segments[i, 2] = np.exp(encoded_width) * rs - eps + image_segments[i, 3] = np.exp(encoded_height) * rs - eps + image_segments[i, 4] = encoded_theta_cos + image_segments[i, 5] = encoded_theta_sin + + return image_segments, image_group_indices, image_segments_counts, image_group_indices_all + + +def decode_image_by_join(node_scores, link_scores, node_threshold, + link_threshold, map_size, offsets_defaults): + node_mask = node_scores[:, POS_LABEL] >= node_threshold + link_mask = link_scores[:, POS_LABEL] >= link_threshold + group_mask = np.zeros_like(node_mask, np.int32) - 1 + offsets_pos = np.where(node_mask == 1)[0] + + def find_parent(point): + return group_mask[point] + + def set_parent(point, parent): + group_mask[point] = parent + + def is_root(point): + return find_parent(point) == -1 + + def find_root(point): + root = point + update_parent = False + while not is_root(root): + root = find_parent(root) + update_parent = True + + # for acceleration of find_root + if update_parent: + set_parent(point, root) + + return root + + def join(p1, p2): + root1 = find_root(p1) + root2 = find_root(p2) + + if root1 != root2: + set_parent(root1, root2) + + def get_all(): + root_map = {} + + def get_index(root): + if root not in root_map: + root_map[root] = len(root_map) + 1 + return root_map[root] + + mask = np.zeros_like(node_mask, dtype=np.int32) + for i, point in enumerate(offsets_pos): + point_root = find_root(point) + bbox_idx = 
get_index(point_root) + mask[point] = bbox_idx + return mask + + # join by link + pos_link = 0 + for i, offsets in enumerate(offsets_pos): + l_idx, x, y = get_coord(offsets, map_size, offsets_defaults) + neighbours = get_neighbours(l_idx, x, y, map_size, offsets_defaults) + for n_idx, noffsets in enumerate(neighbours): + link_value = link_mask[noffsets[1]] + node_cls = node_mask[noffsets[0]] + if link_value and node_cls: + pos_link += 1 + join(offsets, noffsets[0]) + # print(pos_link) + mask = get_all() + return mask + + +def get_link_mask(node_mask, offsets_defaults, link_max): + link_mask = np.zeros_like(link_max) + link_mask[0:offsets_defaults[1][1]] = np.tile( + node_mask[0:offsets_defaults[1][0]], + (N_LOCAL_LINKS, 1)).transpose().reshape(offsets_defaults[1][1]) + link_mask[offsets_defaults[1][1]:offsets_defaults[2][1]] = np.tile( + node_mask[offsets_defaults[1][0]:offsets_defaults[2][0]], + (N_LOCAL_LINKS + N_CROSS_LINKS, 1)).transpose().reshape( + (offsets_defaults[2][1] - offsets_defaults[1][1])) + link_mask[offsets_defaults[2][1]:offsets_defaults[3][1]] = np.tile( + node_mask[offsets_defaults[2][0]:offsets_defaults[3][0]], + (N_LOCAL_LINKS + N_CROSS_LINKS, 1)).transpose().reshape( + (offsets_defaults[3][1] - offsets_defaults[2][1])) + link_mask[offsets_defaults[3][1]:offsets_defaults[4][1]] = np.tile( + node_mask[offsets_defaults[3][0]:offsets_defaults[4][0]], + (N_LOCAL_LINKS + N_CROSS_LINKS, 1)).transpose().reshape( + (offsets_defaults[4][1] - offsets_defaults[3][1])) + link_mask[offsets_defaults[4][1]:offsets_defaults[5][1]] = np.tile( + node_mask[offsets_defaults[4][0]:offsets_defaults[5][0]], + (N_LOCAL_LINKS + N_CROSS_LINKS, 1)).transpose().reshape( + (offsets_defaults[5][1] - offsets_defaults[4][1])) + link_mask[offsets_defaults[5][1]:] = np.tile( + node_mask[offsets_defaults[5][0]:], + (N_LOCAL_LINKS + N_CROSS_LINKS, 1)).transpose().reshape( + (len(link_mask) - offsets_defaults[5][1])) + + return link_mask + + +def get_link8(link_scores_raw, map_size): + # link[i-1] -local- start -16- end -cross- link[i] + link8_mask = np.zeros((link_scores_raw.shape[0])) + for i in range(N_DET_LAYERS): + if i == 0: + offsets_start = map_size[i][0] * map_size[i][1] * N_LOCAL_LINKS + offsets_end = map_size[i][0] * map_size[i][1] * ( + N_LOCAL_LINKS + 16) + offsets_link = map_size[i][0] * map_size[i][1] * ( + N_LOCAL_LINKS + 16) + link8_mask[:offsets_start] = 1 + else: + offsets_start = offsets_link + map_size[i][0] * map_size[i][ + 1] * N_LOCAL_LINKS + offsets_end = offsets_link + map_size[i][0] * map_size[i][1] * ( + N_LOCAL_LINKS + 16) + offsets_link_pre = offsets_link + offsets_link += map_size[i][0] * map_size[i][1] * ( + N_LOCAL_LINKS + 16 + N_CROSS_LINKS) + link8_mask[offsets_link_pre:offsets_start] = 1 + link8_mask[offsets_end:offsets_link] = 1 + return link_scores_raw[np.where(link8_mask > 0)[0], :] + + +def decode_image_by_mutex(node_scores, link_scores, node_threshold, + link_threshold, map_size, offsets_defaults): + node_mask = node_scores[:, POS_LABEL] >= node_threshold + link_pos = link_scores[:, POS_LABEL] + link_mut = link_scores[:, MUT_LABEL] + link_max = np.max(np.vstack((link_pos, link_mut)), axis=0) + + offsets_pos_list = np.where(node_mask == 1)[0].tolist() + + link_mask_th = link_max >= link_threshold + link_mask = get_link_mask(node_mask, offsets_defaults, link_max) + offsets_link_max = np.argsort(-(link_max * link_mask * link_mask_th)) + offsets_link_max = offsets_link_max[:len(offsets_pos_list) * 8] + + group_mask = np.zeros_like(node_mask, dtype=np.int32) - 1 + 
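+    # group_mask is a disjoint-set forest over node offsets: -1 marks a root,
+    # any other value is the parent offset. mutex_mask (initialised below)
+    # records, for each offset, the group roots it must never be merged with.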
mutex_mask = len(node_mask) * [[]] + + def find_parent(point): + return group_mask[point] + + def set_parent(point, parent): + group_mask[point] = parent + + def set_mutex_constraint(point, mutex_point_list): + mutex_mask[point] = mutex_point_list + + def find_mutex_constraint(point): + mutex_point_list = mutex_mask[point] + # update mutex_point_list + mutex_point_list_new = [] + if not mutex_point_list == []: + for mutex_point in mutex_point_list: + if not is_root(mutex_point): + mutex_point = find_root(mutex_point) + if mutex_point not in mutex_point_list_new: + mutex_point_list_new.append(mutex_point) + set_mutex_constraint(point, mutex_point_list_new) + return mutex_point_list_new + + def combine_mutex_constraint(point, parent): + mutex_point_list = find_mutex_constraint(point) + mutex_parent_list = find_mutex_constraint(parent) + for mutex_point in mutex_point_list: + if not is_root(mutex_point): + mutex_point = find_root(mutex_point) + if mutex_point not in mutex_parent_list: + mutex_parent_list.append(mutex_point) + set_mutex_constraint(parent, mutex_parent_list) + + def add_mutex_constraint(p1, p2): + mutex_point_list1 = find_mutex_constraint(p1) + mutex_point_list2 = find_mutex_constraint(p2) + + if p1 not in mutex_point_list2: + mutex_point_list2.append(p1) + if p2 not in mutex_point_list1: + mutex_point_list1.append(p2) + set_mutex_constraint(p1, mutex_point_list1) + set_mutex_constraint(p2, mutex_point_list2) + + def is_root(point): + return find_parent(point) == -1 + + def find_root(point): + root = point + update_parent = False + while not is_root(root): + root = find_parent(root) + update_parent = True + + # for acceleration of find_root + if update_parent: + set_parent(point, root) + + return root + + def join(p1, p2): + root1 = find_root(p1) + root2 = find_root(p2) + + if root1 != root2 and (root1 not in find_mutex_constraint(root2)): + set_parent(root1, root2) + combine_mutex_constraint(root1, root2) + + def disjoin(p1, p2): + root1 = find_root(p1) + root2 = find_root(p2) + + if root1 != root2: + add_mutex_constraint(root1, root2) + + def get_all(): + root_map = {} + + def get_index(root): + if root not in root_map: + root_map[root] = len(root_map) + 1 + return root_map[root] + + mask = np.zeros_like(node_mask, dtype=np.int32) + for _, point in enumerate(offsets_pos_list): + point_root = find_root(point) + bbox_idx = get_index(point_root) + mask[point] = bbox_idx + return mask + + # join by link + pos_link = 0 + mut_link = 0 + for _, offsets_link in enumerate(offsets_link_max): + l_idx, x, y, link_idx = get_coord_link(offsets_link, map_size, + offsets_defaults) + offsets = offsets_defaults[l_idx][0] + map_size[l_idx][1] * y + x + if offsets in offsets_pos_list: + neighbours = get_neighbours(l_idx, x, y, map_size, + offsets_defaults) + if not len(np.where(np.array(neighbours)[:, + 2] == link_idx)[0]) == 0: + noffsets = neighbours[np.where( + np.array(neighbours)[:, 2] == link_idx)[0][0]] + link_pos_value = link_pos[noffsets[1]] + link_mut_value = link_mut[noffsets[1]] + node_cls = node_mask[noffsets[0]] + if node_cls and (link_pos_value > link_mut_value): + pos_link += 1 + join(offsets, noffsets[0]) + elif node_cls and (link_pos_value < link_mut_value): + mut_link += 1 + disjoin(offsets, noffsets[0]) + + mask = get_all() + return mask diff --git a/modelscope/pipelines/cv/ocr_utils/resnet18_v1.py b/modelscope/pipelines/cv/ocr_utils/resnet18_v1.py new file mode 100644 index 00000000..6371d4e5 --- /dev/null +++ b/modelscope/pipelines/cv/ocr_utils/resnet18_v1.py @@ -0,0 +1,432 
@@ +"""Contains definitions for the original form of Residual Networks. +The 'v1' residual networks (ResNets) implemented in this module were proposed +by: +[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Deep Residual Learning for Image Recognition. arXiv:1512.03385 +Other variants were introduced in: +[2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Identity Mappings in Deep Residual Networks. arXiv: 1603.05027 +The networks defined in this module utilize the bottleneck building block of +[1] with projection shortcuts only for increasing depths. They employ batch +normalization *after* every weight layer. This is the architecture used by +MSRA in the Imagenet and MSCOCO 2016 competition models ResNet-101 and +ResNet-152. See [2; Fig. 1a] for a comparison between the current 'v1' +architecture and the alternative 'v2' architecture of [2] which uses batch +normalization *before* every weight layer in the so-called full pre-activation +units. +Typical use: + from tensorflow.contrib.slim.nets import resnet_v1 +ResNet-101 for image classification into 1000 classes: + # inputs has shape [batch, 224, 224, 3] + with slim.arg_scope(resnet_v1.resnet_arg_scope()): + net, end_points = resnet_v1.resnet_v1_101(inputs, 1000, is_training=False) +ResNet-101 for semantic segmentation into 21 classes: + # inputs has shape [batch, 513, 513, 3] + with slim.arg_scope(resnet_v1.resnet_arg_scope()): + net, end_points = resnet_v1.resnet_v1_101(inputs, + 21, + is_training=False, + global_pool=False, + output_stride=16) +""" +import tensorflow as tf +import tf_slim as slim + +from . import resnet_utils + +if tf.__version__ >= '2.0': + tf = tf.compat.v1 + +resnet_arg_scope = resnet_utils.resnet_arg_scope + + +@slim.add_arg_scope +def basicblock(inputs, + depth, + depth_bottleneck, + stride, + rate=1, + outputs_collections=None, + scope=None): + """Bottleneck residual unit variant with BN after convolutions. + This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for + its definition. Note that we use here the bottleneck variant which has an + extra bottleneck layer. + When putting together two consecutive ResNet blocks that use this unit, one + should use stride = 2 in the last unit of the first block. + Args: + inputs: A tensor of size [batch, height, width, channels]. + depth: The depth of the ResNet unit output. + depth_bottleneck: The depth of the bottleneck layers. + stride: The ResNet unit's stride. Determines the amount of downsampling of + the units output compared to its input. + rate: An integer, rate for atrous convolution. + outputs_collections: Collection to add the ResNet unit output. + scope: Optional variable_scope. + Returns: + The ResNet unit's output. 
+ """ + with tf.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc: + depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4) + if depth == depth_in: + shortcut = resnet_utils.subsample(inputs, stride, 'shortcut') + else: + shortcut = slim.conv2d( + inputs, + depth, [1, 1], + stride=stride, + activation_fn=None, + scope='shortcut') + + residual = resnet_utils.conv2d_same( + inputs, depth, 3, stride, rate=rate, scope='conv1') + residual = resnet_utils.conv2d_same( + residual, depth, 3, 1, rate=rate, scope='conv2') + + output = tf.nn.relu(residual + shortcut) + + return slim.utils.collect_named_outputs(outputs_collections, + sc.original_name_scope, output) + + +@slim.add_arg_scope +def bottleneck(inputs, + depth, + depth_bottleneck, + stride, + rate=1, + outputs_collections=None, + scope=None): + """Bottleneck residual unit variant with BN after convolutions. + This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for + its definition. Note that we use here the bottleneck variant which has an + extra bottleneck layer. + When putting together two consecutive ResNet blocks that use this unit, one + should use stride = 2 in the last unit of the first block. + Args: + inputs: A tensor of size [batch, height, width, channels]. + depth: The depth of the ResNet unit output. + depth_bottleneck: The depth of the bottleneck layers. + stride: The ResNet unit's stride. Determines the amount of downsampling of + the units output compared to its input. + rate: An integer, rate for atrous convolution. + outputs_collections: Collection to add the ResNet unit output. + scope: Optional variable_scope. + Returns: + The ResNet unit's output. + """ + with tf.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc: + depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4) + if depth == depth_in: + shortcut = resnet_utils.subsample(inputs, stride, 'shortcut') + else: + shortcut = slim.conv2d( + inputs, + depth, [1, 1], + stride=stride, + activation_fn=None, + scope='shortcut') + + residual = slim.conv2d( + inputs, depth_bottleneck, [1, 1], stride=1, scope='conv1') + residual = resnet_utils.conv2d_same( + residual, depth_bottleneck, 3, stride, rate=rate, scope='conv2') + residual = slim.conv2d( + residual, + depth, [1, 1], + stride=1, + activation_fn=None, + scope='conv3') + + output = tf.nn.relu(shortcut + residual) + + return slim.utils.collect_named_outputs(outputs_collections, + sc.original_name_scope, output) + + +def resnet_v1(inputs, + blocks, + num_classes=None, + is_training=True, + global_pool=True, + output_stride=None, + include_root_block=True, + spatial_squeeze=True, + reuse=None, + scope=None): + """Generator for v1 ResNet models. + This function generates a family of ResNet v1 models. See the resnet_v1_*() + methods for specific model instantiations, obtained by selecting different + block instantiations that produce ResNets of various depths. + Training for image classification on Imagenet is usually done with [224, 224] + inputs, resulting in [7, 7] feature maps at the output of the last ResNet + block for the ResNets defined in [1] that have nominal stride equal to 32. + However, for dense prediction tasks we advise that one uses inputs with + spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. 
In + this case the feature maps at the ResNet output will have spatial shape + [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1] + and corners exactly aligned with the input image corners, which greatly + facilitates alignment of the features to the image. Using as input [225, 225] + images results in [8, 8] feature maps at the output of the last ResNet block. + For dense prediction tasks, the ResNet needs to run in fully-convolutional + (FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all + have nominal stride equal to 32 and a good choice in FCN mode is to use + output_stride=16 in order to increase the density of the computed features at + small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915. + Args: + inputs: A tensor of size [batch, height_in, width_in, channels]. + blocks: A list of length equal to the number of ResNet blocks. Each element + is a resnet_utils.Block object describing the units in the block. + num_classes: Number of predicted classes for classification tasks. If None + we return the features before the logit layer. + is_training: whether is training or not. + global_pool: If True, we perform global average pooling before computing the + logits. Set to True for image classification, False for dense prediction. + output_stride: If None, then the output will be computed at the nominal + network stride. If output_stride is not None, it specifies the requested + ratio of input to output spatial resolution. + include_root_block: If True, include the initial convolution followed by + max-pooling, if False excludes it. + spatial_squeeze: if True, logits is of shape [B, C], if false logits is + of shape [B, 1, 1, C], where B is batch_size and C is number of classes. + reuse: whether or not the network and its variables should be reused. To be + able to reuse 'scope' must be given. + scope: Optional variable_scope. + Returns: + net: A rank-4 tensor of size [batch, height_out, width_out, channels_out]. + If global_pool is False, then height_out and width_out are reduced by a + factor of output_stride compared to the respective height_in and width_in, + else both height_out and width_out equal one. If num_classes is None, then + net is the output of the last ResNet block, potentially after global + average pooling. If num_classes is not None, net contains the pre-softmax + activations. + end_points: A dictionary from components of the network to the corresponding + activation. + Raises: + ValueError: If the target output_stride is not valid. + """ + with tf.variable_scope(scope, 'resnet_v1', [inputs], reuse=reuse) as sc: + end_points_collection = sc.name + '_end_points' + with slim.arg_scope( + [slim.conv2d, bottleneck, resnet_utils.stack_blocks_dense], + outputs_collections=end_points_collection): + with slim.arg_scope([slim.batch_norm], is_training=is_training): + net = inputs + if include_root_block: + if output_stride is not None: + if output_stride % 4 != 0: + raise ValueError( + 'The output_stride needs to be a multiple of 4.' 
+ ) + output_stride /= 4 + net = resnet_utils.conv2d_same( + net, 64, 7, stride=2, scope='conv1') + net = tf.pad(net, [[0, 0], [1, 1], [1, 1], [0, 0]]) + net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1') + + net = slim.utils.collect_named_outputs( + end_points_collection, 'pool2', net) + + net = resnet_utils.stack_blocks_dense(net, blocks, + output_stride) + + end_points = slim.utils.convert_collection_to_dict( + end_points_collection) + + end_points['pool1'] = end_points['resnet_v1_18/block2/unit_2'] + end_points['pool2'] = end_points['resnet_v1_18/block3/unit_2'] + end_points['pool3'] = end_points['resnet_v1_18/block4/unit_2'] + end_points['pool4'] = end_points['resnet_v1_18/block5/unit_2'] + end_points['pool5'] = end_points['resnet_v1_18/block6/unit_2'] + end_points['pool6'] = net + + return net, end_points + + +resnet_v1.default_image_size = 224 + + +def resnet_v1_18(inputs, + num_classes=None, + is_training=True, + global_pool=True, + output_stride=None, + spatial_squeeze=True, + reuse=None, + scope='resnet_v1_18'): + """ResNet-18 model of [1]. See resnet_v1() for arg and return description.""" + blocks = [ + resnet_utils.Block('block1', basicblock, + [(64, 64, 1)] + [(64, 64, 1)]), + resnet_utils.Block('block2', basicblock, + [(128, 128, 1)] + [(128, 128, 1)]), + resnet_utils.Block('block3', basicblock, + [(256, 256, 2)] + [(256, 256, 1)]), + resnet_utils.Block('block4', basicblock, + [(512, 512, 2)] + [(512, 512, 1)]), + resnet_utils.Block('block5', basicblock, + [(256, 256, 2)] + [(256, 256, 1)]), + resnet_utils.Block('block6', basicblock, + [(256, 256, 2)] + [(256, 256, 1)]), + resnet_utils.Block('block7', basicblock, + [(256, 256, 2)] + [(256, 256, 1)]), + ] + return resnet_v1( + inputs, + blocks, + num_classes, + is_training, + global_pool=global_pool, + output_stride=output_stride, + include_root_block=True, + spatial_squeeze=spatial_squeeze, + reuse=reuse, + scope=scope) + + +resnet_v1_18.default_image_size = resnet_v1.default_image_size + + +def resnet_v1_50(inputs, + num_classes=None, + is_training=True, + global_pool=True, + output_stride=None, + spatial_squeeze=True, + reuse=None, + scope='resnet_v1_50'): + """ResNet-50 model of [1]. See resnet_v1() for arg and return description.""" + blocks = [ + resnet_utils.Block('block1', bottleneck, + [(256, 64, 1)] * 2 + [(256, 64, 2)]), + resnet_utils.Block('block2', bottleneck, + [(512, 128, 1)] * 3 + [(512, 128, 2)]), + resnet_utils.Block('block3', bottleneck, + [(1024, 256, 1)] * 5 + [(1024, 256, 2)]), + resnet_utils.Block('block4', bottleneck, + [(2048, 512, 1)] * 3 + [(2048, 512, 2)]), + resnet_utils.Block('block5', bottleneck, + [(1024, 256, 1)] * 2 + [(1024, 256, 2)]), + resnet_utils.Block('block6', bottleneck, [(1024, 256, 1)] * 2), + ] + return resnet_v1( + inputs, + blocks, + num_classes, + is_training, + global_pool=global_pool, + output_stride=output_stride, + include_root_block=True, + spatial_squeeze=spatial_squeeze, + reuse=reuse, + scope=scope) + + +resnet_v1_50.default_image_size = resnet_v1.default_image_size + + +def resnet_v1_101(inputs, + num_classes=None, + is_training=True, + global_pool=True, + output_stride=None, + spatial_squeeze=True, + reuse=None, + scope='resnet_v1_101'): + """ResNet-101 model of [1]. 
See resnet_v1() for arg and return description.""" + blocks = [ + resnet_utils.Block('block1', bottleneck, + [(256, 64, 1)] * 2 + [(256, 64, 2)]), + resnet_utils.Block('block2', bottleneck, + [(512, 128, 1)] * 3 + [(512, 128, 2)]), + resnet_utils.Block('block3', bottleneck, + [(1024, 256, 1)] * 22 + [(1024, 256, 2)]), + resnet_utils.Block('block4', bottleneck, [(2048, 512, 1)] * 3) + ] + return resnet_v1( + inputs, + blocks, + num_classes, + is_training, + global_pool=global_pool, + output_stride=output_stride, + include_root_block=True, + spatial_squeeze=spatial_squeeze, + reuse=reuse, + scope=scope) + + +resnet_v1_101.default_image_size = resnet_v1.default_image_size + + +def resnet_v1_152(inputs, + num_classes=None, + is_training=True, + global_pool=True, + output_stride=None, + spatial_squeeze=True, + reuse=None, + scope='resnet_v1_152'): + """ResNet-152 model of [1]. See resnet_v1() for arg and return description.""" + blocks = [ + resnet_utils.Block('block1', bottleneck, + [(256, 64, 1)] * 2 + [(256, 64, 2)]), + resnet_utils.Block('block2', bottleneck, + [(512, 128, 1)] * 7 + [(512, 128, 2)]), + resnet_utils.Block('block3', bottleneck, + [(1024, 256, 1)] * 35 + [(1024, 256, 2)]), + resnet_utils.Block('block4', bottleneck, [(2048, 512, 1)] * 3) + ] + return resnet_v1( + inputs, + blocks, + num_classes, + is_training, + global_pool=global_pool, + output_stride=output_stride, + include_root_block=True, + spatial_squeeze=spatial_squeeze, + reuse=reuse, + scope=scope) + + +resnet_v1_152.default_image_size = resnet_v1.default_image_size + + +def resnet_v1_200(inputs, + num_classes=None, + is_training=True, + global_pool=True, + output_stride=None, + spatial_squeeze=True, + reuse=None, + scope='resnet_v1_200'): + """ResNet-200 model of [2]. See resnet_v1() for arg and return description.""" + blocks = [ + resnet_utils.Block('block1', bottleneck, + [(256, 64, 1)] * 2 + [(256, 64, 2)]), + resnet_utils.Block('block2', bottleneck, + [(512, 128, 1)] * 23 + [(512, 128, 2)]), + resnet_utils.Block('block3', bottleneck, + [(1024, 256, 1)] * 35 + [(1024, 256, 2)]), + resnet_utils.Block('block4', bottleneck, [(2048, 512, 1)] * 3) + ] + return resnet_v1( + inputs, + blocks, + num_classes, + is_training, + global_pool=global_pool, + output_stride=output_stride, + include_root_block=True, + spatial_squeeze=spatial_squeeze, + reuse=reuse, + scope=scope) + + +resnet_v1_200.default_image_size = resnet_v1.default_image_size + +if __name__ == '__main__': + input = tf.placeholder(tf.float32, shape=(None, 224, 224, 3), name='input') + with slim.arg_scope(resnet_arg_scope()) as sc: + logits = resnet_v1_50(input) diff --git a/modelscope/pipelines/cv/ocr_utils/resnet_utils.py b/modelscope/pipelines/cv/ocr_utils/resnet_utils.py new file mode 100644 index 00000000..e0e240c8 --- /dev/null +++ b/modelscope/pipelines/cv/ocr_utils/resnet_utils.py @@ -0,0 +1,231 @@ +"""Contains building blocks for various versions of Residual Networks. +Residual networks (ResNets) were proposed in: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Deep Residual Learning for Image Recognition. arXiv:1512.03385, 2015 +More variants were introduced in: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Identity Mappings in Deep Residual Networks. arXiv: 1603.05027, 2016 +We can obtain different ResNet variants by changing the network depth, width, +and form of residual unit. This module implements the infrastructure for +building them. 
Concrete ResNet units and full ResNet networks are implemented in +the accompanying resnet_v1.py and resnet_v2.py modules. +Compared to https://github.com/KaimingHe/deep-residual-networks, in the current +implementation we subsample the output activations in the last residual unit of +each block, instead of subsampling the input activations in the first residual +unit of each block. The two implementations give identical results but our +implementation is more memory efficient. +""" + +import collections + +import tensorflow as tf +import tf_slim as slim + +if tf.__version__ >= '2.0': + tf = tf.compat.v1 + + +class Block(collections.namedtuple('Block', ['scope', 'unit_fn', 'args'])): + """A named tuple describing a ResNet block. + Its parts are: + scope: The scope of the `Block`. + unit_fn: The ResNet unit function which takes as input a `Tensor` and + returns another `Tensor` with the output of the ResNet unit. + args: A list of length equal to the number of units in the `Block`. The list + contains one (depth, depth_bottleneck, stride) tuple for each unit in the + block to serve as argument to unit_fn. + """ + + +def subsample(inputs, factor, scope=None): + """Subsamples the input along the spatial dimensions. + Args: + inputs: A `Tensor` of size [batch, height_in, width_in, channels]. + factor: The subsampling factor. + scope: Optional variable_scope. + Returns: + output: A `Tensor` of size [batch, height_out, width_out, channels] with the + input, either intact (if factor == 1) or subsampled (if factor > 1). + """ + if factor == 1: + return inputs + else: + return slim.max_pool2d(inputs, [1, 1], stride=factor, scope=scope) + + +def conv2d_same(inputs, num_outputs, kernel_size, stride, rate=1, scope=None): + """Strided 2-D convolution with 'SAME' padding. + When stride > 1, then we do explicit zero-padding, followed by conv2d with + 'VALID' padding. + Note that + net = conv2d_same(inputs, num_outputs, 3, stride=stride) + is equivalent to + net = slim.conv2d(inputs, num_outputs, 3, stride=1, padding='SAME') + net = subsample(net, factor=stride) + whereas + net = slim.conv2d(inputs, num_outputs, 3, stride=stride, padding='SAME') + is different when the input's height or width is even, which is why we add the + current function. For more details, see ResnetUtilsTest.testConv2DSameEven(). + Args: + inputs: A 4-D tensor of size [batch, height_in, width_in, channels]. + num_outputs: An integer, the number of output filters. + kernel_size: An int with the kernel_size of the filters. + stride: An integer, the output stride. + rate: An integer, rate for atrous convolution. + scope: Scope. + Returns: + output: A 4-D tensor of size [batch, height_out, width_out, channels] with + the convolution output. + """ + if stride == 1: + return slim.conv2d( + inputs, + num_outputs, + kernel_size, + stride=1, + rate=rate, + padding='SAME', + scope=scope) + else: + kernel_size_effective = kernel_size + (kernel_size - 1) * (rate - 1) + pad_total = kernel_size_effective - 1 + pad_beg = pad_total // 2 + pad_end = pad_total - pad_beg + inputs = tf.pad( + inputs, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]]) + return slim.conv2d( + inputs, + num_outputs, + kernel_size, + stride=stride, + rate=rate, + padding='VALID', + scope=scope) + + +@slim.add_arg_scope +def stack_blocks_dense(net, + blocks, + output_stride=None, + outputs_collections=None): + """Stacks ResNet `Blocks` and controls output feature density. 
+ First, this function creates scopes for the ResNet in the form of + 'block_name/unit_1', 'block_name/unit_2', etc. + Second, this function allows the user to explicitly control the ResNet + output_stride, which is the ratio of the input to output spatial resolution. + This is useful for dense prediction tasks such as semantic segmentation or + object detection. + Most ResNets consist of 4 ResNet blocks and subsample the activations by a + factor of 2 when transitioning between consecutive ResNet blocks. This results + to a nominal ResNet output_stride equal to 8. If we set the output_stride to + half the nominal network stride (e.g., output_stride=4), then we compute + responses twice. + Control of the output feature density is implemented by atrous convolution. + Args: + net: A `Tensor` of size [batch, height, width, channels]. + blocks: A list of length equal to the number of ResNet `Blocks`. Each + element is a ResNet `Block` object describing the units in the `Block`. + output_stride: If `None`, then the output will be computed at the nominal + network stride. If output_stride is not `None`, it specifies the requested + ratio of input to output spatial resolution, which needs to be equal to + the product of unit strides from the start up to some level of the ResNet. + For example, if the ResNet employs units with strides 1, 2, 1, 3, 4, 1, + then valid values for the output_stride are 1, 2, 6, 24 or None (which + is equivalent to output_stride=24). + outputs_collections: Collection to add the ResNet block outputs. + Returns: + net: Output tensor with stride equal to the specified output_stride. + Raises: + ValueError: If the target output_stride is not valid. + """ + # The current_stride variable keeps track of the effective stride of the + # activations. This allows us to invoke atrous convolution whenever applying + # the next residual unit would result in the activations having stride larger + # than the target output_stride. + current_stride = 1 + + # The atrous convolution rate parameter. + rate = 1 + + for block in blocks: + with tf.variable_scope(block.scope, 'block', [net]): + for i, unit in enumerate(block.args): + if output_stride is not None and current_stride > output_stride: + raise ValueError( + 'The target output_stride cannot be reached.') + + with tf.variable_scope( + 'unit_%d' % (i + 1), values=[net]) as sc: + unit_depth, unit_depth_bottleneck, unit_stride = unit + # If we have reached the target output_stride, then we need to employ + # atrous convolution with stride=1 and multiply the atrous rate by the + # current unit's stride for use in subsequent layers. + if output_stride is not None and current_stride == output_stride: + net = block.unit_fn( + net, + depth=unit_depth, + depth_bottleneck=unit_depth_bottleneck, + stride=1, + rate=rate) + rate *= unit_stride + + else: + net = block.unit_fn( + net, + depth=unit_depth, + depth_bottleneck=unit_depth_bottleneck, + stride=unit_stride, + rate=1) + current_stride *= unit_stride + net = slim.utils.collect_named_outputs( + outputs_collections, sc.name, net) + + if output_stride is not None and current_stride != output_stride: + raise ValueError('The target output_stride cannot be reached.') + + return net + + +def resnet_arg_scope(weight_decay=0.0001, + batch_norm_decay=0.997, + batch_norm_epsilon=1e-5, + batch_norm_scale=True): + """Defines the default ResNet arg scope. 
+ TODO(gpapan): The batch-normalization related default values above are + appropriate for use in conjunction with the reference ResNet models + released at https://github.com/KaimingHe/deep-residual-networks. When + training ResNets from scratch, they might need to be tuned. + Args: + weight_decay: The weight decay to use for regularizing the model. + batch_norm_decay: The moving average decay when estimating layer activation + statistics in batch normalization. + batch_norm_epsilon: Small constant to prevent division by zero when + normalizing activations by their variance in batch normalization. + batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the + activations in the batch normalization layer. + Returns: + An `arg_scope` to use for the resnet models. + """ + batch_norm_params = { + 'decay': batch_norm_decay, + 'epsilon': batch_norm_epsilon, + 'scale': batch_norm_scale, + 'updates_collections': tf.GraphKeys.UPDATE_OPS, + } + + with slim.arg_scope( + [slim.conv2d], + weights_regularizer=slim.l2_regularizer(weight_decay), + weights_initializer=slim.variance_scaling_initializer(), + activation_fn=tf.nn.relu, + normalizer_fn=slim.batch_norm, + normalizer_params=batch_norm_params): + with slim.arg_scope([slim.batch_norm], **batch_norm_params): + # The following implies padding='SAME' for pool1, which makes feature + # alignment easier for dense prediction tasks. This is also used in + # https://github.com/facebook/fb.resnet.torch. However the accompanying + # code of 'Deep Residual Learning for Image Recognition' uses + # padding='VALID' for pool1. You can switch to that choice by setting + # slim.arg_scope([slim.max_pool2d], padding='VALID'). + with slim.arg_scope([slim.max_pool2d], padding='VALID') as arg_sc: + return arg_sc diff --git a/modelscope/pipelines/cv/ocr_utils/utils.py b/modelscope/pipelines/cv/ocr_utils/utils.py new file mode 100644 index 00000000..be8e3371 --- /dev/null +++ b/modelscope/pipelines/cv/ocr_utils/utils.py @@ -0,0 +1,108 @@ +import cv2 +import numpy as np + + +def rboxes_to_polygons(rboxes): + """ + Convert rboxes to polygons + ARGS + `rboxes`: [n, 5] + RETURN + `polygons`: [n, 8] + """ + + theta = rboxes[:, 4:5] + cxcy = rboxes[:, :2] + half_w = rboxes[:, 2:3] / 2. + half_h = rboxes[:, 3:4] / 2. 
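+    # v1 and v2 are the box's half-width and half-height vectors rotated by
+    # theta; the four corners are cxcy -v1-v2, +v1-v2, +v1+v2 and -v1+v2,
+    # i.e. one full loop around the rotated rectangle.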
+ v1 = np.hstack([np.cos(theta) * half_w, np.sin(theta) * half_w]) + v2 = np.hstack([-np.sin(theta) * half_h, np.cos(theta) * half_h]) + p1 = cxcy - v1 - v2 + p2 = cxcy + v1 - v2 + p3 = cxcy + v1 + v2 + p4 = cxcy - v1 + v2 + polygons = np.hstack([p1, p2, p3, p4]) + return polygons + + +def cal_width(box): + pd1 = point_dist(box[0], box[1], box[2], box[3]) + pd2 = point_dist(box[4], box[5], box[6], box[7]) + return (pd1 + pd2) / 2 + + +def point_dist(x1, y1, x2, y2): + return np.sqrt((x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1)) + + +def draw_polygons(img, polygons): + for p in polygons.tolist(): + p = [int(o) for o in p] + cv2.line(img, (p[0], p[1]), (p[2], p[3]), (0, 255, 0), 1) + cv2.line(img, (p[2], p[3]), (p[4], p[5]), (0, 255, 0), 1) + cv2.line(img, (p[4], p[5]), (p[6], p[7]), (0, 255, 0), 1) + cv2.line(img, (p[6], p[7]), (p[0], p[1]), (0, 255, 0), 1) + return img + + +def nms_python(boxes): + boxes = sorted(boxes, key=lambda x: -x[8]) + nms_flag = [True] * len(boxes) + for i, a in enumerate(boxes): + if not nms_flag[i]: + continue + else: + for j, b in enumerate(boxes): + if not j > i: + continue + if not nms_flag[j]: + continue + score_a = a[8] + score_b = b[8] + rbox_a = polygon2rbox(a[:8]) + rbox_b = polygon2rbox(b[:8]) + if point_in_rbox(rbox_a[:2], rbox_b) or point_in_rbox( + rbox_b[:2], rbox_a): + if score_a > score_b: + nms_flag[j] = False + boxes_nms = [] + for i, box in enumerate(boxes): + if nms_flag[i]: + boxes_nms.append(box) + return boxes_nms + + +def point_in_rbox(c, rbox): + cx0, cy0 = c[0], c[1] + cx1, cy1 = rbox[0], rbox[1] + w, h = rbox[2], rbox[3] + theta = rbox[4] + dist_x = np.abs((cx1 - cx0) * np.cos(theta) + (cy1 - cy0) * np.sin(theta)) + dist_y = np.abs(-(cx1 - cx0) * np.sin(theta) + (cy1 - cy0) * np.cos(theta)) + return ((dist_x < w / 2.0) and (dist_y < h / 2.0)) + + +def polygon2rbox(polygon): + x1, x2, x3, x4 = polygon[0], polygon[2], polygon[4], polygon[6] + y1, y2, y3, y4 = polygon[1], polygon[3], polygon[5], polygon[7] + c_x = (x1 + x2 + x3 + x4) / 4 + c_y = (y1 + y2 + y3 + y4) / 4 + w1 = point_dist(x1, y1, x2, y2) + w2 = point_dist(x3, y3, x4, y4) + h1 = point_line_dist(c_x, c_y, x1, y1, x2, y2) + h2 = point_line_dist(c_x, c_y, x3, y3, x4, y4) + h = h1 + h2 + w = (w1 + w2) / 2 + theta1 = np.arctan2(y2 - y1, x2 - x1) + theta2 = np.arctan2(y3 - y4, x3 - x4) + theta = (theta1 + theta2) / 2.0 + return [c_x, c_y, w, h, theta] + + +def point_line_dist(px, py, x1, y1, x2, y2): + eps = 1e-6 + dx = x2 - x1 + dy = y2 - y1 + div = np.sqrt(dx * dx + dy * dy) + eps + dist = np.abs(px * dy - py * dx + x2 * y1 - y2 * x1) / div + return dist diff --git a/modelscope/pipelines/multi_modal/__init__.py b/modelscope/pipelines/multi_modal/__init__.py index 7d9a2c59..b7402b93 100644 --- a/modelscope/pipelines/multi_modal/__init__.py +++ b/modelscope/pipelines/multi_modal/__init__.py @@ -1 +1 @@ -from .image_captioning import ImageCaptionPipeline +from .image_captioning_pipeline import ImageCaptionPipeline diff --git a/modelscope/pipelines/multi_modal/image_captioning_pipeline.py b/modelscope/pipelines/multi_modal/image_captioning_pipeline.py new file mode 100644 index 00000000..f0b1f53c --- /dev/null +++ b/modelscope/pipelines/multi_modal/image_captioning_pipeline.py @@ -0,0 +1,33 @@ +from typing import Any, Dict, Union + +from modelscope.preprocessors import OfaImageCaptionPreprocessor, Preprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger +from ..base import Model, Pipeline +from ..builder import PIPELINES + +logger = 
get_logger() + + +@PIPELINES.register_module(Tasks.image_captioning, module_name='ofa') +class ImageCaptionPipeline(Pipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: [Preprocessor] = None, + **kwargs): + super().__init__() + assert isinstance(model, str) or isinstance(model, Model), \ + 'model must be a single str or OfaForImageCaptioning' + if isinstance(model, str): + pipe_model = Model.from_pretrained(model) + elif isinstance(model, Model): + pipe_model = model + else: + raise NotImplementedError + if preprocessor is None and pipe_model: + preprocessor = OfaImageCaptionPreprocessor(model_dir=model) + super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index cf2f1c8b..c50875fd 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -2,3 +2,4 @@ from .fill_mask_pipeline import * # noqa F403 from .sentence_similarity_pipeline import * # noqa F403 from .sequence_classification_pipeline import * # noqa F403 from .text_generation_pipeline import * # noqa F403 +from .word_segmentation_pipeline import * # noqa F403 diff --git a/modelscope/pipelines/nlp/sentence_similarity_pipeline.py b/modelscope/pipelines/nlp/sentence_similarity_pipeline.py index 44d91756..1b630c10 100644 --- a/modelscope/pipelines/nlp/sentence_similarity_pipeline.py +++ b/modelscope/pipelines/nlp/sentence_similarity_pipeline.py @@ -1,8 +1,5 @@ -import os -import uuid from typing import Any, Dict, Union -import json import numpy as np from modelscope.models.nlp import SbertForSentenceSimilarity diff --git a/modelscope/pipelines/nlp/sequence_classification_pipeline.py b/modelscope/pipelines/nlp/sequence_classification_pipeline.py index 9d2e4273..1dbe2efd 100644 --- a/modelscope/pipelines/nlp/sequence_classification_pipeline.py +++ b/modelscope/pipelines/nlp/sequence_classification_pipeline.py @@ -1,8 +1,5 @@ -import os -import uuid from typing import Any, Dict, Union -import json import numpy as np from modelscope.models.nlp import BertForSequenceClassification diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py index ea30a115..881e7ea6 100644 --- a/modelscope/pipelines/nlp/text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text_generation_pipeline.py @@ -1,7 +1,7 @@ from typing import Dict, Optional, Union from modelscope.models import Model -from modelscope.models.nlp import PalmForTextGenerationModel +from modelscope.models.nlp import PalmForTextGeneration from modelscope.preprocessors import TextGenerationPreprocessor from modelscope.utils.constant import Tasks from ..base import Pipeline, Tensor @@ -10,11 +10,11 @@ from ..builder import PIPELINES __all__ = ['TextGenerationPipeline'] -@PIPELINES.register_module(Tasks.text_generation, module_name=r'palm') +@PIPELINES.register_module(Tasks.text_generation, module_name=r'palm2.0') class TextGenerationPipeline(Pipeline): def __init__(self, - model: Union[PalmForTextGenerationModel, str], + model: Union[PalmForTextGeneration, str], preprocessor: Optional[TextGenerationPreprocessor] = None, **kwargs): """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction @@ -23,16 +23,16 @@ class TextGenerationPipeline(Pipeline): model (SequenceClassificationModel): a model instance preprocessor 
(SequenceClassificationPreprocessor): a preprocessor instance """ - sc_model = model if isinstance( - model, - PalmForTextGenerationModel) else Model.from_pretrained(model) + model = model if isinstance( + model, PalmForTextGeneration) else Model.from_pretrained(model) if preprocessor is None: preprocessor = TextGenerationPreprocessor( - sc_model.model_dir, + model.model_dir, + model.tokenizer, first_sequence='sentence', second_sequence=None) - super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs) - self.tokenizer = preprocessor.tokenizer + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.tokenizer = model.tokenizer def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, str]: """process the prediction results @@ -43,17 +43,20 @@ class TextGenerationPipeline(Pipeline): Returns: Dict[str, str]: the prediction results """ + replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''), + ('[unused1]', ''), (r' +', ' '), ('[SEP]', ''), + ('[unused2]', ''), ('[CLS]', ''), ('[UNK]', '')) + replace_tokens_roberta = ((r' +', ' '), ('', ''), ('', + ''), + ('', ''), ('', ''), ('', ' ')) - vocab_size = len(self.tokenizer.vocab) pred_list = inputs['predictions'] pred_ids = pred_list[0][0].cpu().numpy().tolist() - for j in range(len(pred_ids)): - if pred_ids[j] >= vocab_size: - pred_ids[j] = 100 - pred = self.tokenizer.convert_ids_to_tokens(pred_ids) - pred_string = ''.join(pred).replace( - '##', - '').split('[SEP]')[0].replace('[CLS]', - '').replace('[SEP]', - '').replace('[UNK]', '') + pred_string = self.tokenizer.decode(pred_ids) + for _old, _new in replace_tokens_bert: + pred_string = pred_string.replace(_old, _new) + pred_string.strip() + for _old, _new in replace_tokens_roberta: + pred_string = pred_string.replace(_old, _new) + pred_string.strip() return {'text': pred_string} diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py new file mode 100644 index 00000000..1cc08a38 --- /dev/null +++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py @@ -0,0 +1,69 @@ +from typing import Any, Dict, Optional, Union + +from modelscope.models import Model +from modelscope.models.nlp import StructBertForTokenClassification +from modelscope.preprocessors import TokenClassifcationPreprocessor +from modelscope.utils.constant import Tasks +from ..base import Pipeline, Tensor +from ..builder import PIPELINES + +__all__ = ['WordSegmentationPipeline'] + + +@PIPELINES.register_module( + Tasks.word_segmentation, + module_name=r'structbert-chinese-word-segmentation') +class WordSegmentationPipeline(Pipeline): + + def __init__(self, + model: Union[StructBertForTokenClassification, str], + preprocessor: Optional[TokenClassifcationPreprocessor] = None, + **kwargs): + """use `model` and `preprocessor` to create a nlp word segmentation pipeline for prediction + + Args: + model (StructBertForTokenClassification): a model instance + preprocessor (TokenClassifcationPreprocessor): a preprocessor instance + """ + model = model if isinstance( + model, + StructBertForTokenClassification) else Model.from_pretrained(model) + if preprocessor is None: + preprocessor = TokenClassifcationPreprocessor(model.model_dir) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.tokenizer = preprocessor.tokenizer + self.config = model.config + self.id2label = self.config.id2label + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]: + """process the prediction results + + Args: + inputs 
(Dict[str, Any]): _description_ + + Returns: + Dict[str, str]: the prediction results + """ + + pred_list = inputs['predictions'] + labels = [] + for pre in pred_list: + labels.append(self.id2label[pre]) + labels = labels[1:-1] + chunks = [] + chunk = '' + assert len(inputs['text']) == len(labels) + for token, label in zip(inputs['text'], labels): + if label[0] == 'B' or label[0] == 'I': + chunk += token + else: + chunk += token + chunks.append(chunk) + chunk = '' + if chunk: + chunks.append(chunk) + seg_result = ' '.join(chunks) + rst = { + 'output': seg_result, + } + return rst diff --git a/modelscope/pipelines/outputs.py b/modelscope/pipelines/outputs.py index b545d6eb..6140f726 100644 --- a/modelscope/pipelines/outputs.py +++ b/modelscope/pipelines/outputs.py @@ -54,6 +54,13 @@ TASK_OUTPUTS = { # } Tasks.pose_estimation: ['poses', 'boxes'], + # ocr detection result for single sample + # { + # "det_polygons": np.array with shape [num_text, 8], each box is + # [x1, y1, x2, y2, x3, y3, x4, y4] + # } + Tasks.ocr_detection: ['det_polygons'], + # ============ nlp tasks =================== # text classification result for single sample @@ -75,8 +82,27 @@ TASK_OUTPUTS = { # } Tasks.fill_mask: ['text'], + # word segmentation result for single sample + # { + # "output": "今天 天气 不错 , 适合 出去 游玩" + # } + Tasks.word_segmentation: ['output'], + + # sentence similarity result for single sample + # { + # "labels": "1", + # "scores": 0.9 + # } + Tasks.sentence_similarity: ['scores', 'labels'], + # ============ audio tasks =================== + # audio processed for single file in PCM format + # { + # "output_pcm": np.array with shape(samples,) and dtype float32 + # } + Tasks.speech_signal_process: ['output_pcm'], + # ============ multi-modal tasks =================== # image caption result for single sample diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index 81ca1007..50860514 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -1,7 +1,10 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
+from .audio import LinearAECAndFbank from .base import Preprocessor from .builder import PREPROCESSORS, build_preprocessor from .common import Compose from .image import LoadImage, load_image +from .multi_model import OfaImageCaptionPreprocessor from .nlp import * # noqa F403 +from .text_to_speech import * # noqa F403 diff --git a/modelscope/preprocessors/audio.py b/modelscope/preprocessors/audio.py new file mode 100644 index 00000000..bb10c89c --- /dev/null +++ b/modelscope/preprocessors/audio.py @@ -0,0 +1,231 @@ +import ctypes +import os +from typing import Any, Dict + +import numpy as np +import scipy.io.wavfile as wav +import torch +from numpy.ctypeslib import ndpointer + +from modelscope.utils.constant import Fields +from .builder import PREPROCESSORS + + +def load_wav(path): + samp_rate, data = wav.read(path) + return np.float32(data), samp_rate + + +def load_library(libaec): + libaec_in_cwd = os.path.join('.', libaec) + if os.path.exists(libaec_in_cwd): + libaec = libaec_in_cwd + mitaec = ctypes.cdll.LoadLibrary(libaec) + fe_process = mitaec.fe_process_inst + fe_process.argtypes = [ + ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'), + ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'), ctypes.c_int, + ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'), + ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'), + ndpointer(ctypes.c_float, flags='C_CONTIGUOUS') + ] + return fe_process + + +def do_linear_aec(fe_process, mic, ref, int16range=True): + mic = np.float32(mic) + ref = np.float32(ref) + if len(mic) > len(ref): + mic = mic[:len(ref)] + out_mic = np.zeros_like(mic) + out_linear = np.zeros_like(mic) + out_echo = np.zeros_like(mic) + out_ref = np.zeros_like(mic) + if int16range: + mic /= 32768 + ref /= 32768 + fe_process(mic, ref, len(mic), out_mic, out_linear, out_echo) + # out_ref not in use here + if int16range: + out_mic *= 32768 + out_linear *= 32768 + out_echo *= 32768 + return out_mic, out_ref, out_linear, out_echo + + +def load_kaldi_feature_transform(filename): + fp = open(filename, 'r') + all_str = fp.read() + pos1 = all_str.find('AddShift') + pos2 = all_str.find('[', pos1) + pos3 = all_str.find(']', pos2) + mean = np.fromstring(all_str[pos2 + 1:pos3], dtype=np.float32, sep=' ') + pos1 = all_str.find('Rescale') + pos2 = all_str.find('[', pos1) + pos3 = all_str.find(']', pos2) + scale = np.fromstring(all_str[pos2 + 1:pos3], dtype=np.float32, sep=' ') + fp.close() + return mean, scale + + +class Feature: + r"""Extract feat from one utterance. 
+ """ + + def __init__(self, + fbank_config, + feat_type='spec', + mvn_file=None, + cuda=False): + r""" + + Args: + fbank_config (dict): + feat_type (str): + raw: do nothing + fbank: use kaldi.fbank + spec: Real/Imag + logpow: log(1+|x|^2) + mvn_file (str): the path of data file for mean variance normalization + cuda: + """ + self.fbank_config = fbank_config + self.feat_type = feat_type + self.n_fft = fbank_config['frame_length'] * fbank_config[ + 'sample_frequency'] // 1000 + self.hop_length = fbank_config['frame_shift'] * fbank_config[ + 'sample_frequency'] // 1000 + self.window = torch.hamming_window(self.n_fft, periodic=False) + + self.mvn = False + if mvn_file is not None and os.path.exists(mvn_file): + print(f'loading mvn file: {mvn_file}') + shift, scale = load_kaldi_feature_transform(mvn_file) + self.shift = torch.from_numpy(shift) + self.scale = torch.from_numpy(scale) + self.mvn = True + if cuda: + self.window = self.window.cuda() + if self.mvn: + self.shift = self.shift.cuda() + self.scale = self.scale.cuda() + + def compute(self, utt): + r""" + + Args: + utt: in [-32768, 32767] range + + Returns: + [..., T, F] + """ + if self.feat_type == 'raw': + return utt + elif self.feat_type == 'fbank': + # have to use local import before modelscope framework supoort lazy loading + import torchaudio.compliance.kaldi as kaldi + if len(utt.shape) == 1: + utt = utt.unsqueeze(0) + feat = kaldi.fbank(utt, **self.fbank_config) + elif self.feat_type == 'spec': + spec = torch.stft( + utt / 32768, + self.n_fft, + self.hop_length, + self.n_fft, + self.window, + center=False, + return_complex=True) + feat = torch.cat([spec.real, spec.imag], dim=-2).permute(-1, -2) + elif self.feat_type == 'logpow': + spec = torch.stft( + utt, + self.n_fft, + self.hop_length, + self.n_fft, + self.window, + center=False, + return_complex=True) + abspow = torch.abs(spec)**2 + feat = torch.log(1 + abspow).permute(-1, -2) + return feat + + def normalize(self, feat): + if self.mvn: + feat = feat + self.shift + feat = feat * self.scale + return feat + + +@PREPROCESSORS.register_module(Fields.audio) +class LinearAECAndFbank: + SAMPLE_RATE = 16000 + + def __init__(self, io_config): + self.trunc_length = 7200 * self.SAMPLE_RATE + self.linear_aec_delay = io_config['linear_aec_delay'] + self.feature = Feature(io_config['fbank_config'], + io_config['feat_type'], io_config['mvn']) + self.mitaec = load_library(io_config['mitaec_library']) + self.mask_on_mic = io_config['mask_on'] == 'nearend_mic' + + def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: + """ linear filtering the near end mic and far end audio, then extract the feature + :param data: dict with two keys and correspond audios: "nearend_mic" and "farend_speech" + :return: dict with two keys and Tensor values: "base" linear filtered audio,and "feature" + """ + # read files + nearend_mic, fs = load_wav(data['nearend_mic']) + assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}' + farend_speech, fs = load_wav(data['farend_speech']) + assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}' + if 'nearend_speech' in data: + nearend_speech, fs = load_wav(data['nearend_speech']) + assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}' + else: + nearend_speech = np.zeros_like(nearend_mic) + + out_mic, out_ref, out_linear, out_echo = do_linear_aec( + self.mitaec, nearend_mic, farend_speech) + # fix 20ms linear aec delay by delaying the target speech + extra_zeros = np.zeros([int(self.linear_aec_delay 
* fs)]) + nearend_speech = np.concatenate([extra_zeros, nearend_speech]) + # truncate files to the same length + flen = min( + len(out_mic), len(out_ref), len(out_linear), len(out_echo), + len(nearend_speech)) + fstart = 0 + flen = min(flen, self.trunc_length) + nearend_mic, out_ref, out_linear, out_echo, nearend_speech = ( + out_mic[fstart:flen], out_ref[fstart:flen], + out_linear[fstart:flen], out_echo[fstart:flen], + nearend_speech[fstart:flen]) + + # extract features (frames, [mic, linear, ref, aes?]) + feat = torch.FloatTensor() + + nearend_mic = torch.from_numpy(np.float32(nearend_mic)) + fbank_nearend_mic = self.feature.compute(nearend_mic) + feat = torch.cat([feat, fbank_nearend_mic], dim=1) + + out_linear = torch.from_numpy(np.float32(out_linear)) + fbank_out_linear = self.feature.compute(out_linear) + feat = torch.cat([feat, fbank_out_linear], dim=1) + + out_echo = torch.from_numpy(np.float32(out_echo)) + fbank_out_echo = self.feature.compute(out_echo) + feat = torch.cat([feat, fbank_out_echo], dim=1) + + # feature transform + feat = self.feature.normalize(feat) + + # prepare target + if nearend_speech is not None: + nearend_speech = torch.from_numpy(np.float32(nearend_speech)) + + if self.mask_on_mic: + base = nearend_mic + else: + base = out_linear + out_data = {'base': base, 'target': nearend_speech, 'feature': feat} + return out_data diff --git a/modelscope/pipelines/multi_modal/image_captioning.py b/modelscope/preprocessors/multi_model.py similarity index 57% rename from modelscope/pipelines/multi_modal/image_captioning.py rename to modelscope/preprocessors/multi_model.py index 3e5f49d0..de211611 100644 --- a/modelscope/pipelines/multi_modal/image_captioning.py +++ b/modelscope/preprocessors/multi_model.py @@ -1,32 +1,50 @@ -from typing import Any, Dict +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os.path as osp +from typing import Any, Dict, Union import numpy as np import torch +from maas_hub.snapshot_download import snapshot_download from PIL import Image -from modelscope.pipelines.base import Input -from modelscope.preprocessors import load_image -from modelscope.utils.constant import Tasks -from modelscope.utils.logger import get_logger -from ..base import Pipeline -from ..builder import PIPELINES +from modelscope.utils.constant import Fields, ModelFile +from modelscope.utils.hub import get_model_cache_dir +from modelscope.utils.type_assert import type_assert +from .base import Preprocessor +from .builder import PREPROCESSORS +from .image import load_image -logger = get_logger() +__all__ = [ + 'OfaImageCaptionPreprocessor', +] -@PIPELINES.register_module(Tasks.image_captioning, module_name='ofa') -class ImageCaptionPipeline(Pipeline): - # TODO: refine using modelhub - def __init__(self, model: str, bpe_dir: str): - super().__init__() - # turn on cuda if GPU is available +@PREPROCESSORS.register_module( + Fields.multi_modal, module_name=r'ofa-image-caption') +class OfaImageCaptionPreprocessor(Preprocessor): + + def __init__(self, model_dir: str, *args, **kwargs): + """preprocess the data via the vocab.txt from the `model_dir` path + + Args: + model_dir (str): model path + """ + super().__init__(*args, **kwargs) + + if osp.exists(model_dir): + local_model_dir = model_dir + else: + cache_path = get_model_cache_dir(model_dir) + local_model_dir = cache_path if osp.exists( + cache_path) else snapshot_download(model_dir) + local_model = osp.join(local_model_dir, ModelFile.TORCH_MODEL_FILE) + bpe_dir = local_model_dir + from fairseq import checkpoint_utils, tasks, utils from ofa.tasks.mm_tasks import CaptionTask tasks.register_task('caption', CaptionTask) - use_cuda = False - # use fp16 only when GPU is available - use_fp16 = False + overrides = { 'bpe_dir': bpe_dir, 'eval_cider': False, @@ -35,21 +53,9 @@ class ImageCaptionPipeline(Pipeline): 'no_repeat_ngram_size': 3, 'seed': 7 } - models, cfg, task = checkpoint_utils.load_model_ensemble_and_task( - utils.split_paths(model), arg_overrides=overrides) - - # Move models to GPU - for model in models: - model.eval() - if use_cuda: - model.cuda() - if use_fp16: - model.half() - model.prepare_for_inference_(cfg) - self.models = models - # Initialize generator - self.generator = task.build_generator(models, cfg.generation) - + model, cfg, task = checkpoint_utils.load_model_ensemble_and_task( + utils.split_paths(local_model), arg_overrides=overrides) + del model # Initialize transform from torchvision import transforms mean = [0.5, 0.5, 0.5] @@ -69,7 +75,8 @@ class ImageCaptionPipeline(Pipeline): self.eos_item = torch.LongTensor([task.src_dict.eos()]) self.pad_idx = task.src_dict.pad() - def preprocess(self, input: Input) -> Dict[str, Any]: + @type_assert(object, (str, tuple)) + def __call__(self, data: Union[str, tuple]) -> Dict[str, Any]: def encode_text(text, length=None, append_bos=False, append_eos=False): s = self.task.tgt_dict.encode_line( @@ -88,7 +95,7 @@ class ImageCaptionPipeline(Pipeline): patch_image = self.patch_resize_transform(input).unsqueeze(0) else: patch_image = self.patch_resize_transform( - load_image(input)).unsqueeze(0) + load_image(data)).unsqueeze(0) patch_mask = torch.tensor([True]) text = 'what does the image describe?' 
src_text = encode_text( @@ -105,17 +112,3 @@ class ImageCaptionPipeline(Pipeline): } } return sample - - def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: - from ofa.utils.eval_utils import eval_caption - - results, _ = eval_caption(self.task, self.generator, self.models, - input) - return { - 'image_id': results[0]['image_id'], - 'caption': results[0]['caption'] - } - - def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - # What should we do here ? - return inputs diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py index 20c4877b..c2f72292 100644 --- a/modelscope/preprocessors/nlp.py +++ b/modelscope/preprocessors/nlp.py @@ -12,7 +12,8 @@ from .builder import PREPROCESSORS __all__ = [ 'Tokenize', 'SequenceClassificationPreprocessor', - 'TextGenerationPreprocessor', 'FillMaskPreprocessor' + 'TextGenerationPreprocessor', 'TokenClassifcationPreprocessor', + 'FillMaskPreprocessor' ] @@ -53,12 +54,12 @@ class SequenceClassificationPreprocessor(Preprocessor): self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir) print(f'this is the tokenzier {self.tokenizer}') - @type_assert(object, (str, tuple)) - def __call__(self, data: Union[str, tuple]) -> Dict[str, Any]: + @type_assert(object, (str, tuple, Dict)) + def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]: """process the raw input data Args: - data (str or tuple): + data (str or tuple, Dict): sentence1 (str): a sentence Example: 'you are so handsome.' @@ -70,22 +71,31 @@ class SequenceClassificationPreprocessor(Preprocessor): sentence2 (str): a sentence Example: 'you are so beautiful.' + or + {field1: field_value1, field2: field_value2} + field1 (str): field name, default 'first_sequence' + field_value1 (str): a sentence + Example: + 'you are so handsome.' + + field2 (str): field name, default 'second_sequence' + field_value2 (str): a sentence + Example: + 'you are so beautiful.' 
Returns: Dict[str, Any]: the preprocessed data """ - - if not isinstance(data, tuple): - data = ( - data, - None, - ) - - sentence1, sentence2 = data - new_data = { - self.first_sequence: sentence1, - self.second_sequence: sentence2 - } + if isinstance(data, str): + new_data = {self.first_sequence: data} + elif isinstance(data, tuple): + sentence1, sentence2 = data + new_data = { + self.first_sequence: sentence1, + self.second_sequence: sentence2 + } + else: + new_data = data # preprocess the data for the model input @@ -115,17 +125,15 @@ class SequenceClassificationPreprocessor(Preprocessor): return rst -@PREPROCESSORS.register_module(Fields.nlp, module_name=r'palm') +@PREPROCESSORS.register_module(Fields.nlp, module_name=r'palm2.0') class TextGenerationPreprocessor(Preprocessor): - def __init__(self, model_dir: str, *args, **kwargs): + def __init__(self, model_dir: str, tokenizer, *args, **kwargs): """preprocess the data using the vocab.txt from the `model_dir` path Args: model_dir (str): model path """ - from sofa import PalmTokenizer - super().__init__(*args, **kwargs) self.model_dir: str = model_dir @@ -134,7 +142,7 @@ class TextGenerationPreprocessor(Preprocessor): self.second_sequence: str = kwargs.pop('second_sequence', 'second_sequence') self.sequence_length: int = kwargs.pop('sequence_length', 128) - self.tokenizer = PalmTokenizer.from_pretrained(model_dir) + self.tokenizer = tokenizer @type_assert(object, str) def __call__(self, data: str) -> Dict[str, Any]: @@ -153,7 +161,7 @@ class TextGenerationPreprocessor(Preprocessor): new_data = {self.first_sequence: data} # preprocess the data for the model input - rst = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []} + rst = {'input_ids': [], 'attention_mask': []} max_seq_length = self.sequence_length @@ -225,3 +233,51 @@ class FillMaskPreprocessor(Preprocessor): rst['token_type_ids'].append(feature['token_type_ids']) return {k: torch.tensor(v) for k, v in rst.items()} + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=r'bert-token-classification') +class TokenClassifcationPreprocessor(Preprocessor): + + def __init__(self, model_dir: str, *args, **kwargs): + """preprocess the data via the vocab.txt from the `model_dir` path + + Args: + model_dir (str): model path + """ + + super().__init__(*args, **kwargs) + + from sofa import SbertTokenizer + self.model_dir: str = model_dir + self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir) + + @type_assert(object, str) + def __call__(self, data: str) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str): a sentence + Example: + 'you are so handsome.' + + Returns: + Dict[str, Any]: the preprocessed data + """ + # preprocess the data for the model input + + text = data.replace(' ', '').strip() + tokens = [] + for token in text: + token = self.tokenizer.tokenize(token) + tokens.extend(token) + input_ids = self.tokenizer.convert_tokens_to_ids(tokens) + input_ids = self.tokenizer.build_inputs_with_special_tokens(input_ids) + attention_mask = [1] * len(input_ids) + token_type_ids = [0] * len(input_ids) + return { + 'text': text, + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'token_type_ids': token_type_ids + } diff --git a/modelscope/preprocessors/text_to_speech.py b/modelscope/preprocessors/text_to_speech.py new file mode 100644 index 00000000..8b8dae14 --- /dev/null +++ b/modelscope/preprocessors/text_to_speech.py @@ -0,0 +1,51 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import io
+from typing import Any, Dict, Union
+
+from modelscope.fileio import File
+from modelscope.models.audio.tts.frontend import GenericTtsFrontend
+from modelscope.models.base import Model
+from modelscope.utils.audio.tts_exceptions import *  # noqa F403
+from modelscope.utils.constant import Fields
+from .base import Preprocessor
+from .builder import PREPROCESSORS
+
+__all__ = ['TextToTacotronSymbols', 'text_to_tacotron_symbols']
+
+
+@PREPROCESSORS.register_module(
+    Fields.audio, module_name=r'text_to_tacotron_symbols')
+class TextToTacotronSymbols(Preprocessor):
+    """Extract tacotron symbols from text.
+
+    Args:
+        model_name (str): TTS frontend model name or resource url
+        lang_type (str): language type, valid values are "pinyin" and "chenmix"
+    """
+
+    def __init__(self, model_name, lang_type='pinyin'):
+        self._frontend_model = Model.from_pretrained(
+            model_name, lang_type=lang_type)
+        assert self._frontend_model is not None, 'failed to load the pretrained model'
+
+    def __call__(self, data: str) -> Dict[str, Any]:
+        """Call functions to load text and get tacotron symbols.
+
+        Args:
+            data (str): utf-8 encoded text
+        Returns:
+            symbols (list[str]): texts in tacotron symbols format.
+        """
+        return self._frontend_model.forward(data)
+
+
+def text_to_tacotron_symbols(text='', path='./', lang='pinyin'):
+    """Simple interface to transform text to tacotron symbols.
+
+    Args:
+        text (str): input text
+        path (str): model name or resource path
+        lang (str): language type, one of "pinyin" and "chenmix"
+    """
+    transform = TextToTacotronSymbols(path, lang)
+    return transform(text)
diff --git a/modelscope/pydatasets/config.py b/modelscope/pydatasets/config.py
new file mode 100644
index 00000000..e916b3ec
--- /dev/null
+++ b/modelscope/pydatasets/config.py
@@ -0,0 +1,22 @@
+import os
+from pathlib import Path
+
+# Cache location
+DEFAULT_CACHE_HOME = '~/.cache'
+CACHE_HOME = os.getenv('CACHE_HOME', DEFAULT_CACHE_HOME)
+DEFAULT_MS_CACHE_HOME = os.path.join(CACHE_HOME, 'modelscope/hub')
+MS_CACHE_HOME = os.path.expanduser(
+    os.getenv('MS_CACHE_HOME', DEFAULT_MS_CACHE_HOME))
+
+DEFAULT_MS_DATASETS_CACHE = os.path.join(MS_CACHE_HOME, 'datasets')
+MS_DATASETS_CACHE = Path(
+    os.getenv('MS_DATASETS_CACHE', DEFAULT_MS_DATASETS_CACHE))
+
+DOWNLOADED_DATASETS_DIR = 'downloads'
+DEFAULT_DOWNLOADED_DATASETS_PATH = os.path.join(MS_DATASETS_CACHE,
+                                                DOWNLOADED_DATASETS_DIR)
+DOWNLOADED_DATASETS_PATH = Path(
+    os.getenv('DOWNLOADED_DATASETS_PATH', DEFAULT_DOWNLOADED_DATASETS_PATH))
+
+MS_HUB_ENDPOINT = os.environ.get('MS_HUB_ENDPOINT',
+                                 'http://101.201.119.157:31752')
diff --git a/modelscope/pydatasets/py_dataset.py b/modelscope/pydatasets/py_dataset.py
index 78aedaa0..49137253 100644
--- a/modelscope/pydatasets/py_dataset.py
+++ b/modelscope/pydatasets/py_dataset.py
@@ -1,64 +1,81 @@
-from typing import (Any, Callable, Dict, List, Mapping, Optional, Sequence,
-                    Union)
+import os
+from typing import (Any, Callable, Dict, Iterable, List, Mapping, Optional,
+                    Sequence, Union)
-from datasets import Dataset, load_dataset
+import numpy as np
+from datasets import Dataset
+from datasets import load_dataset as hf_load_dataset
+from datasets.config import TF_AVAILABLE, TORCH_AVAILABLE
+from datasets.packaged_modules import _PACKAGED_DATASETS_MODULES
+from datasets.utils.file_utils import (is_relative_path,
+                                       relative_to_absolute_path)
+from modelscope.pydatasets.config import MS_DATASETS_CACHE
+from modelscope.pydatasets.utils.ms_api import MsApi
 from modelscope.utils.constant import Hubs
 from modelscope.utils.logger import get_logger
logger = get_logger() +def format_list(para) -> List: + if para is None: + para = [] + elif isinstance(para, str): + para = [para] + elif len(set(para)) < len(para): + raise ValueError(f'List columns contains duplicates: {para}') + return para + + class PyDataset: _hf_ds = None # holds the underlying HuggingFace Dataset """A PyDataset backed by hugging face Dataset.""" - def __init__(self, hf_ds: Dataset): + def __init__(self, hf_ds: Dataset, target: Optional[str] = None): self._hf_ds = hf_ds - self.target = None + self.target = target def __iter__(self): - if isinstance(self._hf_ds, Dataset): - for item in self._hf_ds: - if self.target is not None: - yield item[self.target] - else: - yield item - else: - for ds in self._hf_ds.values(): - for item in ds: - if self.target is not None: - yield item[self.target] - else: - yield item + for item in self._hf_ds: + if self.target is not None: + yield item[self.target] + else: + yield item + + def __getitem__(self, key): + return self._hf_ds[key] @classmethod def from_hf_dataset(cls, hf_ds: Dataset, - target: str = None) -> 'PyDataset': - dataset = cls(hf_ds) - dataset.target = target - return dataset + target: str = None) -> Union[dict, 'PyDataset']: + if isinstance(hf_ds, Dataset): + return cls(hf_ds, target) + if len(hf_ds.keys()) == 1: + return cls(next(iter(hf_ds.values())), target) + return {k: cls(v, target) for k, v in hf_ds.items()} @staticmethod - def load(path: Union[str, list], - target: Optional[str] = None, - version: Optional[str] = None, - name: Optional[str] = None, - split: Optional[str] = None, - data_dir: Optional[str] = None, - data_files: Optional[Union[str, Sequence[str], - Mapping[str, - Union[str, - Sequence[str]]]]] = None, - hub: Optional[Hubs] = None) -> 'PyDataset': + def load( + dataset_name: Union[str, list], + target: Optional[str] = None, + version: Optional[str] = None, + hub: Optional[Hubs] = Hubs.modelscope, + subset_name: Optional[str] = None, + split: Optional[str] = None, + data_dir: Optional[str] = None, + data_files: Optional[Union[str, Sequence[str], + Mapping[str, Union[str, + Sequence[str]]]]] = None + ) -> Union[dict, 'PyDataset']: """Load a PyDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset. Args: - path (str): Path or name of the dataset. + dataset_name (str): Path or name of the dataset. target (str, optional): Name of the column to output. version (str, optional): Version of the dataset script to load: - name (str, optional): Defining the subset_name of the dataset. + subset_name (str, optional): Defining the subset_name of the dataset. data_dir (str, optional): Defining the data_dir of the dataset configuration. I data_files (str or Sequence or Mapping, optional): Path(s) to source data file(s). split (str, optional): Which split of the data to load. @@ -67,53 +84,302 @@ class PyDataset: Returns: PyDataset (obj:`PyDataset`): PyDataset object for a certain dataset. 
""" - if Hubs.modelscope == hub: - # TODO: parse data meta information from modelscope hub - # and possibly download data files to local (and update path) - print('getting data from modelscope hub') - if isinstance(path, str): - dataset = load_dataset( - path, - name=name, + if hub == Hubs.huggingface: + dataset = hf_load_dataset( + dataset_name, + name=subset_name, revision=version, split=split, data_dir=data_dir, data_files=data_files) - elif isinstance(path, list): + return PyDataset.from_hf_dataset(dataset, target=target) + else: + return PyDataset._load_ms_dataset( + dataset_name, + target=target, + subset_name=subset_name, + version=version, + split=split, + data_dir=data_dir, + data_files=data_files) + + @staticmethod + def _load_ms_dataset( + dataset_name: Union[str, list], + target: Optional[str] = None, + version: Optional[str] = None, + subset_name: Optional[str] = None, + split: Optional[str] = None, + data_dir: Optional[str] = None, + data_files: Optional[Union[str, Sequence[str], + Mapping[str, Union[str, + Sequence[str]]]]] = None + ) -> Union[dict, 'PyDataset']: + if isinstance(dataset_name, str): + use_hf = False + if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \ + (os.path.isfile(dataset_name) and dataset_name.endswith('.py')): + use_hf = True + elif is_relative_path(dataset_name): + ms_api = MsApi() + dataset_scripts = ms_api.fetch_dataset_scripts( + dataset_name, version) + if 'py' in dataset_scripts: # dataset copied from hf datasets + dataset_name = dataset_scripts['py'][0] + use_hf = True + else: + raise FileNotFoundError( + f"Couldn't find a dataset script at {relative_to_absolute_path(dataset_name)} " + f'or any data file in the same directory.') + + if use_hf: + dataset = hf_load_dataset( + dataset_name, + name=subset_name, + revision=version, + split=split, + data_dir=data_dir, + data_files=data_files, + cache_dir=MS_DATASETS_CACHE) + else: + # TODO load from ms datahub + raise NotImplementedError( + f'Dataset {dataset_name} load from modelscope datahub to be implemented in ' + f'the future') + elif isinstance(dataset_name, list): if target is None: target = 'target' - dataset = Dataset.from_dict({target: [p] for p in path}) + dataset = Dataset.from_dict({target: dataset_name}) else: raise TypeError('path must be a str or a list, but got' - f' {type(path)}') + f' {type(dataset_name)}') return PyDataset.from_hf_dataset(dataset, target=target) + def to_torch_dataset_with_processors( + self, + preprocessors: Union[Callable, List[Callable]], + columns: Union[str, List[str]] = None, + ): + preprocessor_list = preprocessors if isinstance( + preprocessors, list) else [preprocessors] + + columns = format_list(columns) + + columns = [ + key for key in self._hf_ds.features.keys() if key in columns + ] + sample = next(iter(self._hf_ds)) + + sample_res = {k: np.array(sample[k]) for k in columns} + for processor in preprocessor_list: + sample_res.update( + {k: np.array(v) + for k, v in processor(sample).items()}) + + def is_numpy_number(value): + return np.issubdtype(value.dtype, np.integer) or np.issubdtype( + value.dtype, np.floating) + + retained_columns = [] + for k in sample_res.keys(): + if not is_numpy_number(sample_res[k]): + logger.warning( + f'Data of column {k} is non-numeric, will be removed') + continue + retained_columns.append(k) + + import torch + + class MsIterableDataset(torch.utils.data.IterableDataset): + + def __init__(self, dataset: Iterable): + super(MsIterableDataset).__init__() + self.dataset = dataset + + def 
__iter__(self): + for item_dict in self.dataset: + res = { + k: np.array(item_dict[k]) + for k in columns if k in retained_columns + } + for preprocessor in preprocessor_list: + res.update({ + k: np.array(v) + for k, v in preprocessor(item_dict).items() + if k in retained_columns + }) + yield res + + return MsIterableDataset(self._hf_ds) + def to_torch_dataset( self, columns: Union[str, List[str]] = None, - output_all_columns: bool = False, + preprocessors: Union[Callable, List[Callable]] = None, **format_kwargs, ): - self._hf_ds.reset_format() - self._hf_ds.set_format( - type='torch', - columns=columns, - output_all_columns=output_all_columns, - format_kwargs=format_kwargs) - return self._hf_ds + """Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to + torch.utils.data.DataLoader. + + Args: + preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process + every sample of the dataset. The output type of processors is dict, and each numeric field of the dict + will be used as a field of torch.utils.data.Dataset. + columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only). If the + preprocessor is None, the arg columns must have at least one column. If the `preprocessors` is not None, + the output fields of processors will also be added. + format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`. + + Returns: + :class:`tf.data.Dataset` + + """ + if not TORCH_AVAILABLE: + raise ImportError( + 'The function to_torch_dataset requires pytorch to be installed' + ) + if preprocessors is not None: + return self.to_torch_dataset_with_processors(preprocessors) + else: + self._hf_ds.reset_format() + self._hf_ds.set_format( + type='torch', columns=columns, format_kwargs=format_kwargs) + return self._hf_ds + + def to_tf_dataset_with_processors( + self, + batch_size: int, + shuffle: bool, + preprocessors: Union[Callable, List[Callable]], + drop_remainder: bool = None, + prefetch: bool = True, + label_cols: Union[str, List[str]] = None, + columns: Union[str, List[str]] = None, + ): + preprocessor_list = preprocessors if isinstance( + preprocessors, list) else [preprocessors] + + label_cols = format_list(label_cols) + columns = format_list(columns) + cols_to_retain = list(set(label_cols + columns)) + retained_columns = [ + key for key in self._hf_ds.features.keys() if key in cols_to_retain + ] + import tensorflow as tf + tf_dataset = tf.data.Dataset.from_tensor_slices( + np.arange(len(self._hf_ds), dtype=np.int64)) + if shuffle: + tf_dataset = tf_dataset.shuffle(buffer_size=len(self._hf_ds)) + + def func(i, return_dict=False): + i = int(i) + res = {k: np.array(self._hf_ds[i][k]) for k in retained_columns} + for preprocessor in preprocessor_list: + # TODO preprocessor output may have the same key + res.update({ + k: np.array(v) + for k, v in preprocessor(self._hf_ds[i]).items() + }) + if return_dict: + return res + return tuple(list(res.values())) + + sample_res = func(0, True) + + @tf.function(input_signature=[tf.TensorSpec(None, tf.int64)]) + def fetch_function(i): + output = tf.numpy_function( + func, + inp=[i], + Tout=[ + tf.dtypes.as_dtype(val.dtype) + for val in sample_res.values() + ], + ) + return {key: output[i] for i, key in enumerate(sample_res)} + + tf_dataset = tf_dataset.map( + fetch_function, num_parallel_calls=tf.data.AUTOTUNE) + if label_cols: + + def split_features_and_labels(input_batch): + labels = { + key: tensor + for key, tensor in 
input_batch.items() if key in label_cols
+                }
+                if len(input_batch) == 1:
+                    input_batch = next(iter(input_batch.values()))
+                if len(labels) == 1:
+                    labels = next(iter(labels.values()))
+                return input_batch, labels
+
+            tf_dataset = tf_dataset.map(split_features_and_labels)
+
+        elif len(columns) == 1:
+            tf_dataset = tf_dataset.map(lambda x: next(iter(x.values())))
+        if batch_size > 1:
+            tf_dataset = tf_dataset.batch(
+                batch_size, drop_remainder=drop_remainder)
+
+        if prefetch:
+            tf_dataset = tf_dataset.prefetch(tf.data.experimental.AUTOTUNE)
+        return tf_dataset

     def to_tf_dataset(
         self,
-        columns: Union[str, List[str]],
         batch_size: int,
         shuffle: bool,
-        collate_fn: Callable,
+        preprocessors: Union[Callable, List[Callable]] = None,
+        columns: Union[str, List[str]] = None,
+        collate_fn: Callable = None,
         drop_remainder: bool = None,
         collate_fn_args: Dict[str, Any] = None,
         label_cols: Union[str, List[str]] = None,
-        dummy_labels: bool = False,
         prefetch: bool = True,
     ):
+        """Create a tf.data.Dataset from the MS Dataset. This tf.data.Dataset can be passed to tf methods like
+        model.fit() or model.predict().
+
+        Args:
+            batch_size (int): Number of samples in a single batch.
+            shuffle (bool): Shuffle the dataset order.
+            preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process
+            every sample of the dataset. The output type of processors is dict, and each field of the dict will be
+            used as a field of the tf.data.Dataset. If the `preprocessors` is None, the `collate_fn`
+            shouldn't be None.
+            columns (str or List[str], default None): Dataset column(s) to be loaded. If the preprocessor is None,
+            the arg columns must have at least one column. If the `preprocessors` is not None, the output fields of
+            processors will also be added.
+            collate_fn (Callable, default None): A callable object used to collect lists of samples into a batch. If
+            the `preprocessors` is None, the `collate_fn` shouldn't be None.
+            drop_remainder (bool, default None): Drop the last incomplete batch when loading.
+            collate_fn_args (Dict, optional): A `dict` of arguments to be passed to the `collate_fn`.
+            label_cols (str or List[str], default None): Dataset column(s) to load as labels.
+            prefetch (bool, default True): Prefetch data.
+
+        Returns:
+            :class:`tf.data.Dataset`
+
+        """
+        if not TF_AVAILABLE:
+            raise ImportError(
+                'The function to_tf_dataset requires Tensorflow to be installed.'
+            )
+        if preprocessors is not None:
+            return self.to_tf_dataset_with_processors(
+                batch_size,
+                shuffle,
+                preprocessors,
+                drop_remainder=drop_remainder,
+                prefetch=prefetch,
+                label_cols=label_cols,
+                columns=columns)
+
+        if collate_fn is None:
+            logger.error(
+                'The `preprocessors` and the `collate_fn` should not both be None.'
+ ) + return None self._hf_ds.reset_format() return self._hf_ds.to_tf_dataset( columns, @@ -123,7 +389,6 @@ class PyDataset: drop_remainder=drop_remainder, collate_fn_args=collate_fn_args, label_cols=label_cols, - dummy_labels=dummy_labels, prefetch=prefetch) def to_hf_dataset(self) -> Dataset: diff --git a/modelscope/pydatasets/utils/__init__.py b/modelscope/pydatasets/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/pydatasets/utils/ms_api.py b/modelscope/pydatasets/utils/ms_api.py new file mode 100644 index 00000000..04052cc4 --- /dev/null +++ b/modelscope/pydatasets/utils/ms_api.py @@ -0,0 +1,66 @@ +import os +from collections import defaultdict +from typing import Optional + +import requests + +from modelscope.pydatasets.config import (DOWNLOADED_DATASETS_PATH, + MS_HUB_ENDPOINT) +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +class MsApi: + + def __init__(self, endpoint=MS_HUB_ENDPOINT): + self.endpoint = endpoint + + def list_datasets(self): + path = f'{self.endpoint}/api/v1/datasets' + headers = None + params = {} + r = requests.get(path, params=params, headers=headers) + r.raise_for_status() + dataset_list = r.json()['Data'] + return [x['Name'] for x in dataset_list] + + def fetch_dataset_scripts(self, + dataset_name: str, + version: Optional[str] = 'master', + force_download=False): + datahub_url = f'{self.endpoint}/api/v1/datasets?Query={dataset_name}' + r = requests.get(datahub_url) + r.raise_for_status() + dataset_list = r.json()['Data'] + if len(dataset_list) == 0: + return None + dataset_id = dataset_list[0]['Id'] + version = version or 'master' + datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={version}' + r = requests.get(datahub_url) + r.raise_for_status() + file_list = r.json()['Data']['Files'] + cache_dir = os.path.join(DOWNLOADED_DATASETS_PATH, dataset_name, + version) + os.makedirs(cache_dir, exist_ok=True) + local_paths = defaultdict(list) + for file_info in file_list: + file_path = file_info['Path'] + if file_path.endswith('.py'): + datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/files?' \ + f'Revision={version}&Path={file_path}' + r = requests.get(datahub_url) + r.raise_for_status() + content = r.json()['Data']['Content'] + local_path = os.path.join(cache_dir, file_path) + if os.path.exists(local_path) and not force_download: + logger.warning( + f"Reusing dataset {dataset_name}'s python file ({local_path})" + ) + local_paths['py'].append(local_path) + continue + with open(local_path, 'w') as f: + f.writelines(content) + local_paths['py'].append(local_path) + return local_paths diff --git a/modelscope/utils/audio/__init__.py b/modelscope/utils/audio/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/utils/audio/tts_exceptions.py b/modelscope/utils/audio/tts_exceptions.py new file mode 100644 index 00000000..1ca731c3 --- /dev/null +++ b/modelscope/utils/audio/tts_exceptions.py @@ -0,0 +1,42 @@ +""" +Define TTS exceptions +""" + + +class TtsException(Exception): + """ + TTS exception class. + """ + pass + + +class TtsFrontendException(TtsException): + """ + TTS frontend module level exceptions. + """ + pass + + +class TtsFrontendInitializeFailedException(TtsFrontendException): + """ + If tts frontend resource is invalid or not exist, this exception will be raised. + """ + pass + + +class TtsFrontendLanguageTypeInvalidException(TtsFrontendException): + """ + If language type is invalid, this exception will be raised. 
+ """ + + +class TtsVocoderException(TtsException): + """ + Vocoder exception + """ + + +class TtsVocoderMelspecShapeMismatchException(TtsVocoderException): + """ + If vocoder's input melspec shape mismatch, this exception will be raised. + """ diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 3bc4548b..d361b93c 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -28,8 +28,10 @@ class Tasks(object): image_editing = 'image-editing' image_generation = 'image-generation' image_matting = 'image-matting' + ocr_detection = 'ocr-detection' # nlp tasks + word_segmentation = 'word-segmentation' sentiment_analysis = 'sentiment-analysis' sentence_similarity = 'sentence-similarity' text_classification = 'text-classification' diff --git a/modelscope/utils/registry.py b/modelscope/utils/registry.py index 319e54cb..b26b899d 100644 --- a/modelscope/utils/registry.py +++ b/modelscope/utils/registry.py @@ -67,7 +67,6 @@ class Registry(object): if module_name in self._modules[group_key]: raise KeyError(f'{module_name} is already registered in ' f'{self._name}[{group_key}]') - self._modules[group_key][module_name] = module_cls module_cls.group_key = group_key diff --git a/modelscope/utils/test_utils.py b/modelscope/utils/test_utils.py index c8ea0442..95e63dba 100644 --- a/modelscope/utils/test_utils.py +++ b/modelscope/utils/test_utils.py @@ -2,6 +2,9 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os +import unittest + +from datasets.config import TF_AVAILABLE, TORCH_AVAILABLE TEST_LEVEL = 2 TEST_LEVEL_STR = 'TEST_LEVEL' @@ -15,6 +18,18 @@ def test_level(): return TEST_LEVEL +def require_tf(test_case): + if not TF_AVAILABLE: + test_case = unittest.skip('test requires TensorFlow')(test_case) + return test_case + + +def require_torch(test_case): + if not TORCH_AVAILABLE: + test_case = unittest.skip('test requires PyTorch')(test_case) + return test_case + + def set_test_level(level: int): global TEST_LEVEL TEST_LEVEL = level diff --git a/requirements.txt b/requirements.txt index 39eb5e23..b9b4a1c4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ -r requirements/pipeline.txt -r requirements/multi-modal.txt -r requirements/nlp.txt +-r requirements/audio.txt -r requirements/cv.txt diff --git a/requirements/audio.txt b/requirements/audio.txt new file mode 100644 index 00000000..140836a8 --- /dev/null +++ b/requirements/audio.txt @@ -0,0 +1,26 @@ +#tts +h5py==2.10.0 +#https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp36-cp36m-linux_x86_64.whl +https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp37-cp37m-linux_x86_64.whl +https://swap.oss-cn-hangzhou.aliyuncs.com/Jiaqi%2Fmaas%2Ftts%2Frequirements%2Fpytorch_wavelets-1.3.0-py3-none-any.whl?Expires=1685688388&OSSAccessKeyId=LTAI4Ffebq4d9jTVDwiSbY4L&Signature=jcQbg5EZ%2Bdys3%2F4BRn3srrKLdIg%3D +#https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp38-cp38-linux_x86_64.whl +#https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp39-cp39-linux_x86_64.whl +inflect +keras==2.2.4 +librosa +lxml +matplotlib +nara_wpe +numpy==1.18.* +protobuf==3.20.* +ptflops +PyWavelets>=1.0.0 +scikit-learn==0.23.2 +sox +tensorboard +tensorflow==1.15.* +torch==1.10.* +torchaudio +torchvision +tqdm +unidecode diff --git a/requirements/cv.txt b/requirements/cv.txt index 66799b76..5bec8ba7 100644 --- a/requirements/cv.txt +++ b/requirements/cv.txt @@ -1 +1,2 @@ easydict 
+tf_slim diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 43684a06..e97352aa 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -1,12 +1,13 @@ addict datasets easydict -https://mindscope.oss-cn-hangzhou.aliyuncs.com/sdklib/maas_hub-0.2.2.dev0-py3-none-any.whl +https://mindscope.oss-cn-hangzhou.aliyuncs.com/sdklib/maas_hub-0.2.4.dev0-py3-none-any.whl numpy opencv-python-headless Pillow>=6.2.0 pyyaml requests +scipy tokenizers<=0.10.3 transformers<=4.16.2 yapf diff --git a/setup.cfg b/setup.cfg index 0b929b04..16c10cae 100644 --- a/setup.cfg +++ b/setup.cfg @@ -11,6 +11,7 @@ default_section = THIRDPARTY BASED_ON_STYLE = pep8 BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true +SPLIT_BEFORE_ARITHMETIC_OPERATOR = true [codespell] skip = *.ipynb @@ -20,5 +21,5 @@ ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids [flake8] select = B,C,E,F,P,T4,W,B9 max-line-length = 120 -ignore = F401,F821 +ignore = F401,F821,W503 exclude = docs/src,*.pyi,.git diff --git a/tests/pipelines/test_base.py b/tests/pipelines/test_base.py index 73aebfdf..c642ed4b 100644 --- a/tests/pipelines/test_base.py +++ b/tests/pipelines/test_base.py @@ -80,8 +80,7 @@ class CustomPipelineTest(unittest.TestCase): pipe2 = pipeline(dummy_task) self.assertTrue(type(pipe) is type(pipe2)) - img_url = 'http://pai-vision-data-hz.oss-cn-zhangjiakou.' \ - 'aliyuncs.com/data/test/images/image1.jpg' + img_url = 'data/test/images/image1.jpg' output = pipe(img_url) self.assertEqual(output['filename'], img_url) self.assertEqual(output['output_png'].shape, (318, 512, 3)) diff --git a/tests/pipelines/test_image_captioning.py b/tests/pipelines/test_image_captioning.py index 4fac4658..5fa6ff49 100644 --- a/tests/pipelines/test_image_captioning.py +++ b/tests/pipelines/test_image_captioning.py @@ -1,10 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-import os -import tempfile import unittest -from modelscope.fileio import File from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level @@ -12,25 +9,13 @@ from modelscope.utils.test_utils import test_level class ImageCaptionTest(unittest.TestCase): - @unittest.skip('skip before model is restored in model hub') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run(self): - model = 'https://ofa-beijing.oss-cn-beijing.aliyuncs.com/checkpoints/caption_large_best_clean.pt' - - os.system( - 'wget https://jirenmr.oss-cn-zhangjiakou.aliyuncs.com/ofa/BPE.zip' - ) - os.system('unzip BPE.zip') - bpe_dir = './BPE' - - with tempfile.NamedTemporaryFile('wb', suffix='.pb') as ofile: - ofile.write(File.read(model)) - img_captioning = pipeline( - Tasks.image_captioning, model=ofile.name, bpe_dir=bpe_dir) - - result = img_captioning( - 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png' - ) - print(result['caption']) + img_captioning = pipeline( + Tasks.image_captioning, + model='damo/ofa_image-caption_coco_large_en') + result = img_captioning('data/test/images/image_captioning.png') + print(result['caption']) if __name__ == '__main__': diff --git a/tests/pipelines/test_image_matting.py b/tests/pipelines/test_image_matting.py index ba5d05ad..e557ba86 100644 --- a/tests/pipelines/test_image_matting.py +++ b/tests/pipelines/test_image_matting.py @@ -17,7 +17,7 @@ from modelscope.utils.test_utils import test_level class ImageMattingTest(unittest.TestCase): def setUp(self) -> None: - self.model_id = 'damo/cv_unet_image-matting_damo' + self.model_id = 'damo/cv_unet_image-matting' # switch to False if downloading everytime is not desired purge_cache = True if purge_cache: @@ -34,16 +34,12 @@ class ImageMattingTest(unittest.TestCase): ofile.write(File.read(model_path)) img_matting = pipeline(Tasks.image_matting, model=tmp_dir) - result = img_matting( - 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png' - ) + result = img_matting('data/test/images/image_matting.png') cv2.imwrite('result.png', result['output_png']) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_dataset(self): - input_location = [ - 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png' - ] + input_location = ['data/test/images/image_matting.png'] # alternatively: # input_location = '/dir/to/images' @@ -58,9 +54,7 @@ class ImageMattingTest(unittest.TestCase): def test_run_modelhub(self): img_matting = pipeline(Tasks.image_matting, model=self.model_id) - result = img_matting( - 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png' - ) + result = img_matting('data/test/images/image_matting.png') cv2.imwrite('result.png', result['output_png']) print(f'Output written to {osp.abspath("result.png")}') @@ -68,12 +62,21 @@ class ImageMattingTest(unittest.TestCase): def test_run_modelhub_default_model(self): img_matting = pipeline(Tasks.image_matting) - result = img_matting( - 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png' - ) + result = img_matting('data/test/images/image_matting.png') cv2.imwrite('result.png', result['output_png']) print(f'Output written to {osp.abspath("result.png")}') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def 
test_run_with_modelscope_dataset(self): + dataset = PyDataset.load('beans', split='train', target='image') + img_matting = pipeline(Tasks.image_matting, model=self.model_id) + result = img_matting(dataset) + for i in range(10): + cv2.imwrite(f'result_{i}.png', next(result)['output_png']) + print( + f'Output written to dir: {osp.dirname(osp.abspath("result_0.png"))}' + ) + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_ocr_detection.py b/tests/pipelines/test_ocr_detection.py new file mode 100644 index 00000000..62fcedd3 --- /dev/null +++ b/tests/pipelines/test_ocr_detection.py @@ -0,0 +1,37 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path as osp +import shutil +import sys +import tempfile +import unittest +from typing import Any, Dict, List, Tuple, Union + +import cv2 +import numpy as np +import PIL + +from modelscope.pipelines import pipeline +from modelscope.pipelines.base import Pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class OCRDetectionTest(unittest.TestCase): + + def setUp(self) -> None: + self.model_id = 'damo/cv_resnet18_ocr-detection-line-level_damo' + self.test_image = 'data/test/images/ocr_detection.jpg' + + def pipeline_inference(self, pipeline: Pipeline, input_location: str): + result = pipeline(input_location) + print('ocr detection results: ') + print(result) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_modelhub_default_model(self): + ocr_detection = pipeline(Tasks.ocr_detection) + self.pipeline_inference(ocr_detection, self.test_image) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_speech_signal_process.py b/tests/pipelines/test_speech_signal_process.py new file mode 100644 index 00000000..8b5c9468 --- /dev/null +++ b/tests/pipelines/test_speech_signal_process.py @@ -0,0 +1,56 @@ +import os.path +import shutil +import unittest + +from modelscope.fileio import File +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.hub import get_model_cache_dir + +NEAREND_MIC_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/AEC/sample_audio/nearend_mic.wav' +FAREND_SPEECH_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/AEC/sample_audio/farend_speech.wav' +NEAREND_MIC_FILE = 'nearend_mic.wav' +FAREND_SPEECH_FILE = 'farend_speech.wav' + +AEC_LIB_URL = 'http://isv-data.oss-cn-hangzhou.aliyuncs.com/ics%2FMaaS%2FAEC%2Flib%2Flibmitaec_pyio.so' \ + '?Expires=1664085465&OSSAccessKeyId=LTAIxjQyZNde90zh&Signature=Y7gelmGEsQAJRK4yyHSYMrdWizk%3D' +AEC_LIB_FILE = 'libmitaec_pyio.so' + + +def download(remote_path, local_path): + local_dir = os.path.dirname(local_path) + if len(local_dir) > 0: + if not os.path.exists(local_dir): + os.makedirs(local_dir) + with open(local_path, 'wb') as ofile: + ofile.write(File.read(remote_path)) + + +class SpeechSignalProcessTest(unittest.TestCase): + + def setUp(self) -> None: + self.model_id = 'damo/speech_dfsmn_aec_psm_16k' + # switch to False if downloading everytime is not desired + purge_cache = True + if purge_cache: + shutil.rmtree( + get_model_cache_dir(self.model_id), ignore_errors=True) + # A temporary hack to provide c++ lib. Download it first. 
+ download(AEC_LIB_URL, AEC_LIB_FILE) + + def test_run(self): + download(NEAREND_MIC_URL, NEAREND_MIC_FILE) + download(FAREND_SPEECH_URL, FAREND_SPEECH_FILE) + input = { + 'nearend_mic': NEAREND_MIC_FILE, + 'farend_speech': FAREND_SPEECH_FILE + } + aec = pipeline( + Tasks.speech_signal_process, + model=self.model_id, + pipeline_name=r'speech_dfsmn_aec_psm_16k') + aec(input, output_path='output.wav') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_text_classification.py b/tests/pipelines/test_text_classification.py index 01fdd29b..bb24fece 100644 --- a/tests/pipelines/test_text_classification.py +++ b/tests/pipelines/test_text_classification.py @@ -86,7 +86,11 @@ class SequenceClassificationTest(unittest.TestCase): task=Tasks.text_classification, model=self.model_id) result = text_classification( PyDataset.load( - 'glue', name='sst2', target='sentence', hub=Hubs.huggingface)) + 'glue', + subset_name='sst2', + split='train', + target='sentence', + hub=Hubs.huggingface)) self.printDataset(result) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') @@ -94,7 +98,11 @@ class SequenceClassificationTest(unittest.TestCase): text_classification = pipeline(task=Tasks.text_classification) result = text_classification( PyDataset.load( - 'glue', name='sst2', target='sentence', hub=Hubs.huggingface)) + 'glue', + subset_name='sst2', + split='train', + target='sentence', + hub=Hubs.huggingface)) self.printDataset(result) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') @@ -105,9 +113,21 @@ class SequenceClassificationTest(unittest.TestCase): text_classification = pipeline( Tasks.text_classification, model=model, preprocessor=preprocessor) # loaded from huggingface dataset - # TODO: rename parameter as dataset_name and subset_name dataset = PyDataset.load( - 'glue', name='sst2', target='sentence', hub=Hubs.huggingface) + 'glue', + subset_name='sst2', + split='train', + target='sentence', + hub=Hubs.huggingface) + result = text_classification(dataset) + self.printDataset(result) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_modelscope_dataset(self): + text_classification = pipeline(task=Tasks.text_classification) + # loaded from modelscope dataset + dataset = PyDataset.load( + 'squad', split='train', target='context', hub=Hubs.modelscope) result = text_classification(dataset) self.printDataset(result) diff --git a/tests/pipelines/test_text_generation.py b/tests/pipelines/test_text_generation.py index f98e135d..fbdd165f 100644 --- a/tests/pipelines/test_text_generation.py +++ b/tests/pipelines/test_text_generation.py @@ -4,7 +4,7 @@ import unittest from maas_hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import PalmForTextGenerationModel +from modelscope.models.nlp import PalmForTextGeneration from modelscope.pipelines import TextGenerationPipeline, pipeline from modelscope.preprocessors import TextGenerationPreprocessor from modelscope.utils.constant import Tasks @@ -12,43 +12,67 @@ from modelscope.utils.test_utils import test_level class TextGenerationTest(unittest.TestCase): - model_id = 'damo/nlp_palm_text-generation_chinese' - input1 = "今日天气类型='晴'&温度变化趋势='大幅上升'&最低气温='28℃'&最高气温='31℃'&体感='湿热'" - input2 = "今日天气类型='多云'&体感='舒适'&最低气温='26℃'&最高气温='30℃'" + model_id_zh = 'damo/nlp_palm2.0_text-generation_chinese-base' + model_id_en = 'damo/nlp_palm2.0_text-generation_english-base' + input_zh = """ + 
本文总结了十个可穿戴产品的设计原则,而这些原则,同样也是笔者认为是这个行业最吸引人的地方: + 1.为人们解决重复性问题;2.从人开始,而不是从机器开始;3.要引起注意,但不要刻意;4.提升用户能力,而不是取代 + """ + input_en = """ + The Director of Public Prosecutions who let off Lord Janner over alleged child sex abuse started + her career at a legal chambers when the disgraced Labour peer was a top QC there . Alison Saunders , + 54 , sparked outrage last week when she decided the 86-year-old should not face astring of charges + of paedophilia against nine children because he has dementia . Today , newly-released documents + revealed damning evidence that abuse was covered up by police andsocial workers for more than 20 years . + And now it has emerged Mrs Saunders ' law career got off to a flying start when she secured her + pupillage -- a barrister 's training contract at 1 Garden Court Chambers in London in 1983 . + """ @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run(self): - cache_path = snapshot_download(self.model_id) - preprocessor = TextGenerationPreprocessor( - cache_path, first_sequence='sentence', second_sequence=None) - model = PalmForTextGenerationModel( - cache_path, tokenizer=preprocessor.tokenizer) - pipeline1 = TextGenerationPipeline(model, preprocessor) - pipeline2 = pipeline( - Tasks.text_generation, model=model, preprocessor=preprocessor) - print(f'input: {self.input1}\npipeline1: {pipeline1(self.input1)}') - print() - print(f'input: {self.input2}\npipeline2: {pipeline2(self.input2)}') + for model_id, input in ((self.model_id_zh, self.input_zh), + (self.model_id_en, self.input_en)): + cache_path = snapshot_download(model_id) + model = PalmForTextGeneration(cache_path) + preprocessor = TextGenerationPreprocessor( + cache_path, + model.tokenizer, + first_sequence='sentence', + second_sequence=None) + pipeline1 = TextGenerationPipeline(model, preprocessor) + pipeline2 = pipeline( + Tasks.text_generation, model=model, preprocessor=preprocessor) + print( + f'pipeline1: {pipeline1(input)}\npipeline2: {pipeline2(input)}' + ) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_model_from_modelhub(self): - model = Model.from_pretrained(self.model_id) - preprocessor = TextGenerationPreprocessor( - model.model_dir, first_sequence='sentence', second_sequence=None) - pipeline_ins = pipeline( - task=Tasks.text_generation, model=model, preprocessor=preprocessor) - print(pipeline_ins(self.input1)) + for model_id, input in ((self.model_id_zh, self.input_zh), + (self.model_id_en, self.input_en)): + model = Model.from_pretrained(model_id) + preprocessor = TextGenerationPreprocessor( + model.model_dir, + model.tokenizer, + first_sequence='sentence', + second_sequence=None) + pipeline_ins = pipeline( + task=Tasks.text_generation, + model=model, + preprocessor=preprocessor) + print(pipeline_ins(input)) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_model_name(self): - pipeline_ins = pipeline( - task=Tasks.text_generation, model=self.model_id) - print(pipeline_ins(self.input2)) + for model_id, input in ((self.model_id_zh, self.input_zh), + (self.model_id_en, self.input_en)): + pipeline_ins = pipeline(task=Tasks.text_generation, model=model_id) + print(pipeline_ins(input)) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_default_model(self): pipeline_ins = pipeline(task=Tasks.text_generation) - print(pipeline_ins(self.input2)) + print(pipeline_ins(self.input_zh)) if __name__ == '__main__': diff --git 
a/tests/pipelines/test_text_to_speech.py b/tests/pipelines/test_text_to_speech.py
new file mode 100644
index 00000000..c9b988a1
--- /dev/null
+++ b/tests/pipelines/test_text_to_speech.py
@@ -0,0 +1,60 @@
+import time
+import unittest
+
+import json
+import tensorflow as tf
+# NOTICE: Tensorflow 1.15 does not seem fully compatible with pytorch.
+# A segmentation fault may be raised by the pytorch cpp library
+# if 'import tensorflow' appears in front of 'import torch'.
+# Putting an 'import torch' here bypasses this incompatibility.
+import torch
+from scipy.io.wavfile import write
+
+from modelscope.fileio import File
+from modelscope.models import Model, build_model
+from modelscope.models.audio.tts.am import SambertNetHifi16k
+from modelscope.models.audio.tts.vocoder import AttrDict, Hifigan16k
+from modelscope.pipelines import pipeline
+from modelscope.preprocessors import build_preprocessor
+from modelscope.utils.constant import Fields, InputFields, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase):
+
+    def test_pipeline(self):
+        lang_type = 'pinyin'
+        text = '明天天气怎么样'
+        preprocessor_model_id = 'damo/speech_binary_tts_frontend_resource'
+        am_model_id = 'damo/speech_sambert16k_tts_zhitian_emo'
+        voc_model_id = 'damo/speech_hifigan16k_tts_zhitian_emo'
+
+        cfg_preprocessor = dict(
+            type='text_to_tacotron_symbols',
+            model_name=preprocessor_model_id,
+            lang_type=lang_type)
+        preprocessor = build_preprocessor(cfg_preprocessor, Fields.audio)
+        self.assertTrue(preprocessor is not None)
+
+        am = Model.from_pretrained(am_model_id)
+        self.assertTrue(am is not None)
+
+        voc = Model.from_pretrained(voc_model_id)
+        self.assertTrue(voc is not None)
+
+        sambert_tts = pipeline(
+            pipeline_name='tts-sambert-hifigan-16k',
+            config_file='',
+            model=[am, voc],
+            preprocessor=preprocessor)
+        self.assertTrue(sambert_tts is not None)
+
+        output = sambert_tts(text)
+        self.assertTrue(len(output['output']) > 0)
+        write('output.wav', 16000, output['output'])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_word_segmentation.py b/tests/pipelines/test_word_segmentation.py
new file mode 100644
index 00000000..4ec2bf29
--- /dev/null
+++ b/tests/pipelines/test_word_segmentation.py
@@ -0,0 +1,62 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import shutil +import unittest + +from maas_hub.snapshot_download import snapshot_download + +from modelscope.models import Model +from modelscope.models.nlp import StructBertForTokenClassification +from modelscope.pipelines import WordSegmentationPipeline, pipeline +from modelscope.preprocessors import TokenClassifcationPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.hub import get_model_cache_dir +from modelscope.utils.test_utils import test_level + + +class WordSegmentationTest(unittest.TestCase): + model_id = 'damo/nlp_structbert_word-segmentation_chinese-base' + sentence = '今天天气不错,适合出去游玩' + + def setUp(self) -> None: + # switch to False if downloading everytime is not desired + purge_cache = True + if purge_cache: + shutil.rmtree( + get_model_cache_dir(self.model_id), ignore_errors=True) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_by_direct_model_download(self): + cache_path = snapshot_download(self.model_id) + tokenizer = TokenClassifcationPreprocessor(cache_path) + model = StructBertForTokenClassification( + cache_path, tokenizer=tokenizer) + pipeline1 = WordSegmentationPipeline(model, preprocessor=tokenizer) + pipeline2 = pipeline( + Tasks.word_segmentation, model=model, preprocessor=tokenizer) + print(f'sentence: {self.sentence}\n' + f'pipeline1:{pipeline1(input=self.sentence)}') + print() + print(f'pipeline2: {pipeline2(input=self.sentence)}') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + tokenizer = TokenClassifcationPreprocessor(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.word_segmentation, model=model, preprocessor=tokenizer) + print(pipeline_ins(input=self.sentence)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.word_segmentation, model=self.model_id) + print(pipeline_ins(input=self.sentence)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_default_model(self): + pipeline_ins = pipeline(task=Tasks.word_segmentation) + print(pipeline_ins(input=self.sentence)) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/preprocessors/test_image.py b/tests/preprocessors/test_image.py index cfa7b11d..21ae780e 100644 --- a/tests/preprocessors/test_image.py +++ b/tests/preprocessors/test_image.py @@ -11,9 +11,7 @@ from modelscope.utils.logger import get_logger class ImagePreprocessorTest(unittest.TestCase): def test_load(self): - img = load_image( - 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png' - ) + img = load_image('data/test/images/image_matting.png') self.assertTrue(isinstance(img, PIL.Image.Image)) self.assertEqual(img.size, (948, 533)) diff --git a/tests/preprocessors/test_text_to_speech.py b/tests/preprocessors/test_text_to_speech.py new file mode 100644 index 00000000..18b66987 --- /dev/null +++ b/tests/preprocessors/test_text_to_speech.py @@ -0,0 +1,28 @@ +import shutil +import unittest + +from modelscope.preprocessors import build_preprocessor +from modelscope.utils.constant import Fields, InputFields +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +class TtsPreprocessorTest(unittest.TestCase): + + def test_preprocess(self): + lang_type = 'pinyin' + text = '今天天气不错,我们去散步吧。' + cfg = dict( + 
type='text_to_tacotron_symbols', + model_name='damo/speech_binary_tts_frontend_resource', + lang_type=lang_type) + preprocessor = build_preprocessor(cfg, Fields.audio) + output = preprocessor(text) + self.assertTrue(output) + for line in output['texts']: + print(line) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pydatasets/test_py_dataset.py b/tests/pydatasets/test_py_dataset.py index 7accd814..4ad767fa 100644 --- a/tests/pydatasets/test_py_dataset.py +++ b/tests/pydatasets/test_py_dataset.py @@ -2,42 +2,111 @@ import unittest import datasets as hfdata +from modelscope.models import Model +from modelscope.preprocessors import SequenceClassificationPreprocessor +from modelscope.preprocessors.base import Preprocessor from modelscope.pydatasets import PyDataset +from modelscope.utils.constant import Hubs +from modelscope.utils.test_utils import require_tf, require_torch, test_level + + +class ImgPreprocessor(Preprocessor): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.path_field = kwargs.pop('image_path', 'image_path') + self.width = kwargs.pop('width', 'width') + self.height = kwargs.pop('height', 'width') + + def __call__(self, data): + import cv2 + image_path = data.get(self.path_field) + if not image_path: + return None + img = cv2.imread(image_path) + return { + 'image': + cv2.resize(img, + (data.get(self.height, 128), data.get(self.width, 128))) + } class PyDatasetTest(unittest.TestCase): - def setUp(self): - # ds1 initialized from in memory json - self.json_data = { - 'dummy': [{ - 'a': i, - 'x': i * 10, - 'c': i * 100 - } for i in range(1, 11)] - } - hfds1 = hfdata.Dataset.from_dict(self.json_data) - self.ds1 = PyDataset.from_hf_dataset(hfds1) + def test_ds_basic(self): + ms_ds_full = PyDataset.load('squad') + ms_ds_full_hf = hfdata.load_dataset('squad') + ms_ds_train = PyDataset.load('squad', split='train') + ms_ds_train_hf = hfdata.load_dataset('squad', split='train') + ms_image_train = PyDataset.from_hf_dataset( + hfdata.load_dataset('beans', split='train')) + self.assertEqual(ms_ds_full['train'][0], ms_ds_full_hf['train'][0]) + self.assertEqual(ms_ds_full['validation'][0], + ms_ds_full_hf['validation'][0]) + self.assertEqual(ms_ds_train[0], ms_ds_train_hf[0]) + print(next(iter(ms_ds_full['train']))) + print(next(iter(ms_ds_train))) + print(next(iter(ms_image_train))) - # ds2 initialized from hg hub - hfds2 = hfdata.load_dataset( - 'glue', 'mrpc', revision='2.0.0', split='train') - self.ds2 = PyDataset.from_hf_dataset(hfds2) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @require_torch + def test_to_torch_dataset_text(self): + model_id = 'damo/bert-base-sst2' + nlp_model = Model.from_pretrained(model_id) + preprocessor = SequenceClassificationPreprocessor( + nlp_model.model_dir, + first_sequence='context', + second_sequence=None) + ms_ds_train = PyDataset.load('squad', split='train') + pt_dataset = ms_ds_train.to_torch_dataset(preprocessors=preprocessor) + import torch + dataloader = torch.utils.data.DataLoader(pt_dataset, batch_size=5) + print(next(iter(dataloader))) - def tearDown(self): - pass + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @require_tf + def test_to_tf_dataset_text(self): + import tensorflow as tf + tf.compat.v1.enable_eager_execution() + model_id = 'damo/bert-base-sst2' + nlp_model = Model.from_pretrained(model_id) + preprocessor = SequenceClassificationPreprocessor( + nlp_model.model_dir, + first_sequence='context', + 
second_sequence=None)
+        ms_ds_train = PyDataset.load('squad', split='train')
+        tf_dataset = ms_ds_train.to_tf_dataset(
+            batch_size=5,
+            shuffle=True,
+            preprocessors=preprocessor,
+            drop_remainder=True)
+        print(next(iter(tf_dataset)))

-    def test_to_hf_dataset(self):
-        hfds = self.ds1.to_hf_dataset()
-        hfds1 = hfdata.Dataset.from_dict(self.json_data)
-        self.assertEqual(hfds.data, hfds1.data)
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @require_torch
+    def test_to_torch_dataset_img(self):
+        ms_image_train = PyDataset.from_hf_dataset(
+            hfdata.load_dataset('beans', split='train'))
+        pt_dataset = ms_image_train.to_torch_dataset(
+            preprocessors=ImgPreprocessor(
+                image_path='image_file_path', label='labels'))
+        import torch
+        dataloader = torch.utils.data.DataLoader(pt_dataset, batch_size=5)
+        print(next(iter(dataloader)))

-        # simple map function
-        hfds = hfds.map(lambda e: {'new_feature': e['dummy']['a']})
-        self.assertEqual(len(hfds['new_feature']), 10)
-
-        hfds2 = self.ds2.to_hf_dataset()
-        self.assertTrue(hfds2[0]['sentence1'].startswith('Amrozi'))
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @require_tf
+    def test_to_tf_dataset_img(self):
+        import tensorflow as tf
+        tf.compat.v1.enable_eager_execution()
+        ms_image_train = PyDataset.load('beans', split='train')
+        tf_dataset = ms_image_train.to_tf_dataset(
+            batch_size=5,
+            shuffle=True,
+            preprocessors=ImgPreprocessor(image_path='image_file_path'),
+            drop_remainder=True,
+            label_cols='labels')
+        print(next(iter(tf_dataset)))

 if __name__ == '__main__':
diff --git a/tests/run.py b/tests/run.py
index 9f5d62a7..a904ba8e 100644
--- a/tests/run.py
+++ b/tests/run.py
@@ -7,6 +7,12 @@ import sys
 import unittest
 from fnmatch import fnmatch

+# NOTICE: Tensorflow 1.15 does not seem fully compatible with pytorch.
+# A segmentation fault may be raised by the pytorch cpp library
+# if 'import tensorflow' appears in front of 'import torch'.
+# Putting an 'import torch' here bypasses this incompatibility.
+import torch
+
 from modelscope.utils.logger import get_logger
 from modelscope.utils.test_utils import set_test_level, test_level