mirror of https://github.com/modelscope/modelscope.git
synced 2025-12-24 03:59:23 +01:00

merge with master

3 .gitattributes vendored Normal file
@@ -0,0 +1,3 @@
*.png filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
*.mp4 filter=lfs diff=lfs merge=lfs -text
6 .gitignore vendored
@@ -24,6 +24,7 @@ wheels/
.installed.cfg
*.egg
/package
/temp
MANIFEST

# PyInstaller
@@ -104,7 +105,6 @@ venv.bak/
# mypy
.mypy_cache/

data
.vscode
.idea

@@ -124,3 +124,7 @@ replace.sh

# Pytorch
*.pth


# audio
*.wav
3 data/test/images/image1.jpg Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:78094cc48fbcfd9b6d321fe13619ecc72b65e006fc1b4c4458409ade9979486d
size 129862

3 data/test/images/image_captioning.png Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:af83a94899a6d23339c3ecc5c4c58c57c835af57b531a2f4c50461184f820141
size 603621

3 data/test/images/image_matting.png Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:af83a94899a6d23339c3ecc5c4c58c57c835af57b531a2f4c50461184f820141
size 603621

3 data/test/images/ocr_detection.jpg Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5c8435db5583400be5d11a2c17910c96133b462c8a99ccaf0e19f4aac34e0a94
size 141149
@@ -91,6 +91,55 @@ make tests

4. Daily regression tests run all cases at 0:00 each day using the master branch.

### 2.3 Test data storage

As we need a lot of data for testing, including images, videos, and models, we use git lfs
to store those large files.

1. Install git-lfs.

For macOS:
```bash
brew install git-lfs
git lfs install
```

For CentOS, download the rpm from the git-lfs GitHub releases [website](https://github.com/git-lfs/git-lfs/releases/tag/v3.2.0):
```bash
wget http://101374-public.oss-cn-hangzhou-zmf.aliyuncs.com/git-lfs-3.2.0-1.el7.x86_64.rpm
sudo rpm -ivh git-lfs-3.2.0-1.el7.x86_64.rpm
git lfs install
```

For Ubuntu:
```bash
curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
sudo apt-get install git-lfs
git lfs install
```

2. Track your data type using git lfs; for example, to track png files:
```bash
git lfs track "*.png"
```
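
For reference, `git lfs track` records each tracked pattern in `.gitattributes` (the `.gitattributes` added by this commit contains exactly these entries); remember to `git add .gitattributes` together with your data:
```bash
cat .gitattributes
# *.png filter=lfs diff=lfs merge=lfs -text
# *.jpg filter=lfs diff=lfs merge=lfs -text
# *.mp4 filter=lfs diff=lfs merge=lfs -text
```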

3. Add your test files to the `data/test/` folder; you can create directories if needed.
```bash
git add data/test/test.png
```

4. Commit your test data to the remote branch:
```bash
git commit -m "xxx"
```

To pull data from the remote repo, use the same workflow as for regular git files:
```bash
git pull origin branch_name
```
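
If pulled LFS files still show up as small pointer text files (for example when git-lfs was installed after the repo was cloned), the actual contents can be fetched explicitly; a sketch using the standard git-lfs command:
```bash
git lfs pull origin
```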


## Code Review

1. Run the following command to create an aone CR, replacing `TARGET_BRANCH` and `CR_NAME` with the values you want.

@@ -29,3 +29,15 @@ reference: [https://huggingface.co/docs/tokenizers/installation#installation-fro

> ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.

Because of incompatible versions among dependent libraries, version conflicts may occur; in most cases they do not affect normal operation.

### 3. Version errors when installing PyTorch

> ERROR: Ignored the following versions that require a different python version: 1.1.0 Requires-Python >=3.8; 1.1.0rc1 Requires-Python >=3.8; 1.1.1 Requires-Python >=3.8
> ERROR: Could not find a version that satisfies the requirement torch==1.8.1+cu111 (from versions: 1.0.0, 1.0.1, 1.0.1.post2, 1.1.0, 1.2.0, 1.3.0, 1.3.1, 1.4.0, 1.5.0, 1.5.1, 1.6.0, 1.7.0, 1.7.1, 1.8.0, 1.8.1, 1.9.0, 1.9.1, 1.10.0, 1.10.1, 1.10.2, 1.11.0)
> ERROR: No matching distribution found for torch==1.8.1+cu111

Use the following command when installing:

```shell
pip install -f https://download.pytorch.org/whl/torch_stable.html -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt
```
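
The `-f` flag above adds the PyTorch wheel index, which hosts CUDA-specific builds such as `torch==1.8.1+cu111` that are not on PyPI. As a sketch, the wheel can also be installed directly, assuming your Python version has a matching build:
```shell
pip install torch==1.8.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html
```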
@@ -25,6 +25,10 @@ ModelScope Library currently supports the TensorFlow and PyTorch deep learning frameworks for
* [PyTorch installation guide](https://pytorch.org/get-started/locally/)
* [TensorFlow installation guide](https://www.tensorflow.org/install/pip)

Some third-party dependencies require numpy to be installed in advance:
```
pip install numpy
```

## ModelScope library installation

@@ -1,5 +1,8 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from .audio.tts.am import SambertNetHifi16k
from .audio.tts.vocoder import Hifigan16k
from .base import Model
from .builder import MODELS, build_model
from .multi_model import OfaForImageCaptioning
from .nlp import BertForSequenceClassification, SbertForSentenceSimilarity
0 modelscope/models/audio/__init__.py Normal file
0 modelscope/models/audio/layers/__init__.py Normal file
60 modelscope/models/audio/layers/activations.py Normal file
@@ -0,0 +1,60 @@
import torch.nn as nn

from .layer_base import LayerBase


class RectifiedLinear(LayerBase):

    def __init__(self, input_dim, output_dim):
        super(RectifiedLinear, self).__init__()
        self.dim = input_dim
        self.relu = nn.ReLU()

    def forward(self, input):
        return self.relu(input)

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<RectifiedLinear> %d %d\n' % (self.dim, self.dim)
        return re_str

    def load_kaldi_nnet(self, instr):
        return instr


class LogSoftmax(LayerBase):

    def __init__(self, input_dim, output_dim):
        super(LogSoftmax, self).__init__()
        self.dim = input_dim
        self.ls = nn.LogSoftmax()

    def forward(self, input):
        return self.ls(input)

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<Softmax> %d %d\n' % (self.dim, self.dim)
        return re_str

    def load_kaldi_nnet(self, instr):
        return instr


class Sigmoid(LayerBase):

    def __init__(self, input_dim, output_dim):
        super(Sigmoid, self).__init__()
        self.dim = input_dim
        self.sig = nn.Sigmoid()

    def forward(self, input):
        return self.sig(input)

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<Sigmoid> %d %d\n' % (self.dim, self.dim)
        return re_str

    def load_kaldi_nnet(self, instr):
        return instr
78 modelscope/models/audio/layers/affine_transform.py Normal file
@@ -0,0 +1,78 @@
import numpy as np
import torch as th
import torch.nn as nn

from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number,
                         to_kaldi_matrix)


class AffineTransform(LayerBase):

    def __init__(self, input_dim, output_dim):
        super(AffineTransform, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, input):
        return self.linear(input)

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<AffineTransform> %d %d\n' % (self.output_dim,
                                                 self.input_dim)
        re_str += '<LearnRateCoef> 1 <BiasLearnRateCoef> 1 <MaxNorm> 0\n'
        linear_weights = self.state_dict()['linear.weight']
        x = linear_weights.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        linear_bias = self.state_dict()['linear.bias']
        x = linear_bias.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        return re_str

    def to_raw_nnet(self, fid):
        linear_weights = self.state_dict()['linear.weight']
        x = linear_weights.squeeze().numpy()
        x.tofile(fid)

        linear_bias = self.state_dict()['linear.bias']
        x = linear_bias.squeeze().numpy()
        x.tofile(fid)

    def load_kaldi_nnet(self, instr):
        output = expect_token_number(
            instr,
            '<LearnRateCoef>',
        )
        if output is None:
            raise Exception('AffineTransform format error for <LearnRateCoef>')
        instr, lr = output

        output = expect_token_number(instr, '<BiasLearnRateCoef>')
        if output is None:
            raise Exception(
                'AffineTransform format error for <BiasLearnRateCoef>')
        instr, lr = output

        output = expect_token_number(instr, '<MaxNorm>')
        if output is None:
            raise Exception('AffineTransform format error for <MaxNorm>')
        instr, lr = output

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('AffineTransform format error for parsing matrix')
        instr, mat = output

        print(mat.shape)
        self.linear.weight = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('AffineTransform format error for parsing matrix')
        instr, mat = output
        mat = np.squeeze(mat)
        self.linear.bias = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))
        return instr
178 modelscope/models/audio/layers/deep_fsmn.py Normal file
@@ -0,0 +1,178 @@
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F

from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number,
                         to_kaldi_matrix)


class DeepFsmn(LayerBase):

    def __init__(self,
                 input_dim,
                 output_dim,
                 lorder=None,
                 rorder=None,
                 hidden_size=None,
                 layer_norm=False,
                 dropout=0):
        super(DeepFsmn, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim

        if lorder is None:
            return

        self.lorder = lorder
        self.rorder = rorder
        self.hidden_size = hidden_size
        self.layer_norm = layer_norm

        self.linear = nn.Linear(input_dim, hidden_size)
        self.norm = nn.LayerNorm(hidden_size)
        self.drop1 = nn.Dropout(p=dropout)
        self.drop2 = nn.Dropout(p=dropout)
        self.project = nn.Linear(hidden_size, output_dim, bias=False)

        self.conv1 = nn.Conv2d(
            output_dim,
            output_dim, [lorder, 1], [1, 1],
            groups=output_dim,
            bias=False)
        self.conv2 = nn.Conv2d(
            output_dim,
            output_dim, [rorder, 1], [1, 1],
            groups=output_dim,
            bias=False)

    def forward(self, input):

        f1 = F.relu(self.linear(input))

        f1 = self.drop1(f1)
        if self.layer_norm:
            f1 = self.norm(f1)

        p1 = self.project(f1)

        x = th.unsqueeze(p1, 1)

        x_per = x.permute(0, 3, 2, 1)

        y = F.pad(x_per, [0, 0, self.lorder - 1, 0])
        yr = F.pad(x_per, [0, 0, 0, self.rorder])
        yr = yr[:, :, 1:, :]

        out = x_per + self.conv1(y) + self.conv2(yr)
        out = self.drop2(out)

        out1 = out.permute(0, 3, 2, 1)

        return input + out1.squeeze()

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<UniDeepFsmn> %d %d\n'\
            % (self.output_dim, self.input_dim)
        re_str += '<LearnRateCoef> %d <HidSize> %d <LOrder> %d <LStride> %d <MaxNorm> 0\n'\
            % (1, self.hidden_size, self.lorder, 1)
        lfiters = self.state_dict()['conv1.weight']
        x = np.flipud(lfiters.squeeze().numpy().T)
        re_str += to_kaldi_matrix(x)
        proj_weights = self.state_dict()['project.weight']
        x = proj_weights.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        linear_weights = self.state_dict()['linear.weight']
        x = linear_weights.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        linear_bias = self.state_dict()['linear.bias']
        x = linear_bias.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        return re_str

    def load_kaldi_nnet(self, instr):
        output = expect_token_number(
            instr,
            '<LearnRateCoef>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <LearnRateCoef>')
        instr, lr = output

        output = expect_token_number(
            instr,
            '<HidSize>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <HidSize>')
        instr, hiddensize = output
        self.hidden_size = int(hiddensize)

        output = expect_token_number(
            instr,
            '<LOrder>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <LOrder>')
        instr, lorder = output
        self.lorder = int(lorder)

        output = expect_token_number(
            instr,
            '<LStride>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <LStride>')
        instr, lstride = output
        self.lstride = lstride

        output = expect_token_number(
            instr,
            '<MaxNorm>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <MaxNorm>')

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        mat1 = np.fliplr(mat.T).copy()
        self.conv1 = nn.Conv2d(
            self.output_dim,
            self.output_dim, [self.lorder, 1], [1, 1],
            groups=self.output_dim,
            bias=False)
        mat_th = th.from_numpy(mat1).type(th.FloatTensor)
        mat_th = mat_th.unsqueeze(1)
        mat_th = mat_th.unsqueeze(3)
        self.conv1.weight = th.nn.Parameter(mat_th)

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output

        self.project = nn.Linear(self.hidden_size, self.output_dim, bias=False)
        self.linear = nn.Linear(self.input_dim, self.hidden_size)

        self.project.weight = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        self.linear.weight = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        self.linear.bias = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        return instr
50 modelscope/models/audio/layers/layer_base.py Normal file
@@ -0,0 +1,50 @@
import abc
import re

import numpy as np
import torch.nn as nn


def expect_token_number(instr, token):
    first_token = re.match(r'^\s*' + token, instr)
    if first_token is None:
        return None
    instr = instr[first_token.end():]
    lr = re.match(r'^\s*(-?\d+\.?\d*e?-?\d*?)', instr)
    if lr is None:
        return None
    return instr[lr.end():], lr.groups()[0]


def expect_kaldi_matrix(instr):
    pos2 = instr.find('[', 0)
    pos3 = instr.find(']', pos2)
    mat = []
    for stt in instr[pos2 + 1:pos3].split('\n'):
        tmp_mat = np.fromstring(stt, dtype=np.float32, sep=' ')
        if tmp_mat.size > 0:
            mat.append(tmp_mat)
    return instr[pos3 + 1:], np.array(mat)


def to_kaldi_matrix(np_mat):
    """
    Transform a numpy matrix into a standard Kaldi text-format matrix string.
    :param np_mat: numpy mat
    :return: str
    """
    np.set_printoptions(threshold=np.inf, linewidth=np.nan, suppress=True)
    out_str = str(np_mat)
    out_str = out_str.replace('[', '')
    out_str = out_str.replace(']', '')
    return '[ %s ]\n' % out_str


class LayerBase(nn.Module, metaclass=abc.ABCMeta):

    def __init__(self):
        super(LayerBase, self).__init__()

    @abc.abstractmethod
    def to_kaldi_nnet(self):
        pass
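
For illustration, a minimal sketch (the import path below is assumed; adjust it to wherever layer_base.py lives) of the round trip between a numpy matrix and the Kaldi text format used by `to_kaldi_nnet`/`load_kaldi_nnet`:

```python
import numpy as np

# assumed import path for the helpers defined above
from modelscope.models.audio.layers.layer_base import (expect_kaldi_matrix,
                                                       to_kaldi_matrix)

w = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
text = to_kaldi_matrix(w)            # '[ 1. 2.\n 3. 4. ]\n' (spacing may vary)
rest, parsed = expect_kaldi_matrix(text)
assert np.allclose(parsed, w)        # round trip recovers the matrix
```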
482 modelscope/models/audio/layers/uni_deep_fsmn.py Normal file
@@ -0,0 +1,482 @@
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F

from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number,
                         to_kaldi_matrix)


class SepConv(nn.Module):

    def __init__(self,
                 in_channels,
                 filters,
                 out_channels,
                 kernel_size=(5, 2),
                 dilation=(1, 1)):
        """ :param kernel_size (time, frequency)

        """
        super(SepConv, self).__init__()
        # depthwise + pointwise
        self.dconv = nn.Conv2d(
            in_channels,
            in_channels * filters,
            kernel_size,
            dilation=dilation,
            groups=in_channels)
        self.pconv = nn.Conv2d(
            in_channels * filters, out_channels, kernel_size=1)
        self.padding = dilation[0] * (kernel_size[0] - 1)

    def forward(self, input):
        ''' input: [B, C, T, F]
        '''
        x = F.pad(input, [0, 0, self.padding, 0])
        x = self.dconv(x)
        x = self.pconv(x)
        return x


class Conv2d(nn.Module):

    def __init__(self,
                 input_dim,
                 output_dim,
                 lorder=20,
                 rorder=0,
                 groups=1,
                 bias=False,
                 skip_connect=True):
        super(Conv2d, self).__init__()
        self.lorder = lorder
        self.conv = nn.Conv2d(
            input_dim, output_dim, [lorder, 1], groups=groups, bias=bias)
        self.rorder = rorder
        if self.rorder:
            self.conv2 = nn.Conv2d(
                input_dim, output_dim, [rorder, 1], groups=groups, bias=bias)
        self.skip_connect = skip_connect

    def forward(self, input):
        # [B, 1, T, F]
        x = th.unsqueeze(input, 1)
        # [B, F, T, 1]
        x_per = x.permute(0, 3, 2, 1)
        y = F.pad(x_per, [0, 0, self.lorder - 1, 0])
        out = self.conv(y)
        if self.rorder:
            yr = F.pad(x_per, [0, 0, 0, self.rorder])
            yr = yr[:, :, 1:, :]
            out += self.conv2(yr)
        out = out.permute(0, 3, 2, 1).squeeze(1)
        if self.skip_connect:
            out = out + input
        return out


class SelfAttLayer(nn.Module):

    def __init__(self, input_dim, output_dim, lorder=None, hidden_size=None):
        super(SelfAttLayer, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim

        if lorder is None:
            return

        self.lorder = lorder
        self.hidden_size = hidden_size

        self.linear = nn.Linear(input_dim, hidden_size)

        self.project = nn.Linear(hidden_size, output_dim, bias=False)

        self.att = nn.Linear(input_dim, lorder, bias=False)

    def forward(self, input):

        f1 = F.relu(self.linear(input))

        p1 = self.project(f1)

        x = th.unsqueeze(p1, 1)

        x_per = x.permute(0, 3, 2, 1)

        y = F.pad(x_per, [0, 0, self.lorder - 1, 0])

        # z [B, F, T, lorder]
        z = x_per
        for i in range(1, self.lorder):
            z = th.cat([z, y[:, :, self.lorder - 1 - i:-i, :]], axis=-1)

        # [B, T, lorder]
        att = F.softmax(self.att(input), dim=-1)
        att = th.unsqueeze(att, 1)
        z = th.sum(z * att, axis=-1)

        out1 = z.permute(0, 2, 1)

        return input + out1


class TFFsmn(nn.Module):

    def __init__(self,
                 input_dim,
                 output_dim,
                 lorder=None,
                 hidden_size=None,
                 dilation=1,
                 layer_norm=False,
                 dropout=0,
                 skip_connect=True):
        super(TFFsmn, self).__init__()

        self.skip_connect = skip_connect

        self.linear = nn.Linear(input_dim, hidden_size)
        self.norm = nn.Identity()
        if layer_norm:
            self.norm = nn.LayerNorm(input_dim)
        self.act = nn.ReLU()
        self.project = nn.Linear(hidden_size, output_dim, bias=False)

        self.conv1 = nn.Conv2d(
            output_dim,
            output_dim, [lorder, 1],
            dilation=[dilation, 1],
            groups=output_dim,
            bias=False)
        self.padding_left = dilation * (lorder - 1)
        dorder = 5
        self.conv2 = nn.Conv2d(1, 1, [dorder, 1], bias=False)
        self.padding_freq = dorder - 1

    def forward(self, input):
        return self.compute1(input)

    def compute1(self, input):
        ''' linear-dconv-relu(norm)-linear-dconv
        '''
        x = self.linear(input)
        # [B, 1, F, T]
        x = th.unsqueeze(x, 1).permute(0, 1, 3, 2)
        z = F.pad(x, [0, 0, self.padding_freq, 0])
        z = self.conv2(z) + x
        x = z.permute(0, 3, 2, 1).squeeze(-1)
        x = self.act(x)
        x = self.norm(x)
        x = self.project(x)
        x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
        # [B, F, T+lorder-1, 1]
        y = F.pad(x, [0, 0, self.padding_left, 0])
        out = self.conv1(y)
        if self.skip_connect:
            out = out + x
        out = out.permute(0, 3, 2, 1).squeeze()

        return input + out


class CNNFsmn(nn.Module):
    ''' use cnn to reduce parameters
    '''

    def __init__(self,
                 input_dim,
                 output_dim,
                 lorder=None,
                 hidden_size=None,
                 dilation=1,
                 layer_norm=False,
                 dropout=0,
                 skip_connect=True):
        super(CNNFsmn, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.skip_connect = skip_connect

        if lorder is None:
            return

        self.lorder = lorder
        self.hidden_size = hidden_size

        self.linear = nn.Linear(input_dim, hidden_size)
        self.act = nn.ReLU()
        kernel_size = (3, 8)
        stride = (1, 4)
        self.conv = nn.Sequential(
            nn.ConstantPad2d((stride[1], 0, kernel_size[0] - 1, 0), 0),
            nn.Conv2d(1, stride[1], kernel_size=kernel_size, stride=stride))

        self.dconv = nn.Conv2d(
            output_dim,
            output_dim, [lorder, 1],
            dilation=[dilation, 1],
            groups=output_dim,
            bias=False)
        self.padding_left = dilation * (lorder - 1)

    def forward(self, input):
        return self.compute2(input)

    def compute1(self, input):
        ''' linear-relu(norm)-conv2d-relu?-dconv
        '''
        # [B, T, F]
        x = self.linear(input)
        x = self.act(x)
        x = th.unsqueeze(x, 1)
        x = self.conv(x)
        # [B, C, T, F] -> [B, 1, T, F]
        b, c, t, f = x.shape
        x = x.view([b, 1, t, -1])
        x = x.permute(0, 3, 2, 1)
        # [B, F, T+lorder-1, 1]
        y = F.pad(x, [0, 0, self.padding_left, 0])
        out = self.dconv(y)
        if self.skip_connect:
            out = out + x
        out = out.permute(0, 3, 2, 1).squeeze()
        return input + out

    def compute2(self, input):
        ''' conv2d-relu-linear-relu?-dconv
        '''
        x = th.unsqueeze(input, 1)
        x = self.conv(x)
        x = self.act(x)
        # [B, C, T, F] -> [B, T, F]
        b, c, t, f = x.shape
        x = x.view([b, t, -1])
        x = self.linear(x)
        x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
        y = F.pad(x, [0, 0, self.padding_left, 0])
        out = self.dconv(y)
        if self.skip_connect:
            out = out + x
        out = out.permute(0, 3, 2, 1).squeeze()
        return input + out


class UniDeepFsmn(LayerBase):

    def __init__(self,
                 input_dim,
                 output_dim,
                 lorder=None,
                 hidden_size=None,
                 dilation=1,
                 layer_norm=False,
                 dropout=0,
                 skip_connect=True):
        super(UniDeepFsmn, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.skip_connect = skip_connect

        if lorder is None:
            return

        self.lorder = lorder
        self.hidden_size = hidden_size

        self.linear = nn.Linear(input_dim, hidden_size)
        self.norm = nn.Identity()
        if layer_norm:
            self.norm = nn.LayerNorm(input_dim)
        self.act = nn.ReLU()
        self.project = nn.Linear(hidden_size, output_dim, bias=False)

        self.conv1 = nn.Conv2d(
            output_dim,
            output_dim, [lorder, 1],
            dilation=[dilation, 1],
            groups=output_dim,
            bias=False)
        self.padding_left = dilation * (lorder - 1)

    def forward(self, input):
        return self.compute1(input)

    def compute1(self, input):
        ''' linear-relu(norm)-linear-dconv
        '''
        # [B, T, F]
        x = self.linear(input)
        x = self.act(x)
        x = self.norm(x)
        x = self.project(x)
        x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
        # [B, F, T+lorder-1, 1]
        y = F.pad(x, [0, 0, self.padding_left, 0])
        out = self.conv1(y)
        if self.skip_connect:
            out = out + x
        out = out.permute(0, 3, 2, 1).squeeze()

        return input + out

    def compute2(self, input):
        ''' linear-dconv-linear-relu(norm)
        '''
        x = self.project(input)
        x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
        y = F.pad(x, [0, 0, self.padding_left, 0])
        out = self.conv1(y)
        if self.skip_connect:
            out = out + x
        out = out.permute(0, 3, 2, 1).squeeze()
        x = self.linear(out)
        x = self.act(x)
        x = self.norm(x)

        return input + x

    def compute3(self, input):
        ''' dconv-linear-relu(norm)-linear
        '''
        x = th.unsqueeze(input, 1).permute(0, 3, 2, 1)
        y = F.pad(x, [0, 0, self.padding_left, 0])
        out = self.conv1(y)
        if self.skip_connect:
            out = out + x
        out = out.permute(0, 3, 2, 1).squeeze()
        x = self.linear(out)
        x = self.act(x)
        x = self.norm(x)
        x = self.project(x)

        return input + x

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<UniDeepFsmn> %d %d\n' \
            % (self.output_dim, self.input_dim)
        re_str += '<LearnRateCoef> %d <HidSize> %d <LOrder> %d <LStride> %d <MaxNorm> 0\n' \
            % (1, self.hidden_size, self.lorder, 1)
        lfiters = self.state_dict()['conv1.weight']
        x = np.flipud(lfiters.squeeze().numpy().T)
        re_str += to_kaldi_matrix(x)
        proj_weights = self.state_dict()['project.weight']
        x = proj_weights.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        linear_weights = self.state_dict()['linear.weight']
        x = linear_weights.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        linear_bias = self.state_dict()['linear.bias']
        x = linear_bias.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        return re_str

    def to_raw_nnet(self, fid):
        lfiters = self.state_dict()['conv1.weight']
        x = np.flipud(lfiters.squeeze().numpy().T)
        x.tofile(fid)

        proj_weights = self.state_dict()['project.weight']
        x = proj_weights.squeeze().numpy()
        x.tofile(fid)

        linear_weights = self.state_dict()['linear.weight']
        x = linear_weights.squeeze().numpy()
        x.tofile(fid)

        linear_bias = self.state_dict()['linear.bias']
        x = linear_bias.squeeze().numpy()
        x.tofile(fid)

    def load_kaldi_nnet(self, instr):
        output = expect_token_number(
            instr,
            '<LearnRateCoef>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <LearnRateCoef>')
        instr, lr = output

        output = expect_token_number(
            instr,
            '<HidSize>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <HidSize>')
        instr, hiddensize = output
        self.hidden_size = int(hiddensize)

        output = expect_token_number(
            instr,
            '<LOrder>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <LOrder>')
        instr, lorder = output
        self.lorder = int(lorder)

        output = expect_token_number(
            instr,
            '<LStride>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <LStride>')
        instr, lstride = output
        self.lstride = lstride

        output = expect_token_number(
            instr,
            '<MaxNorm>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <MaxNorm>')

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        mat1 = np.fliplr(mat.T).copy()

        self.conv1 = nn.Conv2d(
            self.output_dim,
            self.output_dim, [self.lorder, 1], [1, 1],
            groups=self.output_dim,
            bias=False)

        mat_th = th.from_numpy(mat1).type(th.FloatTensor)
        mat_th = mat_th.unsqueeze(1)
        mat_th = mat_th.unsqueeze(3)
        self.conv1.weight = th.nn.Parameter(mat_th)

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output

        self.project = nn.Linear(self.hidden_size, self.output_dim, bias=False)
        self.linear = nn.Linear(self.input_dim, self.hidden_size)

        self.project.weight = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        self.linear.weight = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        mat = np.squeeze(mat)
        self.linear.bias = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        return instr
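
For illustration, a minimal sketch (import path assumed) of running a single UniDeepFsmn layer; input and output share the [batch, frames, feature_dim] shape because of the residual connection:

```python
import torch

# assumed import path for the layer defined above
from modelscope.models.audio.layers.uni_deep_fsmn import UniDeepFsmn

layer = UniDeepFsmn(input_dim=120, output_dim=120, lorder=20, hidden_size=128)
feats = torch.randn(4, 50, 120)   # [batch, frames, feature_dim]
out = layer(feats)                # causal memory over the past 20 frames
assert out.shape == feats.shape
```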
0 modelscope/models/audio/network/__init__.py Normal file
394 modelscope/models/audio/network/loss.py Normal file
@@ -0,0 +1,394 @@
import torch
import torch.nn.functional as F

from .modulation_loss import (GaborSTRFConv, MelScale,
                              ModulationDomainLossModule)

EPS = 1e-8


def compute_mask(mixed_spec, clean_spec, mask_type='psmiam', clip=1):
    '''
    stft: (batch, ..., 2) or complex(batch, ...)
    y = x + n
    '''
    if torch.is_complex(mixed_spec):
        yr, yi = mixed_spec.real, mixed_spec.imag
    else:
        yr, yi = mixed_spec[..., 0], mixed_spec[..., 1]
    if torch.is_complex(clean_spec):
        xr, xi = clean_spec.real, clean_spec.imag
    else:
        xr, xi = clean_spec[..., 0], clean_spec[..., 1]

    if mask_type == 'iam':
        ymag = torch.sqrt(yr**2 + yi**2)
        xmag = torch.sqrt(xr**2 + xi**2)
        iam = xmag / (ymag + EPS)
        return torch.clamp(iam, 0, 1)

    elif mask_type == 'psm':
        ypow = yr**2 + yi**2
        psm = (xr * yr + xi * yi) / (ypow + EPS)
        return torch.clamp(psm, 0, 1)

    elif mask_type == 'psmiam':
        ypow = yr**2 + yi**2
        psm = (xr * yr + xi * yi) / (ypow + EPS)
        ymag = torch.sqrt(yr**2 + yi**2)
        xmag = torch.sqrt(xr**2 + xi**2)
        iam = xmag / (ymag + EPS)
        psmiam = psm * iam
        return torch.clamp(psmiam, 0, 1)

    elif mask_type == 'crm':
        ypow = yr**2 + yi**2
        mr = (xr * yr + xi * yi) / (ypow + EPS)
        mi = (xi * yr - xr * yi) / (ypow + EPS)
        mr = torch.clamp(mr, -clip, clip)
        mi = torch.clamp(mi, -clip, clip)
        return mr, mi


def energy_vad(spec,
               thdhigh=320 * 600 * 600 * 2,
               thdlow=320 * 300 * 300 * 2,
               int16=True):
    '''
    energy based vad should be accurate enough
    spec: (batch, bins, frames, 2)
    returns (batch, frames)
    '''
    energy = torch.sum(spec[..., 0]**2 + spec[..., 1]**2, dim=1)
    vad = energy > thdhigh
    idx = torch.logical_and(vad == 0, energy > thdlow)
    vad[idx] = 0.5
    return vad


def modulation_loss_init(n_fft):
    gabor_strf_parameters = torch.load(
        './network/gabor_strf_parameters.pt')['state_dict']
    gabor_modulation_kernels = GaborSTRFConv(supn=30, supk=30, nkern=60)
    gabor_modulation_kernels.load_state_dict(gabor_strf_parameters)

    modulation_loss_module = ModulationDomainLossModule(
        gabor_modulation_kernels.eval())
    for param in modulation_loss_module.parameters():
        param.requires_grad = False

    stft2mel = MelScale(
        n_mels=80, sample_rate=16000, n_stft=n_fft // 2 + 1).cuda()

    return modulation_loss_module, stft2mel


def mask_loss_function(
        loss_func='psm_loss',
        loss_type='mse',  # ['mse', 'mae', 'comb']
        mask_type='psmiam',
        use_mod_loss=False,
        use_wav2vec_loss=False,
        n_fft=640,
        hop_length=320,
        EPS=1e-8,
        weight=None):
    if weight is not None:
        print(f'Use loss weight: {weight}')
    winlen = n_fft
    window = torch.hamming_window(winlen, periodic=False)

    def stft(x, return_complex=False):
        # returns [batch, bins, frames, 2]
        return torch.stft(
            x,
            n_fft,
            hop_length,
            winlen,
            window=window.to(x.device),
            center=False,
            return_complex=return_complex)

    def istft(x, slen):
        return torch.istft(
            x,
            n_fft,
            hop_length,
            winlen,
            window=window.to(x.device),
            center=False,
            length=slen)

    def mask_loss(targets, masks, nframes):
        ''' [Batch, Time, Frequency]
        '''
        with torch.no_grad():
            mask_for_loss = torch.ones_like(targets)
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        masks = masks * mask_for_loss
        targets = targets * mask_for_loss

        if weight is None:
            alpha = 1
        else:  # for aec ST
            alpha = weight - targets

        if loss_type == 'mse':
            loss = 0.5 * torch.sum(alpha * torch.pow(targets - masks, 2))
        elif loss_type == 'mae':
            loss = torch.sum(alpha * torch.abs(targets - masks))
        else:  # mse(mask), mae(mask) approx 1:2
            loss = 0.5 * torch.sum(alpha * torch.pow(targets - masks, 2)
                                   + 0.1 * alpha * torch.abs(targets - masks))
        loss /= torch.sum(nframes)
        return loss

    def spectrum_loss(targets, spec, nframes):
        ''' [Batch, Time, Frequency, 2]
        '''
        with torch.no_grad():
            mask_for_loss = torch.ones_like(targets[..., 0])
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        xr = spec[..., 0] * mask_for_loss
        xi = spec[..., 1] * mask_for_loss
        yr = targets[..., 0] * mask_for_loss
        yi = targets[..., 1] * mask_for_loss
        xmag = torch.sqrt(spec[..., 0]**2 + spec[..., 1]**2) * mask_for_loss
        ymag = torch.sqrt(targets[..., 0]**2
                          + targets[..., 1]**2) * mask_for_loss

        loss1 = torch.sum(torch.pow(xr - yr, 2) + torch.pow(xi - yi, 2))
        loss2 = torch.sum(torch.pow(xmag - ymag, 2))

        loss = (loss1 + loss2) / torch.sum(nframes)
        return loss

    def sa_loss_dlen(mixed, clean, masks, nframes):
        yspec = stft(mixed).permute([0, 2, 1, 3]) / 32768
        xspec = stft(clean).permute([0, 2, 1, 3]) / 32768
        with torch.no_grad():
            mask_for_loss = torch.ones_like(xspec[..., 0])
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        emag = ((yspec[..., 0]**2 + yspec[..., 1]**2)**0.15) * (masks**0.3)
        xmag = (xspec[..., 0]**2 + xspec[..., 1]**2)**0.15
        emag = emag * mask_for_loss
        xmag = xmag * mask_for_loss

        loss = torch.sum(torch.pow(emag - xmag, 2)) / torch.sum(nframes)
        return loss

    def psm_vad_loss_dlen(mixed, clean, masks, nframes, subtask=None):
        mixed_spec = stft(mixed)
        clean_spec = stft(clean)
        targets = compute_mask(mixed_spec, clean_spec, mask_type)
        # [B, T, F]
        targets = targets.permute(0, 2, 1)

        loss = mask_loss(targets, masks, nframes)

        if subtask is not None:
            vadtargets = energy_vad(clean_spec)
            with torch.no_grad():
                mask_for_loss = torch.ones_like(targets[:, :, 0])
                for idx, num in enumerate(nframes):
                    mask_for_loss[idx, num:] = 0
            subtask = subtask[:, :, 0] * mask_for_loss
            vadtargets = vadtargets * mask_for_loss

            loss_vad = F.binary_cross_entropy(subtask, vadtargets)
            return loss + loss_vad
        return loss

    def modulation_loss(mixed, clean, masks, nframes, subtask=None):
        mixed_spec = stft(mixed, True)
        clean_spec = stft(clean, True)
        enhanced_mag = torch.abs(mixed_spec)
        clean_mag = torch.abs(clean_spec)
        with torch.no_grad():
            mask_for_loss = torch.ones_like(clean_mag)
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, :, num:] = 0
        clean_mag = clean_mag * mask_for_loss
        enhanced_mag = enhanced_mag * mask_for_loss * masks.permute([0, 2, 1])

        # Convert to log-mel representation
        # (B,T,#mel_channels)
        clean_log_mel = torch.log(
            torch.transpose(stft2mel(clean_mag**2), 2, 1) + 1e-8)
        enhanced_log_mel = torch.log(
            torch.transpose(stft2mel(enhanced_mag**2), 2, 1) + 1e-8)

        alpha = compute_mask(mixed_spec, clean_spec, mask_type)
        alpha = alpha.permute(0, 2, 1)
        loss = 0.05 * modulation_loss_module(enhanced_log_mel, clean_log_mel,
                                             alpha)
        loss2 = psm_vad_loss_dlen(mixed, clean, masks, nframes, subtask)
        # print(loss.item(), loss2.item()) #approx 1:4
        loss = loss + loss2
        return loss

    def wav2vec_loss(mixed, clean, masks, nframes, subtask=None):
        mixed /= 32768
        clean /= 32768
        mixed_spec = stft(mixed)
        with torch.no_grad():
            mask_for_loss = torch.ones_like(masks)
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        masks_est = masks * mask_for_loss

        estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3)
        est_clean = istft(estimate, clean.shape[1])
        loss = wav2vec_loss_module(est_clean, clean)
        return loss

    def sisdr_loss_dlen(mixed,
                        clean,
                        masks,
                        nframes,
                        subtask=None,
                        zero_mean=True):
        mixed_spec = stft(mixed)
        with torch.no_grad():
            mask_for_loss = torch.ones_like(masks)
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        masks_est = masks * mask_for_loss

        estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3)
        est_clean = istft(estimate, clean.shape[1])
        flen = min(clean.shape[1], est_clean.shape[1])
        clean = clean[:, :flen]
        est_clean = est_clean[:, :flen]

        # follow asteroid/losses/sdr.py
        if zero_mean:
            clean = clean - torch.mean(clean, dim=1, keepdim=True)
            est_clean = est_clean - torch.mean(est_clean, dim=1, keepdim=True)

        dot = torch.sum(est_clean * clean, dim=1, keepdim=True)
        s_clean_energy = torch.sum(clean**2, dim=1, keepdim=True) + EPS
        scaled_clean = dot * clean / s_clean_energy
        e_noise = est_clean - scaled_clean

        # [batch]
        sisdr = torch.sum(
            scaled_clean**2, dim=1) / (
                torch.sum(e_noise**2, dim=1) + EPS)
        sisdr = -10 * torch.log10(sisdr + EPS)
        loss = sisdr.mean()
        return loss

    def sisdr_freq_loss_dlen(mixed, clean, masks, nframes, subtask=None):
        mixed_spec = stft(mixed)
        clean_spec = stft(clean)
        with torch.no_grad():
            mask_for_loss = torch.ones_like(masks)
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        masks_est = masks * mask_for_loss

        estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3)

        dot_real = estimate[..., 0] * clean_spec[..., 0] + \
            estimate[..., 1] * clean_spec[..., 1]
        dot_imag = estimate[..., 0] * clean_spec[..., 1] - \
            estimate[..., 1] * clean_spec[..., 0]
        dot = torch.cat([dot_real.unsqueeze(3), dot_imag.unsqueeze(3)], dim=-1)
        s_clean_energy = clean_spec[..., 0] ** 2 + \
            clean_spec[..., 1] ** 2 + EPS
        scaled_clean = dot * clean_spec / s_clean_energy.unsqueeze(3)
        e_noise = estimate - scaled_clean

        # [batch]
        scaled_clean_energy = torch.sum(
            scaled_clean[..., 0]**2 + scaled_clean[..., 1]**2, dim=1)
        e_noise_energy = torch.sum(
            e_noise[..., 0]**2 + e_noise[..., 1]**2, dim=1)
        sisdr = torch.sum(
            scaled_clean_energy, dim=1) / (
                torch.sum(e_noise_energy, dim=1) + EPS)
        sisdr = -10 * torch.log10(sisdr + EPS)
        loss = sisdr.mean()
        return loss

    def crm_loss_dlen(mixed, clean, masks, nframes, subtask=None):
        mixed_spec = stft(mixed).permute([0, 2, 1, 3])
        clean_spec = stft(clean).permute([0, 2, 1, 3])
        mixed_spec = mixed_spec / 32768
        clean_spec = clean_spec / 32768
        tgt_mr, tgt_mi = compute_mask(mixed_spec, clean_spec, mask_type='crm')

        D = int(masks.shape[2] / 2)
        with torch.no_grad():
            mask_for_loss = torch.ones_like(clean_spec[..., 0])
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        mr = masks[..., :D] * mask_for_loss
        mi = masks[..., D:] * mask_for_loss
        tgt_mr = tgt_mr * mask_for_loss
        tgt_mi = tgt_mi * mask_for_loss

        if weight is None:
            alpha = 1
        else:
            alpha = weight - tgt_mr
        # signal approximation
        yr = mixed_spec[..., 0]
        yi = mixed_spec[..., 1]
        loss1 = torch.sum(alpha * torch.pow((mr * yr - mi * yi) - clean_spec[..., 0], 2)) \
            + torch.sum(alpha * torch.pow((mr * yi + mi * yr) - clean_spec[..., 1], 2))
        # mask approximation
        loss2 = torch.sum(alpha * torch.pow(mr - tgt_mr, 2)) \
            + torch.sum(alpha * torch.pow(mi - tgt_mi, 2))
        loss = 0.5 * (loss1 + loss2) / torch.sum(nframes)
        return loss

    def crm_miso_loss_dlen(mixed, clean, masks, nframes):
        return crm_loss_dlen(mixed[..., 0], clean[..., 0], masks, nframes)

    def mimo_loss_dlen(mixed, clean, masks, nframes):
        chs = mixed.shape[-1]
        D = masks.shape[2] // chs
        loss = psm_vad_loss_dlen(mixed[..., 0], clean[..., 0], masks[..., :D],
                                 nframes)
        for ch in range(1, chs):
            loss1 = psm_vad_loss_dlen(mixed[..., ch], clean[..., ch],
                                      masks[..., ch * D:ch * D + D], nframes)
            loss = loss + loss1
        return loss / chs

    def spec_loss_dlen(mixed, clean, spec, nframes):
        clean_spec = stft(clean).permute([0, 2, 1, 3])
        clean_spec = clean_spec / 32768

        D = spec.shape[2] // 2
        spec_est = torch.cat([spec[..., :D, None], spec[..., D:, None]],
                             dim=-1)
        loss = spectrum_loss(clean_spec, spec_est, nframes)
        return loss

    if loss_func == 'psm_vad_loss_dlen':
        return psm_vad_loss_dlen
    elif loss_func == 'sisdr_loss_dlen':
        return sisdr_loss_dlen
    elif loss_func == 'sisdr_freq_loss_dlen':
        return sisdr_freq_loss_dlen
    elif loss_func == 'crm_loss_dlen':
        return crm_loss_dlen
    elif loss_func == 'modulation_loss':
        return modulation_loss
    elif loss_func == 'wav2vec_loss':
        return wav2vec_loss
    elif loss_func == 'mimo_loss_dlen':
        return mimo_loss_dlen
    elif loss_func == 'spec_loss_dlen':
        return spec_loss_dlen
    elif loss_func == 'sa_loss_dlen':
        return sa_loss_dlen
    else:
        print('error loss func')
        return None
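
For illustration, a minimal sketch of obtaining a loss callable from `mask_loss_function` and invoking it on dummy tensors. The shapes follow the STFT settings above: with n_fft=640, hop_length=320 and center=False, a 16000-sample utterance gives 49 frames and 321 frequency bins; a torch version that still accepts `return_complex=False` is assumed, and the import path is an assumption:

```python
import torch

from modelscope.models.audio.network.loss import mask_loss_function  # assumed path

loss_fn = mask_loss_function(loss_func='psm_vad_loss_dlen',
                             n_fft=640, hop_length=320)

batch, samples = 2, 16000
frames = (samples - 640) // 320 + 1           # 49 frames
mixed = torch.randn(batch, samples) * 1000    # noisy waveform (int16 scale)
clean = torch.randn(batch, samples) * 1000    # clean reference
masks = torch.rand(batch, frames, 321)        # network output, [B, T, F]
nframes = torch.tensor([frames, frames])      # valid frames per utterance

loss = loss_fn(mixed, clean, masks, nframes)  # scalar training loss
```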
248 modelscope/models/audio/network/modulation_loss.py Normal file
@@ -0,0 +1,248 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchaudio.transforms import MelScale


class ModulationDomainLossModule(torch.nn.Module):
    """Modulation-domain loss function developed in [1] for supervised speech enhancement

    In our paper, we used the gabor-based STRF kernels as the modulation kernels and used the log-mel spectrogram
    as the input spectrogram representation.
    Specific parameter details are in the paper and in the example below

    Parameters
    ----------
    modulation_kernels: nn.Module
        Differentiable module that transforms a spectrogram representation to the modulation domain

        modulation_domain = modulation_kernels(input_tf_representation)
        Input Spectrogram representation (B, T, F) ---> |(M) modulation_kernels|---> Modulation Domain(B, M, T', F')

    norm: boolean
        Normalizes the modulation domain representation to be 0 mean across time

    [1] T. Vuong, Y. Xia, and R. M. Stern, "A modulation-domain loss for neural-network-based real-time
    speech enhancement"
    Accepted ICASSP 2021, https://arxiv.org/abs/2102.07330

    """

    def __init__(self, modulation_kernels, norm=True):
        super(ModulationDomainLossModule, self).__init__()

        self.modulation_kernels = modulation_kernels
        self.mse = nn.MSELoss(reduce=False)
        self.norm = norm

    def forward(self, enhanced_spect, clean_spect, weight=None):
        """Calculate modulation-domain loss
        Args:
            enhanced_spect (Tensor): spectrogram representation of enhanced signal (B, #frames, #freq_channels).
            clean_spect (Tensor): spectrogram representation of clean ground-truth signal (B, #frames, #freq_channels).
        Returns:
            Tensor: Modulation-domain loss value.
        """

        clean_mod = self.modulation_kernels(clean_spect)
        enhanced_mod = self.modulation_kernels(enhanced_spect)

        if self.norm:
            mean_clean_mod = torch.mean(clean_mod, dim=2)
            mean_enhanced_mod = torch.mean(enhanced_mod, dim=2)

            clean_mod = clean_mod - mean_clean_mod.unsqueeze(2)
            enhanced_mod = enhanced_mod - mean_enhanced_mod.unsqueeze(2)

        if weight is None:
            alpha = 1
        else:  # TF-mask weight
            alpha = 1 + torch.sum(weight, dim=-1, keepdim=True).unsqueeze(1)
        mod_mse_loss = self.mse(enhanced_mod, clean_mod) * alpha
        mod_mse_loss = torch.mean(
            torch.sum(mod_mse_loss, dim=(1, 2, 3))
            / torch.sum(clean_mod**2, dim=(1, 2, 3)))

        return mod_mse_loss


class ModulationDomainNCCLossModule(torch.nn.Module):
    """Modulation-domain loss function developed in [1] for supervised speech enhancement

    # Speech Intelligibility Prediction Using Spectro-Temporal Modulation Analysis - based off of this

    In our paper, we used the gabor-based STRF kernels as the modulation kernels and used the log-mel spectrogram
    as the input spectrogram representation.
    Specific parameter details are in the paper and in the example below

    Parameters
    ----------
    modulation_kernels: nn.Module
        Differentiable module that transforms a spectrogram representation to the modulation domain

        modulation_domain = modulation_kernels(input_tf_representation)
        Input Spectrogram representation(B, T, F) --- (M) modulation_kernels---> Modulation Domain(B, M, T', F')

    [1]

    """

    def __init__(self, modulation_kernels):
        super(ModulationDomainNCCLossModule, self).__init__()

        self.modulation_kernels = modulation_kernels
        self.mse = nn.MSELoss(reduce=False)

    def forward(self, enhanced_spect, clean_spect):
        """Calculate modulation-domain loss
        Args:
            enhanced_spect (Tensor): spectrogram representation of enhanced signal (B, #frames, #freq_channels).
            clean_spect (Tensor): spectrogram representation of clean ground-truth signal (B, #frames, #freq_channels).
        Returns:
            Tensor: Modulation-domain loss value.
        """

        clean_mod = self.modulation_kernels(clean_spect)
        enhanced_mod = self.modulation_kernels(enhanced_spect)
        mean_clean_mod = torch.mean(clean_mod, dim=2)
        mean_enhanced_mod = torch.mean(enhanced_mod, dim=2)

        normalized_clean = clean_mod - mean_clean_mod.unsqueeze(2)
        normalized_enhanced = enhanced_mod - mean_enhanced_mod.unsqueeze(2)

        inner_product = torch.sum(
            normalized_clean * normalized_enhanced, dim=2)
        normalized_denom = (torch.sum(
            normalized_clean * normalized_clean, dim=2))**.5 * (torch.sum(
                normalized_enhanced * normalized_enhanced, dim=2))**.5

        ncc = inner_product / normalized_denom
        mod_mse_loss = torch.mean((ncc - 1.0)**2)

        return mod_mse_loss


class GaborSTRFConv(nn.Module):
    """Gabor-STRF-based cross-correlation kernel."""

    def __init__(self,
                 supn,
                 supk,
                 nkern,
                 rates=None,
                 scales=None,
                 norm_strf=True,
                 real_only=False):
        """Instantiate a Gabor-based STRF convolution layer.
        Parameters
        ----------
        supn: int
            Time support in number of frames. Also the window length.
        supk: int
            Frequency support in number of channels. Also the window length.
        nkern: int
            Number of kernels, each with a learnable rate and scale.
        rates: list of float, None
            Initial values for temporal modulation.
        scales: list of float, None
            Initial values for spectral modulation.
        norm_strf: Boolean
            Normalize STRF kernels to be unit length
        real_only: Boolean
            If True, nkern REAL gabor-STRF kernels
            If False, nkern//2 REAL and nkern//2 IMAGINARY gabor-STRF kernels
        """
        super(GaborSTRFConv, self).__init__()
        self.numN = supn
        self.numK = supk
        self.numKern = nkern
        self.real_only = real_only
        self.norm_strf = norm_strf

        if not real_only:
            nkern = nkern // 2

        if supk % 2 == 0:  # force odd number
            supk += 1
        self.supk = torch.arange(supk, dtype=torch.float32)
        if supn % 2 == 0:  # force odd number
            supn += 1
        self.supn = torch.arange(supn, dtype=self.supk.dtype)
        self.padding = (supn // 2, supk // 2)
        # Set up learnable parameters
        # for param in (rates, scales):
        #     assert (not param) or len(param) == nkern
        if not rates:
            rates = torch.rand(nkern) * math.pi / 2.0

        if not scales:
            scales = (torch.rand(nkern) * 2.0 - 1.0) * math.pi / 2.0

        self.rates_ = nn.Parameter(torch.Tensor(rates))
        self.scales_ = nn.Parameter(torch.Tensor(scales))

    def strfs(self):
        """Make STRFs using the current parameters."""

        if self.supn.device != self.rates_.device:  # for first run
            self.supn = self.supn.to(self.rates_.device)
            self.supk = self.supk.to(self.rates_.device)
        n0, k0 = self.padding

        nwind = .5 - .5 * \
            torch.cos(2 * math.pi * (self.supn + 1) / (len(self.supn) + 1))
        kwind = .5 - .5 * \
            torch.cos(2 * math.pi * (self.supk + 1) / (len(self.supk) + 1))

        new_wind = torch.matmul((nwind).unsqueeze(-1), (kwind).unsqueeze(0))

        n_n_0 = self.supn - n0
        k_k_0 = self.supk - k0
        n_mult = torch.matmul(
            n_n_0.unsqueeze(1),
            torch.ones((1, len(self.supk))).type(torch.FloatTensor).to(
                self.rates_.device))
        k_mult = torch.matmul(
            torch.ones((len(self.supn),
                        1)).type(torch.FloatTensor).to(self.rates_.device),
            k_k_0.unsqueeze(0))

        inside = self.rates_.unsqueeze(1).unsqueeze(
            1) * n_mult + self.scales_.unsqueeze(1).unsqueeze(1) * k_mult
        real_strf = torch.cos(inside) * new_wind.unsqueeze(0)

        if self.real_only:
            final_strf = real_strf

        else:
            imag_strf = torch.sin(inside) * new_wind.unsqueeze(0)
            final_strf = torch.cat([real_strf, imag_strf], dim=0)

        if self.norm_strf:
            final_strf = final_strf / (torch.sum(
                final_strf**2, dim=(1, 2)).unsqueeze(1).unsqueeze(2))**.5

        return final_strf

    def forward(self, sigspec):
        """Forward pass a batch of (real) spectra [Batch x Time x Frequency]."""
        if len(sigspec.shape) == 2:  # expand batch dimension if single eg
            sigspec = sigspec.unsqueeze(0)
        strfs = self.strfs().unsqueeze(1).type_as(sigspec)
        out = F.conv2d(sigspec.unsqueeze(1), strfs, padding=self.padding)
        return out

    def __repr__(self):
        """Gabor filter"""
        report = """
+++++ Gabor Filter Kernels [{}], supn[{}], supk[{}] real only [{}] norm strf [{}] +++++

""".format(self.numKern, self.numN, self.numK, self.real_only,
           self.norm_strf)

        return report
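
For illustration, a minimal sketch of applying the Gabor-STRF kernels to a batch of log-mel spectrograms; the constructor parameters match those used by `modulation_loss_init` in loss.py, and the import path is an assumption:

```python
import torch

from modelscope.models.audio.network.modulation_loss import GaborSTRFConv  # assumed path

kernels = GaborSTRFConv(supn=30, supk=30, nkern=60)
spect = torch.randn(2, 100, 80)   # (B, #frames, #mel_channels)
mod = kernels(spect)              # same-size convolution per kernel
print(mod.shape)                  # torch.Size([2, 60, 100, 80])
```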
483 modelscope/models/audio/network/se_net.py Normal file
@@ -0,0 +1,483 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from ..layers.activations import RectifiedLinear, Sigmoid
from ..layers.affine_transform import AffineTransform
from ..layers.deep_fsmn import DeepFsmn
from ..layers.uni_deep_fsmn import Conv2d, UniDeepFsmn


class MaskNet(nn.Module):

    def __init__(self,
                 indim,
                 outdim,
                 layers=9,
                 hidden_dim=128,
                 hidden_dim2=None,
                 lorder=20,
                 rorder=0,
                 dilation=1,
                 layer_norm=False,
                 dropout=0,
                 crm=False,
                 vad=False,
                 linearout=False):
        super(MaskNet, self).__init__()

        self.linear1 = AffineTransform(indim, hidden_dim)
        self.relu = RectifiedLinear(hidden_dim, hidden_dim)
        if hidden_dim2 is None:
            hidden_dim2 = hidden_dim

        if rorder == 0:
            repeats = [
                UniDeepFsmn(
                    hidden_dim,
                    hidden_dim,
                    lorder,
                    hidden_dim2,
                    dilation=dilation,
                    layer_norm=layer_norm,
                    dropout=dropout) for i in range(layers)
            ]
        else:
            repeats = [
                DeepFsmn(
                    hidden_dim,
                    hidden_dim,
                    lorder,
                    rorder,
                    hidden_dim2,
                    layer_norm=layer_norm,
                    dropout=dropout) for i in range(layers)
            ]
        self.deepfsmn = nn.Sequential(*repeats)

        self.linear2 = AffineTransform(hidden_dim, outdim)

        self.crm = crm
        if self.crm:
            self.sig = nn.Tanh()
        else:
            self.sig = Sigmoid(outdim, outdim)

        self.vad = vad
        if self.vad:
            self.linear3 = AffineTransform(hidden_dim, 1)

        self.layers = layers
        self.linearout = linearout
        if self.linearout and self.vad:
            print('Warning: not supported nnet')

    def forward(self, feat, ctl=None):
        x1 = self.linear1(feat)
        x2 = self.relu(x1)
        if ctl is not None:
            ctl = min(ctl, self.layers - 1)
            for i in range(ctl):
                x2 = self.deepfsmn[i](x2)
            mask = self.sig(self.linear2(x2))
            if self.vad:
                vad = torch.sigmoid(self.linear3(x2))
                return mask, vad
            else:
                return mask
        x3 = self.deepfsmn(x2)
        if self.linearout:
            return self.linear2(x3)
        mask = self.sig(self.linear2(x3))
        if self.vad:
            vad = torch.sigmoid(self.linear3(x3))
            return mask, vad
        else:
            return mask

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<Nnet>\n'
        re_str += self.linear1.to_kaldi_nnet()
        re_str += self.relu.to_kaldi_nnet()
        for dfsmn in self.deepfsmn:
            re_str += dfsmn.to_kaldi_nnet()
        re_str += self.linear2.to_kaldi_nnet()
        re_str += self.sig.to_kaldi_nnet()
        re_str += '</Nnet>\n'

        return re_str

    def to_raw_nnet(self, fid):
        self.linear1.to_raw_nnet(fid)
        for dfsmn in self.deepfsmn:
            dfsmn.to_raw_nnet(fid)
        self.linear2.to_raw_nnet(fid)
|
||||
|
||||
class StageNet(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
indim,
|
||||
outdim,
|
||||
layers=9,
|
||||
layers2=6,
|
||||
hidden_dim=128,
|
||||
lorder=20,
|
||||
rorder=0,
|
||||
layer_norm=False,
|
||||
dropout=0,
|
||||
crm=False,
|
||||
vad=False,
|
||||
linearout=False):
|
||||
super(StageNet, self).__init__()
|
||||
|
||||
self.stage1 = nn.ModuleList()
|
||||
self.stage2 = nn.ModuleList()
|
||||
layer = nn.Sequential(nn.Linear(indim, hidden_dim), nn.ReLU())
|
||||
self.stage1.append(layer)
|
||||
for i in range(layers):
|
||||
layer = UniDeepFsmn(
|
||||
hidden_dim,
|
||||
hidden_dim,
|
||||
lorder,
|
||||
hidden_dim,
|
||||
layer_norm=layer_norm,
|
||||
dropout=dropout)
|
||||
self.stage1.append(layer)
|
||||
layer = nn.Sequential(nn.Linear(hidden_dim, 321), nn.Sigmoid())
|
||||
self.stage1.append(layer)
|
||||
# stage2
|
||||
layer = nn.Sequential(nn.Linear(321 + indim, hidden_dim), nn.ReLU())
|
||||
self.stage2.append(layer)
|
||||
for i in range(layers2):
|
||||
layer = UniDeepFsmn(
|
||||
hidden_dim,
|
||||
hidden_dim,
|
||||
lorder,
|
||||
hidden_dim,
|
||||
layer_norm=layer_norm,
|
||||
dropout=dropout)
|
||||
self.stage2.append(layer)
|
||||
layer = nn.Sequential(
|
||||
nn.Linear(hidden_dim, outdim),
|
||||
nn.Sigmoid() if not crm else nn.Tanh())
|
||||
self.stage2.append(layer)
|
||||
self.crm = crm
|
||||
self.vad = vad
|
||||
self.linearout = linearout
|
||||
self.window = torch.hamming_window(640, periodic=False).cuda()
|
||||
self.freezed = False
|
||||
|
||||
def freeze(self):
|
||||
if not self.freezed:
|
||||
for param in self.stage1.parameters():
|
||||
param.requires_grad = False
|
||||
self.freezed = True
|
||||
print('freezed stage1')
|
||||
|
||||
def forward(self, feat, mixture, ctl=None):
|
||||
if ctl == 'off':
|
||||
x = feat
|
||||
for i in range(len(self.stage1)):
|
||||
x = self.stage1[i](x)
|
||||
return x
|
||||
else:
|
||||
self.freeze()
|
||||
x = feat
|
||||
for i in range(len(self.stage1)):
|
||||
x = self.stage1[i](x)
|
||||
|
||||
spec = torch.stft(
|
||||
mixture / 32768,
|
||||
640,
|
||||
320,
|
||||
640,
|
||||
self.window,
|
||||
center=False,
|
||||
return_complex=True)
|
||||
spec = torch.view_as_real(spec).permute([0, 2, 1, 3])
|
||||
specmag = torch.sqrt(spec[..., 0]**2 + spec[..., 1]**2)
|
||||
est = x * specmag
|
||||
y = torch.cat([est, feat], dim=-1)
|
||||
for i in range(len(self.stage2)):
|
||||
y = self.stage2[i](y)
|
||||
return y
|
||||
|
||||
|
||||
class Unet(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
indim,
|
||||
outdim,
|
||||
layers=9,
|
||||
dims=[256] * 4,
|
||||
lorder=20,
|
||||
rorder=0,
|
||||
dilation=1,
|
||||
layer_norm=False,
|
||||
dropout=0,
|
||||
crm=False,
|
||||
vad=False,
|
||||
linearout=False):
|
||||
super(Unet, self).__init__()
|
||||
|
||||
self.linear1 = AffineTransform(indim, dims[0])
|
||||
self.relu = RectifiedLinear(dims[0], dims[0])
|
||||
|
||||
self.encoder = nn.ModuleList()
|
||||
self.decoder = nn.ModuleList()
|
||||
for i in range(len(dims) - 1):
|
||||
layer = nn.Sequential(
|
||||
nn.Linear(dims[i], dims[i + 1]), nn.ReLU(),
|
||||
nn.Linear(dims[i + 1], dims[i + 1], bias=False),
|
||||
Conv2d(
|
||||
dims[i + 1],
|
||||
dims[i + 1],
|
||||
lorder,
|
||||
groups=dims[i + 1],
|
||||
skip_connect=True))
|
||||
self.encoder.append(layer)
|
||||
for i in range(len(dims) - 1, 0, -1):
|
||||
layer = nn.Sequential(
|
||||
nn.Linear(dims[i] * 2, dims[i - 1]), nn.ReLU(),
|
||||
nn.Linear(dims[i - 1], dims[i - 1], bias=False),
|
||||
Conv2d(
|
||||
dims[i - 1],
|
||||
dims[i - 1],
|
||||
lorder,
|
||||
groups=dims[i - 1],
|
||||
skip_connect=True))
|
||||
self.decoder.append(layer)
|
||||
self.tf = nn.ModuleList()
|
||||
for i in range(layers - 2 * (len(dims) - 1)):
|
||||
layer = nn.Sequential(
|
||||
nn.Linear(dims[-1], dims[-1]), nn.ReLU(),
|
||||
nn.Linear(dims[-1], dims[-1], bias=False),
|
||||
Conv2d(
|
||||
dims[-1],
|
||||
dims[-1],
|
||||
lorder,
|
||||
groups=dims[-1],
|
||||
skip_connect=True))
|
||||
self.tf.append(layer)
|
||||
|
||||
self.linear2 = AffineTransform(dims[0], outdim)
|
||||
self.crm = crm
|
||||
self.act = nn.Tanh() if self.crm else nn.Sigmoid()
|
||||
self.vad = False
|
||||
self.layers = layers
|
||||
self.linearout = linearout
|
||||
|
||||
def forward(self, x, ctl=None):
|
||||
x = self.linear1(x)
|
||||
x = self.relu(x)
|
||||
|
||||
encoder_out = []
|
||||
for i in range(len(self.encoder)):
|
||||
x = self.encoder[i](x)
|
||||
encoder_out.append(x)
|
||||
for i in range(len(self.tf)):
|
||||
x = self.tf[i](x)
|
||||
for i in range(len(self.decoder)):
|
||||
x = torch.cat([x, encoder_out[-1 - i]], dim=-1)
|
||||
x = self.decoder[i](x)
|
||||
|
||||
x = self.linear2(x)
|
||||
if self.linearout:
|
||||
return x
|
||||
return self.act(x)
|
||||
|
||||
|
||||
class BranchNet(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
indim,
|
||||
outdim,
|
||||
layers=9,
|
||||
hidden_dim=256,
|
||||
lorder=20,
|
||||
rorder=0,
|
||||
dilation=1,
|
||||
layer_norm=False,
|
||||
dropout=0,
|
||||
crm=False,
|
||||
vad=False,
|
||||
linearout=False):
|
||||
super(BranchNet, self).__init__()
|
||||
|
||||
self.linear1 = AffineTransform(indim, hidden_dim)
|
||||
self.relu = RectifiedLinear(hidden_dim, hidden_dim)
|
||||
|
||||
self.convs = nn.ModuleList()
|
||||
self.deepfsmn = nn.ModuleList()
|
||||
self.FREQ = nn.ModuleList()
|
||||
self.TIME = nn.ModuleList()
|
||||
self.br1 = nn.ModuleList()
|
||||
self.br2 = nn.ModuleList()
|
||||
for i in range(layers):
|
||||
'''
|
||||
layer = nn.Sequential(
|
||||
nn.Linear(hidden_dim, hidden_dim),
|
||||
nn.ReLU(),
|
||||
nn.Linear(hidden_dim, hidden_dim, bias=False),
|
||||
Conv2d(hidden_dim, hidden_dim, lorder,
|
||||
groups=hidden_dim, skip_connect=True)
|
||||
)
|
||||
self.deepfsmn.append(layer)
|
||||
'''
|
||||
layer = nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.ReLU())
|
||||
self.FREQ.append(layer)
|
||||
'''
|
||||
layer = nn.GRU(hidden_dim, hidden_dim,
|
||||
batch_first=True,
|
||||
bidirectional=False)
|
||||
self.TIME.append(layer)
|
||||
|
||||
layer = nn.Sequential(
|
||||
nn.Linear(hidden_dim, hidden_dim//2, bias=False),
|
||||
Conv2d(hidden_dim//2, hidden_dim//2, lorder,
|
||||
groups=hidden_dim//2, skip_connect=True)
|
||||
)
|
||||
self.br1.append(layer)
|
||||
layer = nn.GRU(hidden_dim, hidden_dim//2,
|
||||
batch_first=True,
|
||||
bidirectional=False)
|
||||
self.br2.append(layer)
|
||||
'''
|
||||
|
||||
self.linear2 = AffineTransform(hidden_dim, outdim)
|
||||
self.crm = crm
|
||||
self.act = nn.Tanh() if self.crm else nn.Sigmoid()
|
||||
self.vad = False
|
||||
self.layers = layers
|
||||
self.linearout = linearout
|
||||
|
||||
def forward(self, x, ctl=None):
|
||||
return self.forward_branch(x)
|
||||
|
||||
def forward_sepconv(self, x):
|
||||
x = torch.unsqueeze(x, 1)
|
||||
for i in range(len(self.convs)):
|
||||
x = self.convs[i](x)
|
||||
x = F.relu(x)
|
||||
B, C, H, W = x.shape
|
||||
x = x.permute(0, 2, 1, 3)
|
||||
x = torch.reshape(x, [B, H, C * W])
|
||||
x = self.linear1(x)
|
||||
x = self.relu(x)
|
||||
for i in range(self.layers):
|
||||
x = self.deepfsmn[i](x) + x
|
||||
x = self.linear2(x)
|
||||
return self.act(x)
|
||||
|
||||
def forward_branch(self, x):
|
||||
x = self.linear1(x)
|
||||
x = self.relu(x)
|
||||
for i in range(self.layers):
|
||||
z = self.FREQ[i](x)
|
||||
x = z + x
|
||||
x = self.linear2(x)
|
||||
if self.linearout:
|
||||
return x
|
||||
return self.act(x)
|
||||
|
||||
|
||||
class TACNet(nn.Module):
|
||||
''' transform average concatenate for ad hoc dr
|
||||
'''
|
||||
|
||||
def __init__(self,
|
||||
indim,
|
||||
outdim,
|
||||
layers=9,
|
||||
hidden_dim=128,
|
||||
lorder=20,
|
||||
rorder=0,
|
||||
crm=False,
|
||||
vad=False,
|
||||
linearout=False):
|
||||
super(TACNet, self).__init__()
|
||||
|
||||
self.linear1 = AffineTransform(indim, hidden_dim)
|
||||
self.relu = RectifiedLinear(hidden_dim, hidden_dim)
|
||||
|
||||
if rorder == 0:
|
||||
repeats = [
|
||||
UniDeepFsmn(hidden_dim, hidden_dim, lorder, hidden_dim)
|
||||
for i in range(layers)
|
||||
]
|
||||
else:
|
||||
repeats = [
|
||||
DeepFsmn(hidden_dim, hidden_dim, lorder, rorder, hidden_dim)
|
||||
for i in range(layers)
|
||||
]
|
||||
self.deepfsmn = nn.Sequential(*repeats)
|
||||
|
||||
self.ch_transform = nn.ModuleList([])
|
||||
self.ch_average = nn.ModuleList([])
|
||||
self.ch_concat = nn.ModuleList([])
|
||||
for i in range(layers):
|
||||
self.ch_transform.append(
|
||||
nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.PReLU()))
|
||||
self.ch_average.append(
|
||||
nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.PReLU()))
|
||||
self.ch_concat.append(
|
||||
nn.Sequential(
|
||||
nn.Linear(hidden_dim * 2, hidden_dim), nn.PReLU()))
|
||||
|
||||
self.linear2 = AffineTransform(hidden_dim, outdim)
|
||||
|
||||
self.crm = crm
|
||||
if self.crm:
|
||||
self.sig = nn.Tanh()
|
||||
else:
|
||||
self.sig = Sigmoid(outdim, outdim)
|
||||
|
||||
self.vad = vad
|
||||
if self.vad:
|
||||
self.linear3 = AffineTransform(hidden_dim, 1)
|
||||
|
||||
self.layers = layers
|
||||
self.linearout = linearout
|
||||
if self.linearout and self.vad:
|
||||
print('Warning: not supported nnet')
|
||||
|
||||
def forward(self, feat, ctl=None):
|
||||
B, T, F = feat.shape
|
||||
# assume 4ch
|
||||
ch = 4
|
||||
zlist = []
|
||||
for c in range(ch):
|
||||
z = self.linear1(feat[..., c * (F // 4):(c + 1) * (F // 4)])
|
||||
z = self.relu(z)
|
||||
zlist.append(z)
|
||||
for i in range(self.layers):
|
||||
# forward
|
||||
for c in range(ch):
|
||||
zlist[c] = self.deepfsmn[i](zlist[c])
|
||||
|
||||
# transform
|
||||
olist = []
|
||||
for c in range(ch):
|
||||
z = self.ch_transform[i](zlist[c])
|
||||
olist.append(z)
|
||||
# average
|
||||
avg = 0
|
||||
for c in range(ch):
|
||||
avg = avg + olist[c]
|
||||
avg = avg / ch
|
||||
avg = self.ch_average[i](avg)
|
||||
# concate
|
||||
for c in range(ch):
|
||||
tac = torch.cat([olist[c], avg], dim=-1)
|
||||
tac = self.ch_concat[i](tac)
|
||||
zlist[c] = zlist[c] + tac
|
||||
|
||||
for c in range(ch):
|
||||
zlist[c] = self.sig(self.linear2(zlist[c]))
|
||||
mask = torch.cat(zlist, dim=-1)
|
||||
return mask
|
||||
|
||||
def to_kaldi_nnet(self):
|
||||
pass
|
||||
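A hedged usage sketch (not part of the commit) of how a MaskNet-style [0, 1] mask is applied to a noisy magnitude spectrogram; shapes and values are illustrative:

# Sketch: point-wise mask-based enhancement.
import torch

noisy_mag = torch.rand(4, 100, 257)   # [batch, time, freq]
mask = torch.rand(4, 100, 257)        # stand-in for MaskNet(feat)
enhanced_mag = mask * noisy_mag       # element-wise masking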
0
modelscope/models/audio/tts/__init__.py
Normal file
1
modelscope/models/audio/tts/am/__init__.py
Normal file
@@ -0,0 +1 @@
from .sambert_hifi_16k import *  # noqa F403
8
modelscope/models/audio/tts/am/models/__init__.py
Executable file
@@ -0,0 +1,8 @@
from .robutrans import RobuTrans


def create_model(name, hparams):
    if name == 'robutrans':
        return RobuTrans(hparams)
    else:
        raise Exception('Unknown model: ' + name)
82
modelscope/models/audio/tts/am/models/compat.py
Executable file
@@ -0,0 +1,82 @@
"""Functions for compatibility with different TensorFlow versions."""

import tensorflow as tf


def is_tf2():
    """Returns ``True`` if running TensorFlow 2.0."""
    return tf.__version__.startswith('2')


def tf_supports(symbol):
    """Returns ``True`` if TensorFlow defines :obj:`symbol`."""
    return _string_to_tf_symbol(symbol) is not None


def tf_any(*symbols):
    """Returns the first supported symbol."""
    for symbol in symbols:
        module = _string_to_tf_symbol(symbol)
        if module is not None:
            return module
    return None


def tf_compat(v2=None, v1=None):  # pylint: disable=invalid-name
    """Returns the compatible symbol based on the current TensorFlow version.

    Args:
      v2: The candidate v2 symbol name.
      v1: The candidate v1 symbol name.

    Returns:
      A TensorFlow symbol.

    Raises:
      ValueError: if no symbol can be found.
    """
    candidates = []
    if v2 is not None:
        candidates.append(v2)
    if v1 is not None:
        candidates.append(v1)
        candidates.append('compat.v1.%s' % v1)
    symbol = tf_any(*candidates)
    if symbol is None:
        raise ValueError('Failure to resolve the TensorFlow symbol')
    return symbol


def name_from_variable_scope(name=''):
    """Creates a name prefixed by the current variable scope."""
    var_scope = tf_compat(v1='get_variable_scope')().name
    compat_name = ''
    if name:
        compat_name = '%s/' % name
    if var_scope:
        compat_name = '%s/%s' % (var_scope, compat_name)
    return compat_name


def reuse():
    """Returns ``True`` if the current variable scope is marked for reuse."""
    return tf_compat(v1='get_variable_scope')().reuse


def _string_to_tf_symbol(symbol):
    modules = symbol.split('.')
    namespace = tf
    for module in modules:
        namespace = getattr(namespace, module, None)
        if namespace is None:
            return None
    return namespace


# pylint: disable=invalid-name
gfile_copy = tf_compat(v2='io.gfile.copy', v1='gfile.Copy')
gfile_exists = tf_compat(v2='io.gfile.exists', v1='gfile.Exists')
gfile_open = tf_compat(v2='io.gfile.GFile', v1='gfile.GFile')
is_tensor = tf_compat(v2='is_tensor', v1='contrib.framework.is_tensor')
logging = tf_compat(v1='logging')
nest = tf_compat(v2='nest', v1='contrib.framework.nest')
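The dotted-name resolution behind `_string_to_tf_symbol` is a plain attribute walk; a self-contained sketch with a stand-in namespace instead of the real `tf` module (illustrative only):

# Sketch: dotted-symbol lookup via getattr, mirroring _string_to_tf_symbol.
import types

ns = types.SimpleNamespace(
    io=types.SimpleNamespace(
        gfile=types.SimpleNamespace(exists=lambda p: True)))

def resolve(namespace, symbol):
    for part in symbol.split('.'):
        namespace = getattr(namespace, part, None)
        if namespace is None:
            return None
    return namespace

assert resolve(ns, 'io.gfile.exists') is not None
assert resolve(ns, 'io.gfile.copy') is None  # missing symbol -> None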
273
modelscope/models/audio/tts/am/models/fsmn.py
Executable file
@@ -0,0 +1,273 @@
import tensorflow as tf


def build_sequence_mask(sequence_length,
                        maximum_length=None,
                        dtype=tf.float32):
    """Builds the dot product mask.

    Args:
      sequence_length: The sequence length.
      maximum_length: Optional size of the returned time dimension. Otherwise
        it is the maximum of :obj:`sequence_length`.
      dtype: The type of the mask tensor.

    Returns:
      A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape
      ``[batch_size, max_length]``.
    """
    mask = tf.sequence_mask(
        sequence_length, maxlen=maximum_length, dtype=dtype)

    return mask


def norm(inputs):
    """Layer normalizes :obj:`inputs`."""
    return tf.contrib.layers.layer_norm(inputs, begin_norm_axis=-1)


def pad_in_time(x, padding_shape):
    """Helper function to pad a tensor in the time dimension and retain the static depth dimension.

    Args:
      x: [Batch, Time, Frequency]
      padding_shape: [left, right] padding sizes of constant value (0) along the time dimension

    Returns:
      The padded x.
    """

    depth = x.get_shape().as_list()[-1]
    x = tf.pad(x, [[0, 0], padding_shape, [0, 0]])
    x.set_shape((None, None, depth))

    return x


def pad_in_time_right(x, padding_length):
    """Helper function to pad a tensor in the time dimension and retain the static depth dimension.

    Args:
      x: [Batch, Time, Frequency]
      padding_length: padding size of constant value (0) after the time dimension

    Returns:
      The padded x.
    """
    depth = x.get_shape().as_list()[-1]
    x = tf.pad(x, [[0, 0], [0, padding_length], [0, 0]])
    x.set_shape((None, None, depth))

    return x


def feed_forward(x, ffn_dim, memory_units, mode, dropout=0.0):
    """Implements the Transformer's "Feed Forward" layer.

    .. math::

        ffn(x) = max(0, x*W_1 + b_1)*W_2

    Args:
      x: The input.
      ffn_dim: The number of units of the nonlinear transformation.
      memory_units: The number of units of the linear transformation.
      mode: A ``tf.estimator.ModeKeys`` mode.
      dropout: The probability to drop units from the inner transformation.

    Returns:
      The transformed input.
    """
    inner = tf.layers.conv1d(x, ffn_dim, 1, activation=tf.nn.relu)
    inner = tf.layers.dropout(
        inner, rate=dropout, training=mode == tf.estimator.ModeKeys.TRAIN)
    outer = tf.layers.conv1d(inner, memory_units, 1, use_bias=False)

    return outer


def drop_and_add(inputs, outputs, mode, dropout=0.0):
    """Drops units in the outputs and adds the previous values.

    Args:
      inputs: The input of the previous layer.
      outputs: The output of the previous layer.
      mode: A ``tf.estimator.ModeKeys`` mode.
      dropout: The probability to drop units in :obj:`outputs`.

    Returns:
      The residual and normalized output.
    """
    outputs = tf.layers.dropout(outputs, rate=dropout, training=mode)

    input_dim = inputs.get_shape().as_list()[-1]
    output_dim = outputs.get_shape().as_list()[-1]

    if input_dim == output_dim:
        outputs += inputs

    return outputs


def MemoryBlock(
    inputs,
    filter_size,
    mode,
    mask=None,
    dropout=0.0,
):
    """Defines the bidirectional memory block in FSMN.

    Args:
      inputs: The output of the previous layer. [Batch, Time, Frequency]
      filter_size: memory block filter size
      mode: Training or Evaluation
      mask: A ``tf.Tensor`` applied to the memory block output

    Returns:
      output: 3-D tensor ([Batch, Time, Frequency])
    """
    static_shape = inputs.get_shape().as_list()
    depth = static_shape[-1]
    inputs = tf.expand_dims(inputs, axis=1)  # [Batch, 1, Time, Frequency]
    depthwise_filter = tf.get_variable(
        'depth_conv_w',
        shape=[1, filter_size, depth, 1],
        initializer=tf.glorot_uniform_initializer(),
        dtype=tf.float32)
    memory = tf.nn.depthwise_conv2d(
        input=inputs,
        filter=depthwise_filter,
        strides=[1, 1, 1, 1],
        padding='SAME',
        rate=[1, 1],
        data_format='NHWC')
    memory = memory + inputs
    output = tf.layers.dropout(memory, rate=dropout, training=mode)
    output = tf.reshape(
        output,
        [tf.shape(output)[0], tf.shape(output)[2], depth])
    if mask is not None:
        output = output * tf.expand_dims(mask, -1)

    return output


def MemoryBlockV2(
    inputs,
    filter_size,
    mode,
    shift=0,
    mask=None,
    dropout=0.0,
):
    """Defines the bidirectional memory block in FSMN.

    Args:
      inputs: The output of the previous layer. [Batch, Time, Frequency]
      filter_size: memory block filter size
      mode: Training or Evaluation
      shift: left padding, to control delay
      mask: A ``tf.Tensor`` applied to the memory block output

    Returns:
      output: 3-D tensor ([Batch, Time, Frequency])
    """
    if mask is not None:
        inputs = inputs * tf.expand_dims(mask, -1)

    static_shape = inputs.get_shape().as_list()
    depth = static_shape[-1]
    # padding
    left_padding = int(round((filter_size - 1) / 2))
    right_padding = int((filter_size - 1) / 2)
    if shift > 0:
        left_padding = left_padding + shift
        right_padding = right_padding - shift
    pad_inputs = pad_in_time(inputs, [left_padding, right_padding])
    pad_inputs = tf.expand_dims(
        pad_inputs, axis=1)  # [Batch, 1, Time, Frequency]
    depthwise_filter = tf.get_variable(
        'depth_conv_w',
        shape=[1, filter_size, depth, 1],
        initializer=tf.glorot_uniform_initializer(),
        dtype=tf.float32)
    memory = tf.nn.depthwise_conv2d(
        input=pad_inputs,
        filter=depthwise_filter,
        strides=[1, 1, 1, 1],
        padding='VALID',
        rate=[1, 1],
        data_format='NHWC')
    memory = tf.reshape(
        memory,
        [tf.shape(memory)[0], tf.shape(memory)[2], depth])
    memory = memory + inputs
    output = tf.layers.dropout(memory, rate=dropout, training=mode)
    if mask is not None:
        output = output * tf.expand_dims(mask, -1)

    return output


def UniMemoryBlock(
    inputs,
    filter_size,
    mode,
    cache=None,
    mask=None,
    dropout=0.0,
):
    """Defines the unidirectional memory block in FSMN.

    Args:
      inputs: The output of the previous layer. [Batch, Time, Frequency]
      filter_size: memory block filter size
      cache: for streaming inference
      mode: Training or Evaluation
      mask: A ``tf.Tensor`` applied to the memory block output
      dropout: dropout factor

    Returns:
      output: 3-D tensor ([Batch, Time, Frequency])
    """
    if cache is not None:
        static_shape = cache['queries'].get_shape().as_list()
        depth = static_shape[-1]
        queries = tf.slice(cache['queries'], [0, 1, 0], [
            tf.shape(cache['queries'])[0],
            tf.shape(cache['queries'])[1] - 1, depth
        ])
        queries = tf.concat([queries, inputs], axis=1)
        cache['queries'] = queries
    else:
        padding_length = filter_size - 1
        queries = pad_in_time(inputs, [padding_length, 0])

    queries = tf.expand_dims(queries, axis=1)  # [Batch, 1, Time, Frequency]
    static_shape = queries.get_shape().as_list()
    depth = static_shape[-1]
    depthwise_filter = tf.get_variable(
        'depth_conv_w',
        shape=[1, filter_size, depth, 1],
        initializer=tf.glorot_uniform_initializer(),
        dtype=tf.float32)
    memory = tf.nn.depthwise_conv2d(
        input=queries,
        filter=depthwise_filter,
        strides=[1, 1, 1, 1],
        padding='VALID',
        rate=[1, 1],
        data_format='NHWC')
    memory = tf.reshape(
        memory,
        [tf.shape(memory)[0], tf.shape(memory)[2], depth])
    memory = memory + inputs
    output = tf.layers.dropout(memory, rate=dropout, training=mode)
    if mask is not None:
        output = output * tf.expand_dims(mask, -1)

    return output
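The unidirectional memory block above is, in essence, a depthwise convolution over time with left-only (causal) padding plus a residual add; an equivalent PyTorch sketch for intuition (hypothetical shapes, not the file's API):

# Sketch: causal depthwise "memory" over time with a residual connection.
import torch
import torch.nn.functional as F

B, T, D, filter_size = 2, 50, 64, 11
x = torch.randn(B, T, D)
w = torch.randn(D, 1, filter_size)                   # one filter per channel
xt = F.pad(x.transpose(1, 2), (filter_size - 1, 0))  # left pad only: causal
memory = F.conv1d(xt, w, groups=D).transpose(1, 2) + x  # residual add
assert memory.shape == (B, T, D)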
178
modelscope/models/audio/tts/am/models/fsmn_encoder.py
Executable file
@@ -0,0 +1,178 @@
import tensorflow as tf

from . import fsmn


class FsmnEncoder():
    """Encoder using FSMN."""

    def __init__(self,
                 filter_size,
                 fsmn_num_layers,
                 dnn_num_layers,
                 num_memory_units=512,
                 ffn_inner_dim=2048,
                 dropout=0.0,
                 position_encoder=None):
        """Initializes the parameters of the encoder.

        Args:
          filter_size: the total order of the memory block
          fsmn_num_layers: The number of FSMN layers.
          dnn_num_layers: The number of DNN layers.
          num_memory_units: The number of memory units.
          ffn_inner_dim: The number of units of the inner linear transformation
            in the feed forward layer.
          dropout: The probability to drop units from the outputs.
          position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
            apply on inputs or ``None``.
        """
        super(FsmnEncoder, self).__init__()
        self.filter_size = filter_size
        self.fsmn_num_layers = fsmn_num_layers
        self.dnn_num_layers = dnn_num_layers
        self.num_memory_units = num_memory_units
        self.ffn_inner_dim = ffn_inner_dim
        self.dropout = dropout
        self.position_encoder = position_encoder

    def encode(self, inputs, sequence_length=None, mode=True):
        if self.position_encoder is not None:
            inputs = self.position_encoder(inputs)

        inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)

        mask = fsmn.build_sequence_mask(
            sequence_length, maximum_length=tf.shape(inputs)[1])

        state = ()

        for layer in range(self.fsmn_num_layers):
            with tf.variable_scope('fsmn_layer_{}'.format(layer)):
                with tf.variable_scope('ffn'):
                    context = fsmn.feed_forward(
                        inputs,
                        self.ffn_inner_dim,
                        self.num_memory_units,
                        mode,
                        dropout=self.dropout)

                with tf.variable_scope('memory'):
                    memory = fsmn.MemoryBlock(
                        context,
                        self.filter_size,
                        mode,
                        mask=mask,
                        dropout=self.dropout)

                memory = fsmn.drop_and_add(
                    inputs, memory, mode, dropout=self.dropout)

                inputs = memory
                state += (tf.reduce_mean(inputs, axis=1), )

        for layer in range(self.dnn_num_layers):
            with tf.variable_scope('dnn_layer_{}'.format(layer)):
                transformed = fsmn.feed_forward(
                    inputs,
                    self.ffn_inner_dim,
                    self.num_memory_units,
                    mode,
                    dropout=self.dropout)

                inputs = transformed
                state += (tf.reduce_mean(inputs, axis=1), )

        outputs = inputs
        return (outputs, state, sequence_length)


class FsmnEncoderV2():
    """Encoder using FSMN."""

    def __init__(self,
                 filter_size,
                 fsmn_num_layers,
                 dnn_num_layers,
                 num_memory_units=512,
                 ffn_inner_dim=2048,
                 dropout=0.0,
                 shift=0,
                 position_encoder=None):
        """Initializes the parameters of the encoder.

        Args:
          filter_size: the total order of the memory block
          fsmn_num_layers: The number of FSMN layers.
          dnn_num_layers: The number of DNN layers.
          num_memory_units: The number of memory units.
          ffn_inner_dim: The number of units of the inner linear transformation
            in the feed forward layer.
          dropout: The probability to drop units from the outputs.
          shift: left padding, to control delay
          position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
            apply on inputs or ``None``.
        """
        super(FsmnEncoderV2, self).__init__()
        self.filter_size = filter_size
        self.fsmn_num_layers = fsmn_num_layers
        self.dnn_num_layers = dnn_num_layers
        self.num_memory_units = num_memory_units
        self.ffn_inner_dim = ffn_inner_dim
        self.dropout = dropout
        self.shift = shift
        if not isinstance(shift, list):
            self.shift = [shift for _ in range(self.fsmn_num_layers)]
        self.position_encoder = position_encoder

    def encode(self, inputs, sequence_length=None, mode=True):
        if self.position_encoder is not None:
            inputs = self.position_encoder(inputs)

        inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)

        mask = fsmn.build_sequence_mask(
            sequence_length, maximum_length=tf.shape(inputs)[1])

        state = ()
        for layer in range(self.fsmn_num_layers):
            with tf.variable_scope('fsmn_layer_{}'.format(layer)):
                with tf.variable_scope('ffn'):
                    context = fsmn.feed_forward(
                        inputs,
                        self.ffn_inner_dim,
                        self.num_memory_units,
                        mode,
                        dropout=self.dropout)

                with tf.variable_scope('memory'):
                    memory = fsmn.MemoryBlockV2(
                        context,
                        self.filter_size,
                        mode,
                        shift=self.shift[layer],
                        mask=mask,
                        dropout=self.dropout)

                memory = fsmn.drop_and_add(
                    inputs, memory, mode, dropout=self.dropout)

                inputs = memory
                state += (tf.reduce_mean(inputs, axis=1), )

        for layer in range(self.dnn_num_layers):
            with tf.variable_scope('dnn_layer_{}'.format(layer)):
                transformed = fsmn.feed_forward(
                    inputs,
                    self.ffn_inner_dim,
                    self.num_memory_units,
                    mode,
                    dropout=self.dropout)

                inputs = transformed
                state += (tf.reduce_mean(inputs, axis=1), )

        outputs = inputs
        return (outputs, state, sequence_length)
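A hedged construction sketch (illustrative values, not from the commit): in FsmnEncoderV2 a scalar `shift` is broadcast to one shift per FSMN layer, so a low-latency stack could look like:

# encoder = FsmnEncoderV2(filter_size=11, fsmn_num_layers=6,
#                         dnn_num_layers=1, num_memory_units=512,
#                         shift=1)  # becomes [1, 1, 1, 1, 1, 1]
# outputs, state, lengths = encoder.encode(inputs, sequence_length=lengths)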
160
modelscope/models/audio/tts/am/models/helpers.py
Executable file
@@ -0,0 +1,160 @@
import numpy as np
import tensorflow as tf
from tensorflow.contrib.seq2seq import Helper


class VarTestHelper(Helper):

    def __init__(self, batch_size, inputs, dim):
        with tf.name_scope('VarTestHelper'):
            self._batch_size = batch_size
            self._inputs = inputs
            self._dim = dim

            num_steps = tf.shape(self._inputs)[1]
            self._lengths = tf.tile([num_steps], [self._batch_size])

            self._inputs = tf.roll(inputs, shift=-1, axis=1)
            self._init_inputs = inputs[:, 0, :]

    @property
    def batch_size(self):
        return self._batch_size

    @property
    def sample_ids_shape(self):
        return tf.TensorShape([])

    @property
    def sample_ids_dtype(self):
        return np.int32

    def initialize(self, name=None):
        return (tf.tile([False], [self._batch_size]),
                _go_frames(self._batch_size, self._dim, self._init_inputs))

    def sample(self, time, outputs, state, name=None):
        return tf.tile([0], [self._batch_size])  # Return all 0; we ignore them

    def next_inputs(self, time, outputs, state, sample_ids, name=None):
        with tf.name_scope('VarTestHelper'):
            finished = (time + 1 >= self._lengths)
            next_inputs = tf.concat([outputs, self._inputs[:, time, :]],
                                    axis=-1)
            return (finished, next_inputs, state)


class VarTrainingHelper(Helper):

    def __init__(self, targets, inputs, dim):
        with tf.name_scope('VarTrainingHelper'):
            self._targets = targets  # [N, T_in, 1]
            self._batch_size = tf.shape(inputs)[0]  # N
            self._inputs = inputs
            self._dim = dim

            num_steps = tf.shape(self._targets)[1]
            self._lengths = tf.tile([num_steps], [self._batch_size])

            self._inputs = tf.roll(inputs, shift=-1, axis=1)
            self._init_inputs = inputs[:, 0, :]

    @property
    def batch_size(self):
        return self._batch_size

    @property
    def sample_ids_shape(self):
        return tf.TensorShape([])

    @property
    def sample_ids_dtype(self):
        return np.int32

    def initialize(self, name=None):
        return (tf.tile([False], [self._batch_size]),
                _go_frames(self._batch_size, self._dim, self._init_inputs))

    def sample(self, time, outputs, state, name=None):
        return tf.tile([0], [self._batch_size])  # Return all 0; we ignore them

    def next_inputs(self, time, outputs, state, sample_ids, name=None):
        with tf.name_scope(name or 'VarTrainingHelper'):
            finished = (time + 1 >= self._lengths)
            next_inputs = tf.concat(
                [self._targets[:, time, :], self._inputs[:, time, :]], axis=-1)
            return (finished, next_inputs, state)


class VarTrainingSSHelper(Helper):

    def __init__(self, targets, inputs, dim, global_step, schedule_begin,
                 alpha, decay_steps):
        with tf.name_scope('VarTrainingSSHelper'):
            self._targets = targets  # [N, T_in, 1]
            self._batch_size = tf.shape(inputs)[0]  # N
            self._inputs = inputs
            self._dim = dim

            num_steps = tf.shape(self._targets)[1]
            self._lengths = tf.tile([num_steps], [self._batch_size])

            self._inputs = tf.roll(inputs, shift=-1, axis=1)
            self._init_inputs = inputs[:, 0, :]

            # for scheduled sampling
            self._global_step = global_step
            self._schedule_begin = schedule_begin
            self._alpha = alpha
            self._decay_steps = decay_steps

    @property
    def batch_size(self):
        return self._batch_size

    @property
    def sample_ids_shape(self):
        return tf.TensorShape([])

    @property
    def sample_ids_dtype(self):
        return np.int32

    def initialize(self, name=None):
        self._ratio = _tf_decay(self._global_step, self._schedule_begin,
                                self._alpha, self._decay_steps)
        return (tf.tile([False], [self._batch_size]),
                _go_frames(self._batch_size, self._dim, self._init_inputs))

    def sample(self, time, outputs, state, name=None):
        return tf.tile([0], [self._batch_size])  # Return all 0; we ignore them

    def next_inputs(self, time, outputs, state, sample_ids, name=None):
        with tf.name_scope(name or 'VarTrainingHelper'):
            finished = (time + 1 >= self._lengths)
            next_inputs_tmp = tf.cond(
                tf.less(
                    tf.random_uniform([], minval=0, maxval=1,
                                      dtype=tf.float32), self._ratio),
                lambda: self._targets[:, time, :], lambda: outputs)
            next_inputs = tf.concat(
                [next_inputs_tmp, self._inputs[:, time, :]], axis=-1)
            return (finished, next_inputs, state)


def _go_frames(batch_size, dim, init_inputs):
    '''Returns all-zero <GO> frames for a given batch size and output dimension.'''
    return tf.concat([tf.tile([[0.0]], [batch_size, dim]), init_inputs],
                     axis=-1)


def _tf_decay(global_step, schedule_begin, alpha, decay_steps):
    tfr = tf.train.exponential_decay(
        1.0,
        global_step=global_step - schedule_begin,
        decay_steps=decay_steps,
        decay_rate=alpha,
        name='tfr_decay')
    final_tfr = tf.cond(
        tf.less(global_step, schedule_begin), lambda: 1.0, lambda: tfr)
    return final_tfr
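The scheduled-sampling ratio computed by `_tf_decay` can be reproduced in plain Python (a sketch; `alpha` is the decay rate applied per `decay_steps` window):

# Sketch: teacher-forcing ratio, equivalent to _tf_decay with staircase off.
def ratio(global_step, schedule_begin, alpha, decay_steps):
    if global_step < schedule_begin:
        return 1.0  # always teacher-force before the schedule starts
    return alpha ** ((global_step - schedule_begin) / decay_steps)

# e.g. alpha=0.5, decay_steps=10000: the ratio halves every 10k steps
assert ratio(5000, 10000, 0.5, 10000) == 1.0
assert abs(ratio(20000, 10000, 0.5, 10000) - 0.5) < 1e-9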
461
modelscope/models/audio/tts/am/models/modules.py
Executable file
@@ -0,0 +1,461 @@
import tensorflow as tf
from tensorflow.contrib.cudnn_rnn import CudnnLSTM
from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
from tensorflow.contrib.rnn import LSTMBlockCell


def encoder_prenet(inputs,
                   n_conv_layers,
                   filters,
                   kernel_size,
                   dense_units,
                   is_training,
                   mask=None,
                   scope='encoder_prenet'):
    x = inputs
    with tf.variable_scope(scope):
        for i in range(n_conv_layers):
            x = conv1d(
                x,
                filters,
                kernel_size,
                is_training,
                activation=tf.nn.relu,
                dropout=True,
                mask=mask,
                scope='conv1d_{}'.format(i))
        x = tf.layers.dense(
            x, units=dense_units, activation=None, name='dense')
    return x


def decoder_prenet(inputs,
                   prenet_units,
                   dense_units,
                   is_training,
                   scope='decoder_prenet'):
    x = inputs
    with tf.variable_scope(scope):
        for i, units in enumerate(prenet_units):
            x = tf.layers.dense(
                x,
                units=units,
                activation=tf.nn.relu,
                name='dense_{}'.format(i))
            x = tf.layers.dropout(
                x, rate=0.5, training=is_training, name='dropout_{}'.format(i))
        x = tf.layers.dense(
            x, units=dense_units, activation=None, name='dense')
    return x


def encoder(inputs,
            input_lengths,
            n_conv_layers,
            filters,
            kernel_size,
            lstm_units,
            is_training,
            embedded_inputs_speaker,
            mask=None,
            scope='encoder'):
    with tf.variable_scope(scope):
        x = conv_and_lstm(
            inputs,
            input_lengths,
            n_conv_layers,
            filters,
            kernel_size,
            lstm_units,
            is_training,
            embedded_inputs_speaker,
            mask=mask)
    return x


def prenet(inputs, prenet_units, is_training, scope='prenet'):
    x = inputs
    with tf.variable_scope(scope):
        for i, units in enumerate(prenet_units):
            x = tf.layers.dense(
                x,
                units=units,
                activation=tf.nn.relu,
                name='dense_{}'.format(i))
            x = tf.layers.dropout(
                x, rate=0.5, training=is_training, name='dropout_{}'.format(i))
    return x


def postnet_residual_ulstm(inputs,
                           n_conv_layers,
                           filters,
                           kernel_size,
                           lstm_units,
                           output_units,
                           is_training,
                           scope='postnet_residual_ulstm'):
    with tf.variable_scope(scope):
        x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size,
                           lstm_units, is_training)
        x = conv1d(
            x,
            output_units,
            kernel_size,
            is_training,
            activation=None,
            dropout=False,
            scope='conv1d_{}'.format(n_conv_layers - 1))
    return x


def postnet_residual_lstm(inputs,
                          n_conv_layers,
                          filters,
                          kernel_size,
                          lstm_units,
                          output_units,
                          is_training,
                          scope='postnet_residual_lstm'):
    with tf.variable_scope(scope):
        x = conv_and_lstm(inputs, None, n_conv_layers, filters, kernel_size,
                          lstm_units, is_training)
        x = conv1d(
            x,
            output_units,
            kernel_size,
            is_training,
            activation=None,
            dropout=False,
            scope='conv1d_{}'.format(n_conv_layers - 1))
    return x


def postnet_linear_ulstm(inputs,
                         n_conv_layers,
                         filters,
                         kernel_size,
                         lstm_units,
                         output_units,
                         is_training,
                         scope='postnet_linear'):
    with tf.variable_scope(scope):
        x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size,
                           lstm_units, is_training)
        x = tf.layers.dense(x, units=output_units)
    return x


def postnet_linear_lstm(inputs,
                        n_conv_layers,
                        filters,
                        kernel_size,
                        lstm_units,
                        output_units,
                        output_lengths,
                        is_training,
                        embedded_inputs_speaker2,
                        mask=None,
                        scope='postnet_linear'):
    with tf.variable_scope(scope):
        x = conv_and_lstm_dec(
            inputs,
            output_lengths,
            n_conv_layers,
            filters,
            kernel_size,
            lstm_units,
            is_training,
            embedded_inputs_speaker2,
            mask=mask)
        x = tf.layers.dense(x, units=output_units)
    return x


def postnet_linear(inputs,
                   n_conv_layers,
                   filters,
                   kernel_size,
                   lstm_units,
                   output_units,
                   output_lengths,
                   is_training,
                   embedded_inputs_speaker2,
                   mask=None,
                   scope='postnet_linear'):
    with tf.variable_scope(scope):
        x = conv_dec(
            inputs,
            output_lengths,
            n_conv_layers,
            filters,
            kernel_size,
            lstm_units,
            is_training,
            embedded_inputs_speaker2,
            mask=mask)
    return x


def conv_and_lstm(inputs,
                  sequence_lengths,
                  n_conv_layers,
                  filters,
                  kernel_size,
                  lstm_units,
                  is_training,
                  embedded_inputs_speaker,
                  mask=None,
                  scope='conv_and_lstm'):
    x = inputs
    with tf.variable_scope(scope):
        for i in range(n_conv_layers):
            x = conv1d(
                x,
                filters,
                kernel_size,
                is_training,
                activation=tf.nn.relu,
                dropout=True,
                mask=mask,
                scope='conv1d_{}'.format(i))

        x = tf.concat([x, embedded_inputs_speaker], axis=2)

        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            LSTMBlockCell(lstm_units),
            LSTMBlockCell(lstm_units),
            x,
            sequence_length=sequence_lengths,
            dtype=tf.float32)
        x = tf.concat(outputs, axis=-1)

    return x


def conv_and_lstm_dec(inputs,
                      sequence_lengths,
                      n_conv_layers,
                      filters,
                      kernel_size,
                      lstm_units,
                      is_training,
                      embedded_inputs_speaker2,
                      mask=None,
                      scope='conv_and_lstm'):
    x = inputs
    with tf.variable_scope(scope):
        for i in range(n_conv_layers):
            x = conv1d(
                x,
                filters,
                kernel_size,
                is_training,
                activation=tf.nn.relu,
                dropout=True,
                mask=mask,
                scope='conv1d_{}'.format(i))

        x = tf.concat([x, embedded_inputs_speaker2], axis=2)

        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            LSTMBlockCell(lstm_units),
            LSTMBlockCell(lstm_units),
            x,
            sequence_length=sequence_lengths,
            dtype=tf.float32)
        x = tf.concat(outputs, axis=-1)
    return x


def conv_dec(inputs,
             sequence_lengths,
             n_conv_layers,
             filters,
             kernel_size,
             lstm_units,
             is_training,
             embedded_inputs_speaker2,
             mask=None,
             scope='conv_and_lstm'):
    x = inputs
    with tf.variable_scope(scope):
        for i in range(n_conv_layers):
            x = conv1d(
                x,
                filters,
                kernel_size,
                is_training,
                activation=tf.nn.relu,
                dropout=True,
                mask=mask,
                scope='conv1d_{}'.format(i))
        x = tf.concat([x, embedded_inputs_speaker2], axis=2)
    return x


def conv_and_ulstm(inputs,
                   sequence_lengths,
                   n_conv_layers,
                   filters,
                   kernel_size,
                   lstm_units,
                   is_training,
                   scope='conv_and_ulstm'):
    x = inputs
    with tf.variable_scope(scope):
        for i in range(n_conv_layers):
            x = conv1d(
                x,
                filters,
                kernel_size,
                is_training,
                activation=tf.nn.relu,
                dropout=True,
                scope='conv1d_{}'.format(i))

        outputs, states = tf.nn.dynamic_rnn(
            LSTMBlockCell(lstm_units),
            x,
            sequence_length=sequence_lengths,
            dtype=tf.float32)

    return outputs


def conv1d(inputs,
           filters,
           kernel_size,
           is_training,
           activation=None,
           dropout=False,
           mask=None,
           scope='conv1d'):
    with tf.variable_scope(scope):
        if mask is not None:
            inputs = inputs * tf.expand_dims(mask, -1)
        x = tf.layers.conv1d(
            inputs, filters=filters, kernel_size=kernel_size, padding='same')
        if mask is not None:
            x = x * tf.expand_dims(mask, -1)

        x = tf.layers.batch_normalization(x, training=is_training)
        if activation is not None:
            x = activation(x)
        if dropout:
            x = tf.layers.dropout(x, rate=0.5, training=is_training)
        return x


def conv1d_dp(inputs,
              filters,
              kernel_size,
              is_training,
              activation=None,
              dropout=False,
              dropoutrate=0.5,
              mask=None,
              scope='conv1d'):
    with tf.variable_scope(scope):
        if mask is not None:
            inputs = inputs * tf.expand_dims(mask, -1)
        x = tf.layers.conv1d(
            inputs, filters=filters, kernel_size=kernel_size, padding='same')
        if mask is not None:
            x = x * tf.expand_dims(mask, -1)

        x = tf.contrib.layers.layer_norm(x)
        if activation is not None:
            x = activation(x)
        if dropout:
            x = tf.layers.dropout(x, rate=dropoutrate, training=is_training)
        return x


def duration_predictor(inputs,
                       n_conv_layers,
                       filters,
                       kernel_size,
                       lstm_units,
                       input_lengths,
                       is_training,
                       embedded_inputs_speaker,
                       mask=None,
                       scope='duration_predictor'):
    with tf.variable_scope(scope):
        x = inputs
        for i in range(n_conv_layers):
            x = conv1d_dp(
                x,
                filters,
                kernel_size,
                is_training,
                activation=tf.nn.relu,
                dropout=True,
                dropoutrate=0.1,
                mask=mask,
                scope='conv1d_{}'.format(i))

        x = tf.concat([x, embedded_inputs_speaker], axis=2)

        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            LSTMBlockCell(lstm_units),
            LSTMBlockCell(lstm_units),
            x,
            sequence_length=input_lengths,
            dtype=tf.float32)
        x = tf.concat(outputs, axis=-1)

        x = tf.layers.dense(x, units=1)
        x = tf.nn.relu(x)
    return x


def duration_predictor2(inputs,
                        n_conv_layers,
                        filters,
                        kernel_size,
                        input_lengths,
                        is_training,
                        mask=None,
                        scope='duration_predictor'):
    with tf.variable_scope(scope):
        x = inputs
        for i in range(n_conv_layers):
            x = conv1d_dp(
                x,
                filters,
                kernel_size,
                is_training,
                activation=tf.nn.relu,
                dropout=True,
                dropoutrate=0.1,
                mask=mask,
                scope='conv1d_{}'.format(i))

        x = tf.layers.dense(x, units=1)
        x = tf.nn.relu(x)
    return x


def conv_prenet(inputs,
                n_conv_layers,
                filters,
                kernel_size,
                is_training,
                mask=None,
                scope='conv_prenet'):
    x = inputs
    with tf.variable_scope(scope):
        for i in range(n_conv_layers):
            x = conv1d(
                x,
                filters,
                kernel_size,
                is_training,
                activation=tf.nn.relu,
                dropout=True,
                mask=mask,
                scope='conv1d_{}'.format(i))

    return x
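A note on why `conv1d` above applies the mask both before and after the convolution: with 'same' padding, frames bleed across the padded region, so padded positions become nonzero after the conv and must be re-zeroed. A tiny NumPy sketch (illustrative only):

# Sketch: padding leakage through a 'same'-padded convolution.
import numpy as np

T, L = 6, 4                          # padded length, true length
mask = (np.arange(T) < L).astype(float)
x = np.ones(T) * mask                # zero out the padding
y = np.convolve(x, np.ones(3), mode='same')
print(y)          # positions >= L are nonzero after the conv...
print(y * mask)   # ...until the mask is applied again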
174
modelscope/models/audio/tts/am/models/position.py
Executable file
@@ -0,0 +1,174 @@
"""Define position encoder classes."""

import abc
import math

import tensorflow as tf

from .reducer import SumReducer


class PositionEncoder(tf.keras.layers.Layer):
    """Base class for position encoders."""

    def __init__(self, reducer=None, **kwargs):
        """Initializes the position encoder.

        Args:
          reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position
            encodings. Defaults to :class:`opennmt.layers.SumReducer`.
          **kwargs: Additional layer keyword arguments.
        """
        super(PositionEncoder, self).__init__(**kwargs)
        if reducer is None:
            reducer = SumReducer(dtype=kwargs.get('dtype'))
        self.reducer = reducer

    def call(self, inputs, position=None):  # pylint: disable=arguments-differ
        """Adds position encodings to :obj:`inputs`.

        Args:
          inputs: The inputs to encode.
          position: The single position to encode, to use when this layer is called
            step by step.

        Returns:
          A ``tf.Tensor`` whose shape depends on the configured ``reducer``.
        """
        batch_size = tf.shape(inputs)[0]
        timesteps = tf.shape(inputs)[1]
        input_dim = inputs.shape[-1].value
        positions = tf.range(timesteps) + 1 if position is None else [position]
        position_encoding = self._encode([positions], input_dim)
        position_encoding = tf.tile(position_encoding, [batch_size, 1, 1])
        return self.reducer([inputs, position_encoding])

    @abc.abstractmethod
    def _encode(self, positions, depth):
        """Creates position encodings.

        Args:
          positions: The positions to encode of shape :math:`[B, ...]`.
          depth: The encoding depth :math:`D`.

        Returns:
          A ``tf.Tensor`` of shape :math:`[B, ..., D]`.
        """
        raise NotImplementedError()


class PositionEmbedder(PositionEncoder):
    """Encodes position with a lookup table."""

    def __init__(self, maximum_position=128, reducer=None, **kwargs):
        """Initializes the position encoder.

        Args:
          maximum_position: The maximum position to embed. Positions greater
            than this value will be set to :obj:`maximum_position`.
          reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position
            encodings. Defaults to :class:`opennmt.layers.SumReducer`.
          **kwargs: Additional layer keyword arguments.
        """
        super(PositionEmbedder, self).__init__(reducer=reducer, **kwargs)
        self.maximum_position = maximum_position
        self.embedding = None

    def build(self, input_shape):
        shape = [self.maximum_position + 1, input_shape[-1]]
        self.embedding = self.add_weight('position_embedding', shape)
        super(PositionEmbedder, self).build(input_shape)

    def _encode(self, positions, depth):
        positions = tf.minimum(positions, self.maximum_position)
        return tf.nn.embedding_lookup(self.embedding, positions)


class SinusoidalPositionEncoder(PositionEncoder):
    """Encodes positions with sine waves as described in
    https://arxiv.org/abs/1706.03762.
    """

    def _encode(self, positions, depth):
        if depth % 2 != 0:
            raise ValueError(
                'SinusoidalPositionEncoder expects the depth to be divisible '
                'by 2 but got %d' % depth)

        batch_size = tf.shape(positions)[0]
        positions = tf.cast(positions, tf.float32)

        log_timescale_increment = math.log(10000) / (depth / 2 - 1)
        inv_timescales = tf.exp(
            tf.range(depth / 2, dtype=tf.float32) * -log_timescale_increment)
        inv_timescales = tf.reshape(
            tf.tile(inv_timescales, [batch_size]), [batch_size, depth // 2])
        scaled_time = tf.expand_dims(positions, -1) * tf.expand_dims(
            inv_timescales, 1)
        encoding = tf.concat(
            [tf.sin(scaled_time), tf.cos(scaled_time)], axis=2)
        return tf.cast(encoding, self.dtype)


class SinusodalPositionalEncoding(tf.keras.layers.Layer):

    def __init__(self, name='SinusodalPositionalEncoding'):
        super(SinusodalPositionalEncoding, self).__init__(name=name)

    @staticmethod
    def positional_encoding(len, dim, step=1.):
        """
        :param len: int scalar
        :param dim: int scalar
        :param step:
        :return: position embedding
        """
        pos_mat = tf.tile(
            tf.expand_dims(
                tf.range(0, tf.cast(len, dtype=tf.float32), dtype=tf.float32)
                * step,
                axis=-1), [1, dim])
        dim_mat = tf.tile(
            tf.expand_dims(
                tf.range(0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32),
                axis=0), [len, 1])
        dim_mat_int = tf.cast(dim_mat, dtype=tf.int32)
        pos_encoding = tf.where(  # [time, dims]
            tf.math.equal(tf.math.mod(dim_mat_int, 2), 0),
            x=tf.math.sin(
                pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))),
            y=tf.math.cos(pos_mat
                          / tf.pow(10000.,
                                   (dim_mat - 1) / tf.cast(dim, tf.float32))))
        return pos_encoding


class BatchSinusodalPositionalEncoding(tf.keras.layers.Layer):

    def __init__(self, name='BatchSinusodalPositionalEncoding'):
        super(BatchSinusodalPositionalEncoding, self).__init__(name=name)

    @staticmethod
    def positional_encoding(batch_size, len, dim, pos_mat, step=1.):
        """
        :param len: int scalar
        :param dim: int scalar
        :param step:
        :param pos_mat: [B, len] = [len, 1] * dim
        :return: position embedding
        """
        pos_mat = tf.tile(
            tf.expand_dims(tf.cast(pos_mat, dtype=tf.float32) * step, axis=-1),
            [1, 1, dim])  # [B, len, dim]

        dim_mat = tf.tile(
            tf.expand_dims(
                tf.expand_dims(
                    tf.range(
                        0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32),
                    axis=0),
                axis=0), [batch_size, len, 1])  # [B, len, dim]

        dim_mat_int = tf.cast(dim_mat, dtype=tf.int32)
        pos_encoding = tf.where(  # [B, time, dims]
            tf.math.equal(tf.mod(dim_mat_int, 2), 0),
            x=tf.math.sin(
                pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))),
            y=tf.math.cos(pos_mat
                          / tf.pow(10000.,
                                   (dim_mat - 1) / tf.cast(dim, tf.float32))))
        return pos_encoding
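The interleaved sin/cos rule used by the two encodings above (even dims get sin, odd dims get cos with the previous dim's timescale) in a short NumPy sketch, for checking values offline:

# Sketch: interleaved sinusoidal position encoding.
import numpy as np

def posenc(length, dim):
    pos = np.arange(length)[:, None]   # [T, 1]
    d = np.arange(dim)[None, :]        # [1, D]
    angle = pos / np.power(10000., np.where(d % 2 == 0, d, d - 1) / dim)
    return np.where(d % 2 == 0, np.sin(angle), np.cos(angle))

print(posenc(4, 8).shape)  # (4, 8)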
155
modelscope/models/audio/tts/am/models/reducer.py
Executable file
155
modelscope/models/audio/tts/am/models/reducer.py
Executable file
@@ -0,0 +1,155 @@
|
||||
"""Define reducers: objects that merge inputs."""
|
||||
|
||||
import abc
|
||||
import functools
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
def pad_in_time(x, padding_length):
|
||||
"""Helper function to pad a tensor in the time dimension and retain the static depth dimension."""
|
||||
return tf.pad(x, [[0, 0], [0, padding_length], [0, 0]])
|
||||
|
||||
|
||||
def align_in_time(x, length):
|
||||
"""Aligns the time dimension of :obj:`x` with :obj:`length`."""
|
||||
time_dim = tf.shape(x)[1]
|
||||
return tf.cond(
|
||||
tf.less(time_dim, length),
|
||||
true_fn=lambda: pad_in_time(x, length - time_dim),
|
||||
false_fn=lambda: x[:, :length])
|
||||
|
||||
|
||||
def pad_with_identity(x,
|
||||
sequence_length,
|
||||
max_sequence_length,
|
||||
identity_values=0,
|
||||
maxlen=None):
|
||||
"""Pads a tensor with identity values up to :obj:`max_sequence_length`.
|
||||
Args:
|
||||
x: A ``tf.Tensor`` of shape ``[batch_size, time, depth]``.
|
||||
sequence_length: The true sequence length of :obj:`x`.
|
||||
max_sequence_length: The sequence length up to which the tensor must contain
|
||||
:obj:`identity values`.
|
||||
identity_values: The identity value.
|
||||
maxlen: Size of the output time dimension. Default is the maximum value in
|
||||
obj:`max_sequence_length`.
|
||||
Returns:
|
||||
A ``tf.Tensor`` of shape ``[batch_size, maxlen, depth]``.
|
||||
"""
|
||||
if maxlen is None:
|
||||
maxlen = tf.reduce_max(max_sequence_length)
|
||||
|
||||
mask = tf.sequence_mask(sequence_length, maxlen=maxlen, dtype=x.dtype)
|
||||
mask = tf.expand_dims(mask, axis=-1)
|
||||
mask_combined = tf.sequence_mask(
|
||||
max_sequence_length, maxlen=maxlen, dtype=x.dtype)
|
||||
mask_combined = tf.expand_dims(mask_combined, axis=-1)
|
||||
|
||||
identity_mask = mask_combined * (1.0 - mask)
|
||||
|
||||
x = pad_in_time(x, maxlen - tf.shape(x)[1])
|
||||
x = x * mask + (identity_mask * identity_values)
|
||||
|
||||
return x
|
||||
|
||||
|
||||
def pad_n_with_identity(inputs, sequence_lengths, identity_values=0):
|
||||
"""Pads each input tensors with identity values up to
|
||||
``max(sequence_lengths)`` for each batch.
|
||||
Args:
|
||||
inputs: A list of ``tf.Tensor``.
|
||||
sequence_lengths: A list of sequence length.
|
||||
identity_values: The identity value.
|
||||
Returns:
|
||||
A tuple ``(padded, max_sequence_length)`` which are respectively a list of
|
||||
``tf.Tensor`` where each tensor are padded with identity and the combined
|
||||
sequence length.
|
||||
"""
|
||||
max_sequence_length = tf.reduce_max(sequence_lengths, axis=0)
|
||||
maxlen = tf.reduce_max([tf.shape(x)[1] for x in inputs])
|
||||
padded = [
|
||||
pad_with_identity(
|
||||
x,
|
||||
length,
|
||||
max_sequence_length,
|
||||
identity_values=identity_values,
|
||||
maxlen=maxlen) for x, length in zip(inputs, sequence_lengths)
|
||||
]
|
||||
return padded, max_sequence_length
|
||||
|
||||
|
||||
class Reducer(tf.keras.layers.Layer):
|
||||
"""Base class for reducers."""
|
||||
|
||||
def zip_and_reduce(self, x, y):
|
||||
"""Zips the :obj:`x` with :obj:`y` structures together and reduces all
|
||||
elements. If the structures are nested, they will be flattened first.
|
||||
Args:
|
||||
x: The first structure.
|
||||
y: The second structure.
|
||||
Returns:
|
||||
The same structure as :obj:`x` and :obj:`y` where each element from
|
||||
:obj:`x` is reduced with the correspond element from :obj:`y`.
|
||||
Raises:
|
||||
ValueError: if the two structures are not the same.
|
||||
"""
|
||||
tf.nest.assert_same_structure(x, y)
|
||||
x_flat = tf.nest.flatten(x)
|
||||
y_flat = tf.nest.flatten(y)
|
||||
reduced = list(map(self, zip(x_flat, y_flat)))
|
||||
return tf.nest.pack_sequence_as(x, reduced)
|
||||
|
||||
def call(self, inputs, sequence_length=None): # pylint: disable=arguments-differ
|
||||
"""Reduces all input elements.
|
||||
Args:
|
||||
inputs: A list of ``tf.Tensor``.
|
||||
sequence_length: The length of each input, if reducing sequences.
|
||||
Returns:
|
||||
If :obj:`sequence_length` is set, a tuple
|
||||
``(reduced_input, reduced_length)``, otherwise a reduced ``tf.Tensor``
|
||||
only.
|
||||
"""
|
||||
if sequence_length is None:
|
||||
return self.reduce(inputs)
|
||||
else:
|
||||
return self.reduce_sequence(
|
||||
inputs, sequence_lengths=sequence_length)
|
||||
|
||||
@abc.abstractmethod
|
||||
def reduce(self, inputs):
|
||||
"""See :meth:`opennmt.layers.Reducer.__call__`."""
|
||||
raise NotImplementedError()
|
||||
|
||||
@abc.abstractmethod
|
||||
def reduce_sequence(self, inputs, sequence_lengths):
|
||||
"""See :meth:`opennmt.layers.Reducer.__call__`."""
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
class SumReducer(Reducer):
|
||||
"""A reducer that sums the inputs."""
|
||||
|
||||
def reduce(self, inputs):
|
||||
if len(inputs) == 1:
|
||||
return inputs[0]
|
||||
if len(inputs) == 2:
|
||||
return inputs[0] + inputs[1]
|
||||
return tf.add_n(inputs)
|
||||
|
||||
def reduce_sequence(self, inputs, sequence_lengths):
|
||||
padded, combined_length = pad_n_with_identity(
|
||||
inputs, sequence_lengths, identity_values=0)
|
||||
return self.reduce(padded), combined_length
|
||||
|
||||
|
||||
class MultiplyReducer(Reducer):
|
||||
"""A reducer that multiplies the inputs."""
|
||||
|
||||
def reduce(self, inputs):
|
||||
return functools.reduce(lambda a, x: a * x, inputs)
|
||||
|
||||
def reduce_sequence(self, inputs, sequence_lengths):
|
||||
padded, combined_length = pad_n_with_identity(
|
||||
inputs, sequence_lengths, identity_values=1)
|
||||
return self.reduce(padded), combined_length
|
||||
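The reducers follow the OpenNMT-tf API: `reduce` merges tensors that are already aligned in time, while `reduce_sequence` first pads every input up to the combined length with the reducer's identity element (0 for sums, 1 for products) so padding cannot distort the merged values. A hedged usage sketch (TF 1.x graph mode; the shapes and lengths are illustrative):

```python
import tensorflow as tf

# Two [batch, time, depth] tensors with different valid lengths per example.
a = tf.random.normal([2, 5, 8])
b = tf.random.normal([2, 5, 8])
reducer = SumReducer()

# Without lengths: a plain element-wise sum.
summed = reducer([a, b])

# With lengths: each input is padded with 0 (the sum identity) up to the
# per-example combined length, then summed.
summed_seq, combined_length = reducer(
    [a, b], sequence_length=[[3, 5], [4, 2]])
```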
240
modelscope/models/audio/tts/am/models/rnn_wrappers.py
Executable file
@@ -0,0 +1,240 @@
import numpy as np
import tensorflow as tf
from tensorflow.contrib.rnn import RNNCell
from tensorflow.contrib.seq2seq import AttentionWrapperState
from tensorflow.python.ops import rnn_cell_impl

from .modules import prenet


class VarPredictorCell(RNNCell):
    '''RNN cell wrapper that adds a prenet and an output projection around a
    variance-predictor cell.'''

    def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
        super(VarPredictorCell, self).__init__()
        self._var_predictor_cell = var_predictor_cell
        self._is_training = is_training
        self._dim = dim
        self._prenet_units = prenet_units

    @property
    def state_size(self):
        return tuple([self.output_size, self._var_predictor_cell.state_size])

    @property
    def output_size(self):
        return self._dim

    def zero_state(self, batch_size, dtype):
        return tuple([
            rnn_cell_impl._zero_state_tensors(self.output_size, batch_size,
                                              dtype),
            self._var_predictor_cell.zero_state(batch_size, dtype)
        ])

    def call(self, inputs, state):
        '''Runs one step of the Tacotron2-style super decoder cell.'''
        super_cell_out, decoder_state = state

        # split: autoregressive feedback vs. encoder output
        prenet_input = inputs[:, 0:self._dim]
        encoder_output = inputs[:, self._dim:]

        # prenet and concat
        prenet_output = prenet(
            prenet_input,
            self._prenet_units,
            self._is_training,
            scope='var_prenet')
        decoder_input = tf.concat([prenet_output, encoder_output], axis=-1)

        # decoder LSTM/GRU
        new_super_cell_out, new_decoder_state = self._var_predictor_cell(
            decoder_input, decoder_state)

        # projection
        new_super_cell_out = tf.layers.dense(
            new_super_cell_out, units=self._dim)

        new_states = tuple([new_super_cell_out, new_decoder_state])

        return new_super_cell_out, new_states


class DurPredictorCell(RNNCell):
    '''Like :class:`VarPredictorCell`, but with a ReLU on the projected
    output so predicted (log) durations stay non-negative.'''

    def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
        super(DurPredictorCell, self).__init__()
        self._var_predictor_cell = var_predictor_cell
        self._is_training = is_training
        self._dim = dim
        self._prenet_units = prenet_units

    @property
    def state_size(self):
        return tuple([self.output_size, self._var_predictor_cell.state_size])

    @property
    def output_size(self):
        return self._dim

    def zero_state(self, batch_size, dtype):
        return tuple([
            rnn_cell_impl._zero_state_tensors(self.output_size, batch_size,
                                              dtype),
            self._var_predictor_cell.zero_state(batch_size, dtype)
        ])

    def call(self, inputs, state):
        '''Runs one step of the Tacotron2-style super decoder cell.'''
        super_cell_out, decoder_state = state

        # split
        prenet_input = inputs[:, 0:self._dim]
        encoder_output = inputs[:, self._dim:]

        # prenet and concat
        prenet_output = prenet(
            prenet_input,
            self._prenet_units,
            self._is_training,
            scope='dur_prenet')
        decoder_input = tf.concat([prenet_output, encoder_output], axis=-1)

        # decoder LSTM/GRU
        new_super_cell_out, new_decoder_state = self._var_predictor_cell(
            decoder_input, decoder_state)

        # projection
        new_super_cell_out = tf.layers.dense(
            new_super_cell_out, units=self._dim)
        new_super_cell_out = tf.nn.relu(new_super_cell_out)
        # new_super_cell_out = tf.log(tf.cast(tf.round(tf.exp(new_super_cell_out) - 1), tf.float32) + 1)

        new_states = tuple([new_super_cell_out, new_decoder_state])

        return new_super_cell_out, new_states


class DurPredictorCECell(RNNCell):
    '''Duration predictor cell that treats the duration as a class: the
    previous integer duration is one-hot encoded and embedded before the
    prenet, and the projection emits a softmax over ``max_dur`` classes.'''

    def __init__(self, var_predictor_cell, is_training, dim, prenet_units,
                 max_dur, dur_embedding_dim):
        super(DurPredictorCECell, self).__init__()
        self._var_predictor_cell = var_predictor_cell
        self._is_training = is_training
        self._dim = dim
        self._prenet_units = prenet_units
        self._max_dur = max_dur
        self._dur_embedding_dim = dur_embedding_dim

    @property
    def state_size(self):
        return tuple([self.output_size, self._var_predictor_cell.state_size])

    @property
    def output_size(self):
        return self._max_dur

    def zero_state(self, batch_size, dtype):
        return tuple([
            rnn_cell_impl._zero_state_tensors(self.output_size, batch_size,
                                              dtype),
            self._var_predictor_cell.zero_state(batch_size, dtype)
        ])

    def call(self, inputs, state):
        '''Runs one step of the Tacotron2-style super decoder cell.'''
        super_cell_out, decoder_state = state

        # split
        prenet_input = tf.squeeze(
            tf.cast(inputs[:, 0:self._dim], tf.int32), axis=-1)  # [N]
        prenet_input = tf.one_hot(
            prenet_input, self._max_dur, on_value=1.0, off_value=0.0,
            axis=-1)  # [N, 120]
        prenet_input = tf.layers.dense(
            prenet_input, units=self._dur_embedding_dim)
        encoder_output = inputs[:, self._dim:]

        # prenet and concat
        prenet_output = prenet(
            prenet_input,
            self._prenet_units,
            self._is_training,
            scope='dur_prenet')
        decoder_input = tf.concat([prenet_output, encoder_output], axis=-1)

        # decoder LSTM/GRU
        new_super_cell_out, new_decoder_state = self._var_predictor_cell(
            decoder_input, decoder_state)

        # projection
        new_super_cell_out = tf.layers.dense(
            new_super_cell_out, units=self._max_dur)  # [N, 120]
        new_super_cell_out = tf.nn.softmax(new_super_cell_out)  # [N, 120]

        new_states = tuple([new_super_cell_out, new_decoder_state])

        return new_super_cell_out, new_states


class VarPredictorCell2(RNNCell):
    '''Variant of :class:`VarPredictorCell` that applies a ReLU to the first
    output channel only.'''

    def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
        super(VarPredictorCell2, self).__init__()
        self._var_predictor_cell = var_predictor_cell
        self._is_training = is_training
        self._dim = dim
        self._prenet_units = prenet_units

    @property
    def state_size(self):
        return tuple([self.output_size, self._var_predictor_cell.state_size])

    @property
    def output_size(self):
        return self._dim

    def zero_state(self, batch_size, dtype):
        return tuple([
            rnn_cell_impl._zero_state_tensors(self.output_size, batch_size,
                                              dtype),
            self._var_predictor_cell.zero_state(batch_size, dtype)
        ])

    def call(self, inputs, state):
        '''Runs one step of the Tacotron2-style super decoder cell.'''
        super_cell_out, decoder_state = state

        # split
        prenet_input = inputs[:, 0:self._dim]
        encoder_output = inputs[:, self._dim:]

        # prenet and concat
        prenet_output = prenet(
            prenet_input,
            self._prenet_units,
            self._is_training,
            scope='var_prenet')
        decoder_input = tf.concat([prenet_output, encoder_output], axis=-1)

        # decoder LSTM/GRU
        new_super_cell_out, new_decoder_state = self._var_predictor_cell(
            decoder_input, decoder_state)

        # projection
        new_super_cell_out = tf.layers.dense(
            new_super_cell_out, units=self._dim)

        # split and relu
        new_super_cell_out = tf.concat([
            tf.nn.relu(new_super_cell_out[:, 0:1]), new_super_cell_out[:, 1:]
        ], axis=-1)  # yapf:disable

        new_states = tuple([new_super_cell_out, new_decoder_state])

        return new_super_cell_out, new_states
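All four wrappers share one pattern: the cell state carries the previous projected output next to the inner RNN state, the first `dim` input channels are the autoregressive feedback that goes through a prenet, and the remaining channels are the encoder frame. A hedged construction sketch in the style of the duration predictor in `robutrans.py` below (the unit counts are illustrative, and `prenet_units` is assumed to be whatever `modules.prenet` expects):

```python
import tensorflow as tf
from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell

# Two stacked LSTMs as the inner cell; the wrapper consumes
# [previous_duration ; encoder_frame] and emits a 1-dim log-duration.
inner_cell = MultiRNNCell(
    [LSTMBlockCell(128), LSTMBlockCell(128)], state_is_tuple=True)
dur_cell = DurPredictorCell(
    inner_cell, is_training=False, dim=1, prenet_units=[128, 128])
init_state = dur_cell.zero_state(batch_size=1, dtype=tf.float32)
```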
760
modelscope/models/audio/tts/am/models/robutrans.py
Executable file
@@ -0,0 +1,760 @@
import tensorflow as tf
from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell
from tensorflow.contrib.seq2seq import BasicDecoder
from tensorflow.python.ops.ragged.ragged_util import repeat

from .fsmn_encoder import FsmnEncoderV2
from .helpers import VarTestHelper, VarTrainingHelper
from .modules import conv_prenet, decoder_prenet, encoder_prenet
from .position import (BatchSinusodalPositionalEncoding,
                       SinusodalPositionalEncoding)
from .rnn_wrappers import DurPredictorCell, VarPredictorCell
from .self_attention_decoder import SelfAttentionDecoder
from .self_attention_encoder import SelfAttentionEncoder


class RobuTrans():

    def __init__(self, hparams):
        self._hparams = hparams

    def initialize(self,
                   inputs,
                   inputs_emotion,
                   inputs_speaker,
                   input_lengths,
                   output_lengths=None,
                   mel_targets=None,
                   durations=None,
                   pitch_contours=None,
                   uv_masks=None,
                   pitch_scales=None,
                   duration_scales=None,
                   energy_contours=None,
                   energy_scales=None):
        '''Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", "stop_token_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is the number
            of steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
            lengths of each sequence in inputs.
          output_lengths: int32 Tensor with shape [N] where N is batch size and values are the
            lengths of each sequence in outputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
            the number of steps in the output time series, M is num_mels, and values are
            entries in the mel spectrogram. Only needed for training.
        '''
        with tf.variable_scope('inference') as _:
            is_training = mel_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            input_mask = None
            if input_lengths is not None and is_training:
                input_mask = tf.sequence_mask(
                    input_lengths, tf.shape(inputs)[1], dtype=tf.float32)

            if input_mask is not None:
                inputs = inputs * tf.expand_dims(input_mask, -1)

            # speaker embedding
            embedded_inputs_speaker = tf.layers.dense(
                inputs_speaker,
                32,
                activation=None,
                use_bias=False,
                kernel_initializer=tf.truncated_normal_initializer(stddev=0.5))

            # emotion embedding
            embedded_inputs_emotion = tf.layers.dense(
                inputs_emotion,
                32,
                activation=None,
                use_bias=False,
                kernel_initializer=tf.truncated_normal_initializer(stddev=0.5))

            # symbol embedding
            with tf.variable_scope('Embedding'):
                embedded_inputs = tf.layers.dense(
                    inputs,
                    hp.embedding_dim,
                    activation=None,
                    use_bias=False,
                    kernel_initializer=tf.truncated_normal_initializer(
                        stddev=0.5))

            # Encoder
            with tf.variable_scope('Encoder'):
                Encoder = SelfAttentionEncoder(
                    num_layers=hp.encoder_num_layers,
                    num_units=hp.encoder_num_units,
                    num_heads=hp.encoder_num_heads,
                    ffn_inner_dim=hp.encoder_ffn_inner_dim,
                    dropout=hp.encoder_dropout,
                    attention_dropout=hp.encoder_attention_dropout,
                    relu_dropout=hp.encoder_relu_dropout)
                encoder_outputs, state_mo, sequence_length_mo, attns = Encoder.encode(
                    embedded_inputs,
                    sequence_length=input_lengths,
                    mode=is_training)
                encoder_outputs = tf.layers.dense(
                    encoder_outputs,
                    hp.encoder_projection_units,
                    activation=None,
                    use_bias=False,
                    kernel_initializer=tf.truncated_normal_initializer(
                        stddev=0.5))

            # pitch and energy
            var_inputs = tf.concat([
                encoder_outputs, embedded_inputs_speaker,
                embedded_inputs_emotion
            ], 2)
            if input_mask is not None:
                var_inputs = var_inputs * tf.expand_dims(input_mask, -1)

            with tf.variable_scope('Pitch_Predictor'):
                Pitch_Predictor_FSMN = FsmnEncoderV2(
                    filter_size=hp.predictor_filter_size,
                    fsmn_num_layers=hp.predictor_fsmn_num_layers,
                    dnn_num_layers=hp.predictor_dnn_num_layers,
                    num_memory_units=hp.predictor_num_memory_units,
                    ffn_inner_dim=hp.predictor_ffn_inner_dim,
                    dropout=hp.predictor_dropout,
                    shift=hp.predictor_shift,
                    position_encoder=None)
                pitch_contour_outputs, _, _ = Pitch_Predictor_FSMN.encode(
                    tf.concat([
                        encoder_outputs, embedded_inputs_speaker,
                        embedded_inputs_emotion
                    ], 2),
                    sequence_length=input_lengths,
                    mode=is_training)
                pitch_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                    LSTMBlockCell(hp.predictor_lstm_units),
                    LSTMBlockCell(hp.predictor_lstm_units),
                    pitch_contour_outputs,
                    sequence_length=input_lengths,
                    dtype=tf.float32)
                pitch_contour_outputs = tf.concat(
                    pitch_contour_outputs, axis=-1)
                pitch_contour_outputs = tf.layers.dense(
                    pitch_contour_outputs, units=1)  # [N, T_in, 1]
                pitch_contour_outputs = tf.squeeze(
                    pitch_contour_outputs, axis=2)  # [N, T_in]

            with tf.variable_scope('Energy_Predictor'):
                Energy_Predictor_FSMN = FsmnEncoderV2(
                    filter_size=hp.predictor_filter_size,
                    fsmn_num_layers=hp.predictor_fsmn_num_layers,
                    dnn_num_layers=hp.predictor_dnn_num_layers,
                    num_memory_units=hp.predictor_num_memory_units,
                    ffn_inner_dim=hp.predictor_ffn_inner_dim,
                    dropout=hp.predictor_dropout,
                    shift=hp.predictor_shift,
                    position_encoder=None)
                energy_contour_outputs, _, _ = Energy_Predictor_FSMN.encode(
                    tf.concat([
                        encoder_outputs, embedded_inputs_speaker,
                        embedded_inputs_emotion
                    ], 2),
                    sequence_length=input_lengths,
                    mode=is_training)
                energy_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                    LSTMBlockCell(hp.predictor_lstm_units),
                    LSTMBlockCell(hp.predictor_lstm_units),
                    energy_contour_outputs,
                    sequence_length=input_lengths,
                    dtype=tf.float32)
                energy_contour_outputs = tf.concat(
                    energy_contour_outputs, axis=-1)
                energy_contour_outputs = tf.layers.dense(
                    energy_contour_outputs, units=1)  # [N, T_in, 1]
                energy_contour_outputs = tf.squeeze(
                    energy_contour_outputs, axis=2)  # [N, T_in]

            if is_training:
                pitch_embeddings = tf.expand_dims(
                    pitch_contours, axis=2)  # [N, T_in, 1]
                pitch_embeddings = tf.layers.conv1d(
                    pitch_embeddings,
                    filters=hp.encoder_projection_units,
                    kernel_size=9,
                    padding='same',
                    name='pitch_embeddings')  # [N, T_in, 32]

                energy_embeddings = tf.expand_dims(
                    energy_contours, axis=2)  # [N, T_in, 1]
                energy_embeddings = tf.layers.conv1d(
                    energy_embeddings,
                    filters=hp.encoder_projection_units,
                    kernel_size=9,
                    padding='same',
                    name='energy_embeddings')  # [N, T_in, 32]
            else:
                pitch_contour_outputs *= pitch_scales
                pitch_embeddings = tf.expand_dims(
                    pitch_contour_outputs, axis=2)  # [N, T_in, 1]
                pitch_embeddings = tf.layers.conv1d(
                    pitch_embeddings,
                    filters=hp.encoder_projection_units,
                    kernel_size=9,
                    padding='same',
                    name='pitch_embeddings')  # [N, T_in, 32]

                energy_contour_outputs *= energy_scales
                energy_embeddings = tf.expand_dims(
                    energy_contour_outputs, axis=2)  # [N, T_in, 1]
                energy_embeddings = tf.layers.conv1d(
                    energy_embeddings,
                    filters=hp.encoder_projection_units,
                    kernel_size=9,
                    padding='same',
                    name='energy_embeddings')  # [N, T_in, 32]

            encoder_outputs_ = encoder_outputs + pitch_embeddings + energy_embeddings

            # duration
            dur_inputs = tf.concat([
                encoder_outputs_, embedded_inputs_speaker,
                embedded_inputs_emotion
            ], 2)
            if input_mask is not None:
                dur_inputs = dur_inputs * tf.expand_dims(input_mask, -1)
            with tf.variable_scope('Duration_Predictor'):
                duration_predictor_cell = MultiRNNCell([
                    LSTMBlockCell(hp.predictor_lstm_units),
                    LSTMBlockCell(hp.predictor_lstm_units)
                ], state_is_tuple=True)  # yapf:disable
                duration_output_cell = DurPredictorCell(
                    duration_predictor_cell, is_training, 1,
                    hp.predictor_prenet_units)
                duration_predictor_init_state = duration_output_cell.zero_state(
                    batch_size=batch_size, dtype=tf.float32)
                if is_training:
                    duration_helper = VarTrainingHelper(
                        tf.expand_dims(
                            tf.log(tf.cast(durations, tf.float32) + 1),
                            axis=2), dur_inputs, 1)
                else:
                    duration_helper = VarTestHelper(batch_size, dur_inputs, 1)
                (duration_outputs, _), final_duration_predictor_state, _ = \
                    tf.contrib.seq2seq.dynamic_decode(
                        BasicDecoder(duration_output_cell, duration_helper,
                                     duration_predictor_init_state),
                        maximum_iterations=1000)
                duration_outputs = tf.squeeze(
                    duration_outputs, axis=2)  # [N, T_in]
                if input_mask is not None:
                    duration_outputs = duration_outputs * input_mask
                duration_outputs_ = tf.exp(duration_outputs) - 1

            # Length Regulator
            with tf.variable_scope('Length_Regulator'):
                if is_training:
                    i = tf.constant(1)
                    # position embedding
                    j = tf.constant(1)
                    dur_len = tf.shape(durations)[-1]
                    embedded_position_i = tf.range(1, durations[0, 0] + 1)

                    def condition_pos(j, e):
                        return tf.less(j, dur_len)

                    def loop_body_pos(j, embedded_position_i):
                        embedded_position_i = tf.concat([
                            embedded_position_i,
                            tf.range(1, durations[0, j] + 1)
                        ], axis=0)  # yapf:disable
                        return [j + 1, embedded_position_i]

                    j, embedded_position_i = tf.while_loop(
                        condition_pos,
                        loop_body_pos, [j, embedded_position_i],
                        shape_invariants=[
                            j.get_shape(),
                            tf.TensorShape([None])
                        ])
                    embedded_position = tf.reshape(embedded_position_i,
                                                   (1, -1))

                    # others
                    LR_outputs = repeat(
                        encoder_outputs_[0:1, :, :], durations[0, :], axis=1)
                    embedded_outputs_speaker = repeat(
                        embedded_inputs_speaker[0:1, :, :],
                        durations[0, :],
                        axis=1)
                    embedded_outputs_emotion = repeat(
                        embedded_inputs_emotion[0:1, :, :],
                        durations[0, :],
                        axis=1)

                    def condition(i, pos, layer, s, e):
                        return tf.less(i, tf.shape(mel_targets)[0])

                    def loop_body(i, embedded_position, LR_outputs,
                                  embedded_outputs_speaker,
                                  embedded_outputs_emotion):
                        # position embedding
                        jj = tf.constant(1)
                        embedded_position_i = tf.range(1, durations[i, 0] + 1)

                        def condition_pos_i(j, e):
                            return tf.less(j, dur_len)

                        def loop_body_pos_i(j, embedded_position_i):
                            embedded_position_i = tf.concat([
                                embedded_position_i,
                                tf.range(1, durations[i, j] + 1)
                            ], axis=0)  # yapf:disable
                            return [j + 1, embedded_position_i]

                        jj, embedded_position_i = tf.while_loop(
                            condition_pos_i,
                            loop_body_pos_i, [jj, embedded_position_i],
                            shape_invariants=[
                                jj.get_shape(),
                                tf.TensorShape([None])
                            ])
                        embedded_position = tf.concat([
                            embedded_position,
                            tf.reshape(embedded_position_i, (1, -1))
                        ], 0)

                        # others
                        LR_outputs = tf.concat([
                            LR_outputs,
                            repeat(
                                encoder_outputs_[i:i + 1, :, :],
                                durations[i, :],
                                axis=1)
                        ], 0)
                        embedded_outputs_speaker = tf.concat([
                            embedded_outputs_speaker,
                            repeat(
                                embedded_inputs_speaker[i:i + 1, :, :],
                                durations[i, :],
                                axis=1)
                        ], 0)
                        embedded_outputs_emotion = tf.concat([
                            embedded_outputs_emotion,
                            repeat(
                                embedded_inputs_emotion[i:i + 1, :, :],
                                durations[i, :],
                                axis=1)
                        ], 0)
                        return [
                            i + 1, embedded_position, LR_outputs,
                            embedded_outputs_speaker, embedded_outputs_emotion
                        ]

                    (i, embedded_position, LR_outputs,
                     embedded_outputs_speaker,
                     embedded_outputs_emotion) = tf.while_loop(
                         condition,
                         loop_body, [
                             i, embedded_position, LR_outputs,
                             embedded_outputs_speaker, embedded_outputs_emotion
                         ],
                         shape_invariants=[
                             i.get_shape(),
                             tf.TensorShape([None, None]),
                             tf.TensorShape([None, None, None]),
                             tf.TensorShape([None, None, None]),
                             tf.TensorShape([None, None, None])
                         ],
                         parallel_iterations=hp.batch_size)

                    ori_framenum = tf.shape(mel_targets)[1]
                else:
                    # position
                    j = tf.constant(1)
                    dur_len = tf.shape(duration_outputs_)[-1]
                    embedded_position_i = tf.range(
                        1,
                        tf.cast(tf.round(duration_outputs_)[0, 0], tf.int32)
                        + 1)

                    def condition_pos(j, e):
                        return tf.less(j, dur_len)

                    def loop_body_pos(j, embedded_position_i):
                        embedded_position_i = tf.concat([
                            embedded_position_i,
                            tf.range(
                                1,
                                tf.cast(
                                    tf.round(duration_outputs_)[0, j],
                                    tf.int32) + 1)
                        ], axis=0)  # yapf:disable
                        return [j + 1, embedded_position_i]

                    j, embedded_position_i = tf.while_loop(
                        condition_pos,
                        loop_body_pos, [j, embedded_position_i],
                        shape_invariants=[
                            j.get_shape(),
                            tf.TensorShape([None])
                        ])
                    embedded_position = tf.reshape(embedded_position_i,
                                                   (1, -1))
                    # others
                    duration_outputs_ *= duration_scales
                    LR_outputs = repeat(
                        encoder_outputs_[0:1, :, :],
                        tf.cast(tf.round(duration_outputs_)[0, :], tf.int32),
                        axis=1)
                    embedded_outputs_speaker = repeat(
                        embedded_inputs_speaker[0:1, :, :],
                        tf.cast(tf.round(duration_outputs_)[0, :], tf.int32),
                        axis=1)
                    embedded_outputs_emotion = repeat(
                        embedded_inputs_emotion[0:1, :, :],
                        tf.cast(tf.round(duration_outputs_)[0, :], tf.int32),
                        axis=1)
                    ori_framenum = tf.shape(LR_outputs)[1]

                left = hp.outputs_per_step - tf.mod(
                    ori_framenum, hp.outputs_per_step)
                LR_outputs = tf.cond(
                    tf.equal(left,
                             hp.outputs_per_step), lambda: LR_outputs,
                    lambda: tf.pad(LR_outputs, [[0, 0], [0, left], [0, 0]],
                                   'CONSTANT'))
                embedded_outputs_speaker = tf.cond(
                    tf.equal(left, hp.outputs_per_step),
                    lambda: embedded_outputs_speaker, lambda: tf.pad(
                        embedded_outputs_speaker, [[0, 0], [0, left],
                                                   [0, 0]], 'CONSTANT'))
                embedded_outputs_emotion = tf.cond(
                    tf.equal(left, hp.outputs_per_step),
                    lambda: embedded_outputs_emotion, lambda: tf.pad(
                        embedded_outputs_emotion, [[0, 0], [0, left],
                                                   [0, 0]], 'CONSTANT'))
                embedded_position = tf.cond(
                    tf.equal(left, hp.outputs_per_step),
                    lambda: embedded_position,
                    lambda: tf.pad(embedded_position, [[0, 0], [0, left]],
                                   'CONSTANT'))

            # Pos_Embedding
            with tf.variable_scope('Position_Embedding'):
                Pos_Embedding = BatchSinusodalPositionalEncoding()
                position_embeddings = Pos_Embedding.positional_encoding(
                    batch_size,
                    tf.shape(LR_outputs)[1], hp.encoder_projection_units,
                    embedded_position)
                LR_outputs += position_embeddings

            # multi-frame
            LR_outputs = tf.reshape(LR_outputs, [
                batch_size, -1,
                hp.outputs_per_step * hp.encoder_projection_units
            ])
            embedded_outputs_speaker = tf.reshape(
                embedded_outputs_speaker,
                [batch_size, -1, hp.outputs_per_step * 32])[:, :, :32]
            embedded_outputs_emotion = tf.reshape(
                embedded_outputs_emotion,
                [batch_size, -1, hp.outputs_per_step * 32])[:, :, :32]
            # [N, T_out, D_LR_outputs] (D_LR_outputs = hp.outputs_per_step * hp.encoder_projection_units + 64)
            LR_outputs = tf.concat([
                LR_outputs, embedded_outputs_speaker, embedded_outputs_emotion
            ], -1)

            # auto bandwidth
            if is_training:
                durations_mask = tf.cast(durations,
                                         tf.float32) * input_mask  # [N, T_in]
            else:
                durations_mask = duration_outputs_
            X_band_width = tf.cast(
                tf.round(tf.reduce_max(durations_mask) / hp.outputs_per_step),
                tf.int32)
            H_band_width = X_band_width
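
            # Added commentary, not in the original file: the band width is
            # derived from the data itself. Each decoder step covers
            # hp.outputs_per_step frames, so the longest phone in the batch
            # spans about max(duration) / outputs_per_step decoder steps;
            # limiting the self-attention (X) and memory-attention (H)
            # windows to that size keeps attention near the diagonal without
            # ever cutting a single phone short.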
            with tf.variable_scope('Decoder'):
                Decoder = SelfAttentionDecoder(
                    num_layers=hp.decoder_num_layers,
                    num_units=hp.decoder_num_units,
                    num_heads=hp.decoder_num_heads,
                    ffn_inner_dim=hp.decoder_ffn_inner_dim,
                    dropout=hp.decoder_dropout,
                    attention_dropout=hp.decoder_attention_dropout,
                    relu_dropout=hp.decoder_relu_dropout,
                    prenet_units=hp.prenet_units,
                    dense_units=hp.prenet_proj_units,
                    num_mels=hp.num_mels,
                    outputs_per_step=hp.outputs_per_step,
                    X_band_width=X_band_width,
                    H_band_width=H_band_width,
                    position_encoder=None)
                if is_training:
                    if hp.free_run:
                        r = hp.outputs_per_step
                        init_decoder_input = tf.expand_dims(
                            tf.tile([[0.0]], [batch_size, hp.num_mels]),
                            axis=1)  # [N, 1, hp.num_mels]
                        decoder_input_lengths = tf.cast(
                            output_lengths / r, tf.int32)
                        decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search(
                            init_decoder_input,
                            maximum_iterations=tf.shape(LR_outputs)[1],
                            mode=is_training,
                            memory=LR_outputs,
                            memory_sequence_length=decoder_input_lengths)
                    else:
                        r = hp.outputs_per_step
                        decoder_input = mel_targets[:, r - 1::r, :]  # [N, T_out / r, hp.num_mels]
                        init_decoder_input = tf.expand_dims(
                            tf.tile([[0.0]], [batch_size, hp.num_mels]),
                            axis=1)  # [N, 1, hp.num_mels]
                        decoder_input = tf.concat(
                            [init_decoder_input, decoder_input],
                            axis=1)  # [N, T_out / r + 1, hp.num_mels]
                        decoder_input = decoder_input[:, :-1, :]  # [N, T_out / r, hp.num_mels]
                        decoder_input_lengths = tf.cast(
                            output_lengths / r, tf.int32)
                        decoder_outputs, attention_x, attention_h = Decoder.decode_from_inputs(
                            decoder_input,
                            decoder_input_lengths,
                            mode=is_training,
                            memory=LR_outputs,
                            memory_sequence_length=decoder_input_lengths)
                else:
                    init_decoder_input = tf.expand_dims(
                        tf.tile([[0.0]], [batch_size, hp.num_mels]),
                        axis=1)  # [N, 1, hp.num_mels]
                    decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search(
                        init_decoder_input,
                        maximum_iterations=tf.shape(LR_outputs)[1],
                        mode=is_training,
                        memory=LR_outputs,
                        memory_sequence_length=tf.expand_dims(
                            tf.shape(LR_outputs)[1], axis=0))

            if is_training:
                mel_outputs_ = tf.reshape(decoder_outputs,
                                          [batch_size, -1, hp.num_mels])
            else:
                mel_outputs_ = tf.reshape(
                    decoder_outputs,
                    [batch_size, -1, hp.num_mels])[:, :ori_framenum, :]
            mel_outputs = mel_outputs_

            with tf.variable_scope('Postnet'):
                Postnet_FSMN = FsmnEncoderV2(
                    filter_size=hp.postnet_filter_size,
                    fsmn_num_layers=hp.postnet_fsmn_num_layers,
                    dnn_num_layers=hp.postnet_dnn_num_layers,
                    num_memory_units=hp.postnet_num_memory_units,
                    ffn_inner_dim=hp.postnet_ffn_inner_dim,
                    dropout=hp.postnet_dropout,
                    shift=hp.postnet_shift,
                    position_encoder=None)
                if is_training:
                    postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode(
                        mel_outputs,
                        sequence_length=output_lengths,
                        mode=is_training)
                    hidden_lstm_outputs, _ = tf.nn.dynamic_rnn(
                        LSTMBlockCell(hp.postnet_lstm_units),
                        postnet_fsmn_outputs,
                        sequence_length=output_lengths,
                        dtype=tf.float32)
                else:
                    postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode(
                        mel_outputs,
                        sequence_length=[tf.shape(mel_outputs_)[1]],
                        mode=is_training)
                    hidden_lstm_outputs, _ = tf.nn.dynamic_rnn(
                        LSTMBlockCell(hp.postnet_lstm_units),
                        postnet_fsmn_outputs,
                        sequence_length=[tf.shape(mel_outputs_)[1]],
                        dtype=tf.float32)

                mel_residual_outputs = tf.layers.dense(
                    hidden_lstm_outputs, units=hp.num_mels)
                mel_outputs += mel_residual_outputs

            self.inputs = inputs
            self.inputs_speaker = inputs_speaker
            self.inputs_emotion = inputs_emotion
            self.input_lengths = input_lengths
            self.durations = durations
            self.output_lengths = output_lengths
            self.mel_outputs_ = mel_outputs_
            self.mel_outputs = mel_outputs
            self.mel_targets = mel_targets
            self.duration_outputs = duration_outputs
            self.duration_outputs_ = duration_outputs_
            self.duration_scales = duration_scales
            self.pitch_contour_outputs = pitch_contour_outputs
            self.pitch_contours = pitch_contours
            self.pitch_scales = pitch_scales
            self.energy_contour_outputs = energy_contour_outputs
            self.energy_contours = energy_contours
            self.energy_scales = energy_scales
            self.uv_masks_ = uv_masks

            self.embedded_inputs_emotion = embedded_inputs_emotion
            self.embedding_fsmn_outputs = embedded_inputs
            self.encoder_outputs = encoder_outputs
            self.encoder_outputs_ = encoder_outputs_
            self.LR_outputs = LR_outputs
            self.postnet_fsmn_outputs = postnet_fsmn_outputs

            self.pitch_embeddings = pitch_embeddings
            self.energy_embeddings = energy_embeddings

            self.attns = attns
            self.attention_x = attention_x
            self.attention_h = attention_h
            self.X_band_width = X_band_width
            self.H_band_width = H_band_width

    def add_loss(self):
        '''Adds loss to the model. Sets "loss" field. initialize must have been called.'''
        with tf.variable_scope('loss') as _:
            hp = self._hparams
            mask = tf.sequence_mask(
                self.output_lengths,
                tf.shape(self.mel_targets)[1],
                dtype=tf.float32)
            valid_outputs = tf.reduce_sum(mask)

            mask_input = tf.sequence_mask(
                self.input_lengths,
                tf.shape(self.durations)[1],
                dtype=tf.float32)
            valid_inputs = tf.reduce_sum(mask_input)

            # mel loss
            if self.uv_masks_ is not None:
                valid_outputs_mask = tf.reduce_sum(
                    tf.expand_dims(mask, -1) * self.uv_masks_)
                self.mel_loss_ = tf.reduce_sum(
                    tf.abs(self.mel_targets - self.mel_outputs_)
                    * tf.expand_dims(mask, -1) * self.uv_masks_) / (
                        valid_outputs_mask * hp.num_mels)
                self.mel_loss = tf.reduce_sum(
                    tf.abs(self.mel_targets - self.mel_outputs)
                    * tf.expand_dims(mask, -1) * self.uv_masks_) / (
                        valid_outputs_mask * hp.num_mels)
            else:
                self.mel_loss_ = tf.reduce_sum(
                    tf.abs(self.mel_targets - self.mel_outputs_)
                    * tf.expand_dims(mask, -1)) / (
                        valid_outputs * hp.num_mels)
                self.mel_loss = tf.reduce_sum(
                    tf.abs(self.mel_targets - self.mel_outputs)
                    * tf.expand_dims(mask, -1)) / (
                        valid_outputs * hp.num_mels)

            # duration loss
            self.duration_loss = tf.reduce_sum(
                tf.abs(
                    tf.log(tf.cast(self.durations, tf.float32) + 1)
                    - self.duration_outputs) * mask_input) / valid_inputs

            # pitch contour loss
            self.pitch_contour_loss = tf.reduce_sum(
                tf.abs(self.pitch_contours - self.pitch_contour_outputs)
                * mask_input) / valid_inputs

            # energy contour loss
            self.energy_contour_loss = tf.reduce_sum(
                tf.abs(self.energy_contours - self.energy_contour_outputs)
                * mask_input) / valid_inputs

            # final loss
            self.loss = self.mel_loss_ + self.mel_loss + self.duration_loss \
                + self.pitch_contour_loss + self.energy_contour_loss

            # guided attention loss
            self.guided_attention_loss = tf.constant(0.0)
            if hp.guided_attention:
                i0 = tf.constant(0)
                loss0 = tf.constant(0.0)

                def c(i, _):
                    return tf.less(i, tf.shape(self.mel_targets)[0])

                def loop_body(i, loss):
                    decoder_input_lengths = tf.cast(
                        self.output_lengths / hp.outputs_per_step, tf.int32)
                    input_len = decoder_input_lengths[i]
                    output_len = decoder_input_lengths[i]
                    input_w = tf.expand_dims(
                        tf.range(tf.cast(input_len, dtype=tf.float32)),
                        axis=1) / tf.cast(
                            input_len, dtype=tf.float32)  # [T_in, 1]
                    output_w = tf.expand_dims(
                        tf.range(tf.cast(output_len, dtype=tf.float32)),
                        axis=0) / tf.cast(
                            output_len, dtype=tf.float32)  # [1, T_out]
                    guided_attention_w = 1.0 - tf.exp(
                        -(1 / hp.guided_attention_2g_squared)
                        * tf.square(input_w - output_w))  # [T_in, T_out]
                    guided_attention_w = tf.expand_dims(
                        guided_attention_w, axis=0)  # [1, T_in, T_out]
                    # [hp.decoder_num_heads, T_in, T_out]
                    guided_attention_w = tf.tile(guided_attention_w,
                                                 [hp.decoder_num_heads, 1, 1])
                    loss_i = tf.constant(0.0)
                    for j in range(hp.decoder_num_layers):
                        loss_i += tf.reduce_mean(
                            self.attention_h[j][i, :, :input_len, :output_len]
                            * guided_attention_w)

                    return [tf.add(i, 1), tf.add(loss, loss_i)]

                _, loss = tf.while_loop(
                    c,
                    loop_body,
                    loop_vars=[i0, loss0],
                    parallel_iterations=hp.batch_size)
                self.guided_attention_loss = loss / hp.batch_size
                self.loss += hp.guided_attention_loss_weight * self.guided_attention_loss

    def add_optimizer(self, global_step):
        '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called.

        Args:
          global_step: int32 scalar Tensor representing current global step in training
        '''
        with tf.variable_scope('optimizer') as _:
            hp = self._hparams
            if hp.decay_learning_rate:
                self.learning_rate = _learning_rate_decay(
                    hp.initial_learning_rate, global_step)
            else:
                self.learning_rate = tf.convert_to_tensor(
                    hp.initial_learning_rate)
            optimizer = tf.train.AdamOptimizer(self.learning_rate,
                                               hp.adam_beta1, hp.adam_beta2)
            gradients, variables = zip(*optimizer.compute_gradients(self.loss))
            self.gradients = gradients
            clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)

            # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See:
            # https://github.com/tensorflow/tensorflow/issues/1122
            with tf.control_dependencies(
                    tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                self.optimize = optimizer.apply_gradients(
                    zip(clipped_gradients, variables), global_step=global_step)


def _learning_rate_decay(init_lr, global_step):
    # Noam scheme from tensor2tensor:
    warmup_steps = 4000.0
    step = tf.cast(global_step + 1, dtype=tf.float32)
    return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5,
                                                    step**-0.5)
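`_learning_rate_decay` above is the Noam schedule from tensor2tensor: linear warmup over the first 4000 steps, then inverse-square-root decay, with the two branches meeting exactly at the warmup boundary. A quick sketch of the curve (a check, not part of the file; `init_lr` stands in for `hp.initial_learning_rate`):

```python
def noam_lr(init_lr, step, warmup_steps=4000.0):
    step = float(step + 1)
    return init_lr * warmup_steps**0.5 * min(step * warmup_steps**-1.5,
                                             step**-0.5)

# Rises during warmup, peaks at step 4000, then decays as 1/sqrt(step).
for s in (0, 1000, 4000, 16000, 64000):
    print(s, noam_lr(1e-3, s))
```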
817
modelscope/models/audio/tts/am/models/self_attention_decoder.py
Executable file
@@ -0,0 +1,817 @@
"""Define self-attention decoder."""
|
||||
|
||||
import sys
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from . import compat, transformer
|
||||
from .modules import decoder_prenet
|
||||
from .position import SinusoidalPositionEncoder
|
||||
|
||||
|
||||
class SelfAttentionDecoder():
|
||||
"""Decoder using self-attention as described in
|
||||
https://arxiv.org/abs/1706.03762.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
num_layers,
|
||||
num_units=512,
|
||||
num_heads=8,
|
||||
ffn_inner_dim=2048,
|
||||
dropout=0.1,
|
||||
attention_dropout=0.1,
|
||||
relu_dropout=0.1,
|
||||
prenet_units=256,
|
||||
dense_units=128,
|
||||
num_mels=80,
|
||||
outputs_per_step=3,
|
||||
X_band_width=None,
|
||||
H_band_width=None,
|
||||
position_encoder=SinusoidalPositionEncoder(),
|
||||
self_attention_type='scaled_dot'):
|
||||
"""Initializes the parameters of the decoder.
|
||||
|
||||
Args:
|
||||
num_layers: The number of layers.
|
||||
num_units: The number of hidden units.
|
||||
num_heads: The number of heads in the multi-head attention.
|
||||
ffn_inner_dim: The number of units of the inner linear transformation
|
||||
in the feed forward layer.
|
||||
dropout: The probability to drop units from the outputs.
|
||||
attention_dropout: The probability to drop units from the attention.
|
||||
relu_dropout: The probability to drop units from the ReLU activation in
|
||||
the feed forward layer.
|
||||
position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to
|
||||
apply on inputs or ``None``.
|
||||
self_attention_type: Type of self attention, "scaled_dot" or "average" (case
|
||||
insensitive).
|
||||
|
||||
Raises:
|
||||
ValueError: if :obj:`self_attention_type` is invalid.
|
||||
"""
|
||||
super(SelfAttentionDecoder, self).__init__()
|
||||
self.num_layers = num_layers
|
||||
self.num_units = num_units
|
||||
self.num_heads = num_heads
|
||||
self.ffn_inner_dim = ffn_inner_dim
|
||||
self.dropout = dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.relu_dropout = relu_dropout
|
||||
self.position_encoder = position_encoder
|
||||
self.self_attention_type = self_attention_type.lower()
|
||||
if self.self_attention_type not in ('scaled_dot', 'average'):
|
||||
raise ValueError('invalid attention type %s'
|
||||
% self.self_attention_type)
|
||||
if self.self_attention_type == 'average':
|
||||
tf.logging.warning(
|
||||
'Support for average attention network is experimental '
|
||||
'and may change in future versions.')
|
||||
self.prenet_units = prenet_units
|
||||
self.dense_units = dense_units
|
||||
self.num_mels = num_mels
|
||||
self.outputs_per_step = outputs_per_step
|
||||
self.X_band_width = X_band_width
|
||||
self.H_band_width = H_band_width
|
||||
|
||||
@property
|
||||
def output_size(self):
|
||||
"""Returns the decoder output size."""
|
||||
return self.num_units
|
||||
|
||||
@property
|
||||
def support_alignment_history(self):
|
||||
return True
|
||||
|
||||
@property
|
||||
def support_multi_source(self):
|
||||
return True
|
||||
|
||||
def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1):
|
||||
cache = {}
|
||||
|
||||
for layer in range(self.num_layers):
|
||||
proj_cache_shape = [
|
||||
batch_size, self.num_heads, 0, self.num_units // self.num_heads
|
||||
]
|
||||
layer_cache = {}
|
||||
layer_cache['memory'] = [{
|
||||
'memory_keys':
|
||||
tf.zeros(proj_cache_shape, dtype=dtype),
|
||||
'memory_values':
|
||||
tf.zeros(proj_cache_shape, dtype=dtype)
|
||||
} for _ in range(num_sources)]
|
||||
if self.self_attention_type == 'scaled_dot':
|
||||
layer_cache['self_keys'] = tf.zeros(
|
||||
proj_cache_shape, dtype=dtype)
|
||||
layer_cache['self_values'] = tf.zeros(
|
||||
proj_cache_shape, dtype=dtype)
|
||||
elif self.self_attention_type == 'average':
|
||||
layer_cache['prev_g'] = tf.zeros(
|
||||
[batch_size, 1, self.num_units], dtype=dtype)
|
||||
cache['layer_{}'.format(layer)] = layer_cache
|
||||
|
||||
return cache
|
||||
|
||||
def _init_attn(self, dtype=tf.float32):
|
||||
attn = []
|
||||
for layer in range(self.num_layers):
|
||||
attn.append(tf.TensorArray(tf.float32, size=0, dynamic_size=True))
|
||||
return attn
|
||||
|
||||
def _self_attention_stack(self,
|
||||
inputs,
|
||||
sequence_length=None,
|
||||
mode=True,
|
||||
cache=None,
|
||||
memory=None,
|
||||
memory_sequence_length=None,
|
||||
step=None):
|
||||
|
||||
# [N, T_out, self.dense_units] or [N, 1, self.dense_units]
|
||||
prenet_outputs = decoder_prenet(inputs, self.prenet_units,
|
||||
self.dense_units, mode)
|
||||
if step is None:
|
||||
decoder_inputs = tf.concat(
|
||||
[memory, prenet_outputs],
|
||||
axis=-1) # [N, T_out, memory_size + self.dense_units]
|
||||
else:
|
||||
decoder_inputs = tf.concat(
|
||||
[memory[:, step:step + 1, :], prenet_outputs],
|
||||
axis=-1) # [N, 1, memory_size + self.dense_units]
|
||||
decoder_inputs = tf.layers.dense(
|
||||
decoder_inputs, units=self.dense_units)
|
||||
|
||||
inputs = decoder_inputs
|
||||
inputs *= self.num_units**0.5
|
||||
if self.position_encoder is not None:
|
||||
inputs = self.position_encoder(
|
||||
inputs, position=step + 1 if step is not None else None)
|
||||
|
||||
inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
|
||||
|
||||
decoder_mask = None
|
||||
memory_mask = None
|
||||
# last_attention = None
|
||||
|
||||
X_band_width_tmp = -1
|
||||
H_band_width_tmp = -1
|
||||
if self.X_band_width is not None:
|
||||
X_band_width_tmp = tf.cast(
|
||||
tf.cond(
|
||||
tf.less(tf.shape(memory)[1], self.X_band_width),
|
||||
lambda: -1, lambda: self.X_band_width),
|
||||
dtype=tf.int64)
|
||||
if self.H_band_width is not None:
|
||||
H_band_width_tmp = tf.cast(
|
||||
tf.cond(
|
||||
tf.less(tf.shape(memory)[1], self.H_band_width),
|
||||
lambda: -1, lambda: self.H_band_width),
|
||||
dtype=tf.int64)
|
||||
|
||||
if self.self_attention_type == 'scaled_dot':
|
||||
if sequence_length is not None:
|
||||
decoder_mask = transformer.build_future_mask(
|
||||
sequence_length,
|
||||
num_heads=self.num_heads,
|
||||
maximum_length=tf.shape(inputs)[1],
|
||||
band=X_band_width_tmp) # [N, 1, T_out, T_out]
|
||||
elif self.self_attention_type == 'average':
|
||||
if cache is None:
|
||||
if sequence_length is None:
|
||||
sequence_length = tf.fill([tf.shape(inputs)[0]],
|
||||
tf.shape(inputs)[1])
|
||||
decoder_mask = transformer.cumulative_average_mask(
|
||||
sequence_length,
|
||||
maximum_length=tf.shape(inputs)[1],
|
||||
dtype=inputs.dtype)
|
||||
|
||||
if memory is not None and not tf.contrib.framework.nest.is_sequence(
|
||||
memory):
|
||||
memory = (memory, )
|
||||
if memory_sequence_length is not None:
|
||||
if not tf.contrib.framework.nest.is_sequence(
|
||||
memory_sequence_length):
|
||||
memory_sequence_length = (memory_sequence_length, )
|
||||
if step is None:
|
||||
memory_mask = [
|
||||
transformer.build_history_mask(
|
||||
length,
|
||||
num_heads=self.num_heads,
|
||||
maximum_length=tf.shape(m)[1],
|
||||
band=H_band_width_tmp)
|
||||
for m, length in zip(memory, memory_sequence_length)
|
||||
]
|
||||
else:
|
||||
memory_mask = [
|
||||
transformer.build_history_mask(
|
||||
length,
|
||||
num_heads=self.num_heads,
|
||||
maximum_length=tf.shape(m)[1],
|
||||
band=H_band_width_tmp)[:, :, step:step + 1, :]
|
||||
for m, length in zip(memory, memory_sequence_length)
|
||||
]
|
||||
|
||||
# last_attention = None
|
||||
attns_x = []
|
||||
attns_h = []
|
||||
for layer in range(self.num_layers):
|
||||
layer_name = 'layer_{}'.format(layer)
|
||||
layer_cache = cache[layer_name] if cache is not None else None
|
||||
with tf.variable_scope(layer_name):
|
||||
if memory is not None:
|
||||
for i, (mem, mask) in enumerate(zip(memory, memory_mask)):
|
||||
memory_cache = None
|
||||
if layer_cache is not None:
|
||||
memory_cache = layer_cache['memory'][i]
|
||||
scope_name = 'multi_head_{}'.format(i)
|
||||
if i == 0:
|
||||
scope_name = 'multi_head'
|
||||
with tf.variable_scope(scope_name):
|
||||
encoded, attn_x, attn_h = transformer.multi_head_attention_PNCA(
|
||||
self.num_heads,
|
||||
transformer.norm(inputs),
|
||||
mem,
|
||||
mode,
|
||||
num_units=self.num_units,
|
||||
mask=decoder_mask,
|
||||
mask_h=mask,
|
||||
cache=layer_cache,
|
||||
cache_h=memory_cache,
|
||||
dropout=self.attention_dropout,
|
||||
return_attention=True,
|
||||
layer_name=layer_name,
|
||||
X_band_width=self.X_band_width)
|
||||
attns_x.append(attn_x)
|
||||
attns_h.append(attn_h)
|
||||
context = transformer.drop_and_add(
|
||||
inputs, encoded, mode, dropout=self.dropout)
|
||||
|
||||
with tf.variable_scope('ffn'):
|
||||
transformed = transformer.feed_forward_ori(
|
||||
transformer.norm(context),
|
||||
self.ffn_inner_dim,
|
||||
mode,
|
||||
dropout=self.relu_dropout)
|
||||
transformed = transformer.drop_and_add(
|
||||
context, transformed, mode, dropout=self.dropout)
|
||||
|
||||
inputs = transformed
|
||||
|
||||
outputs = transformer.norm(inputs)
|
||||
outputs = tf.layers.dense(
|
||||
outputs, units=self.num_mels * self.outputs_per_step)
|
||||
return outputs, attns_x, attns_h
|
||||
|
||||
def decode_from_inputs(self,
|
||||
inputs,
|
||||
sequence_length,
|
||||
initial_state=None,
|
||||
mode=True,
|
||||
memory=None,
|
||||
memory_sequence_length=None):
|
||||
outputs, attention_x, attention_h = self._self_attention_stack(
|
||||
inputs,
|
||||
sequence_length=sequence_length,
|
||||
mode=mode,
|
||||
memory=memory,
|
||||
memory_sequence_length=memory_sequence_length)
|
||||
return outputs, attention_x, attention_h
|
||||
|
||||
def step_fn(self,
|
||||
mode,
|
||||
batch_size,
|
||||
initial_state=None,
|
||||
memory=None,
|
||||
memory_sequence_length=None,
|
||||
dtype=tf.float32):
|
||||
if memory is None:
|
||||
num_sources = 0
|
||||
elif tf.contrib.framework.nest.is_sequence(memory):
|
||||
num_sources = len(memory)
|
||||
else:
|
||||
num_sources = 1
|
||||
cache = self._init_cache(
|
||||
batch_size, dtype=dtype, num_sources=num_sources)
|
||||
attention_x = self._init_attn(dtype=dtype)
|
||||
attention_h = self._init_attn(dtype=dtype)
|
||||
|
||||
def _fn(step, inputs, cache):
|
||||
outputs, attention_x, attention_h = self._self_attention_stack(
|
||||
inputs,
|
||||
mode=mode,
|
||||
cache=cache,
|
||||
memory=memory,
|
||||
memory_sequence_length=memory_sequence_length,
|
||||
step=step)
|
||||
attention_x_tmp = []
|
||||
for layer in range(len(attention_h)):
|
||||
attention_x_tmp_l = tf.zeros_like(attention_h[layer])
|
||||
if self.X_band_width is not None:
|
||||
pred = tf.less(step, self.X_band_width + 1)
|
||||
attention_x_tmp_l_1 = tf.cond(pred, # yapf:disable
|
||||
lambda: attention_x_tmp_l[:, :, :, :step + 1] + attention_x[layer],
|
||||
lambda: tf.concat([
|
||||
attention_x_tmp_l[:, :, :,
|
||||
:step - self.X_band_width],
|
||||
attention_x_tmp_l[:, :, :,
|
||||
step - self.X_band_width:step + 1]
|
||||
+ attention_x[layer]],
|
||||
axis=-1)) # yapf:disable
|
||||
attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:]
|
||||
attention_x_tmp.append(
|
||||
tf.concat([attention_x_tmp_l_1, attention_x_tmp_l_2],
|
||||
axis=-1))
|
||||
else:
|
||||
attention_x_tmp_l_1 = attention_x_tmp_l[:, :, :, :step + 1]
|
||||
attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:]
|
||||
attention_x_tmp.append(
|
||||
tf.concat([
|
||||
attention_x_tmp_l_1 + attention_x[layer],
|
||||
attention_x_tmp_l_2
|
||||
], axis=-1)) # yapf:disable
|
||||
attention_x = attention_x_tmp
|
||||
return outputs, cache, attention_x, attention_h
|
||||
|
||||
return _fn, cache, attention_x, attention_h
|
||||
|
||||
def dynamic_decode_and_search(self, init_decoder_input, maximum_iterations,
|
||||
mode, memory, memory_sequence_length):
|
||||
batch_size = tf.shape(init_decoder_input)[0]
|
||||
step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn(
|
||||
mode,
|
||||
batch_size,
|
||||
memory=memory,
|
||||
memory_sequence_length=memory_sequence_length)
|
||||
|
||||
outputs, attention_x, attention_h, cache = self.dynamic_decode(
|
||||
step_fn,
|
||||
init_decoder_input,
|
||||
init_cache=init_cache,
|
||||
init_attn_x=init_attn_x,
|
||||
init_attn_h=init_attn_h,
|
||||
maximum_iterations=maximum_iterations,
|
||||
batch_size=batch_size)
|
||||
return outputs, attention_x, attention_h
|
||||
|
||||
def dynamic_decode_and_search_teacher_forcing(self, decoder_input,
|
||||
maximum_iterations, mode,
|
||||
memory,
|
||||
memory_sequence_length):
|
||||
batch_size = tf.shape(decoder_input)[0]
|
||||
step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn(
|
||||
mode,
|
||||
batch_size,
|
||||
memory=memory,
|
||||
memory_sequence_length=memory_sequence_length)
|
||||
|
||||
outputs, attention_x, attention_h, cache = self.dynamic_decode_teacher_forcing(
|
||||
step_fn,
|
||||
decoder_input,
|
||||
init_cache=init_cache,
|
||||
init_attn_x=init_attn_x,
|
||||
init_attn_h=init_attn_h,
|
||||
maximum_iterations=maximum_iterations,
|
||||
batch_size=batch_size)
|
||||
return outputs, attention_x, attention_h
|
||||
|
||||
def dynamic_decode(self,
|
||||
step_fn,
|
||||
init_decoder_input,
|
||||
init_cache=None,
|
||||
init_attn_x=None,
|
||||
init_attn_h=None,
|
||||
maximum_iterations=None,
|
||||
batch_size=None):
|
||||
|
||||
def _cond(step, cache, inputs, outputs, attention_x, attention_h): # pylint: disable=unused-argument
|
||||
return tf.less(step, maximum_iterations)
|
||||
|
||||
def _body(step, cache, inputs, outputs, attention_x, attention_h):
|
||||
# output: [1, 1, num_mels * r]
|
||||
# attn: [1, 1, T_out]
|
||||
output, cache, attn_x, attn_h = step_fn(
|
||||
step, inputs, cache) # outputs, cache, attention, attns
|
||||
for layer in range(len(attention_x)):
|
||||
attention_x[layer] = attention_x[layer].write(
|
||||
step, tf.cast(attn_x[layer], tf.float32))
|
||||
|
||||
for layer in range(len(attention_h)):
|
||||
attention_h[layer] = attention_h[layer].write(
|
||||
step, tf.cast(attn_h[layer], tf.float32))
|
||||
|
||||
outputs = outputs.write(step, tf.cast(output, tf.float32))
|
||||
return step + 1, cache, output[:, :, -self.
|
||||
num_mels:], outputs, attention_x, attention_h
|
||||
|
||||
step = tf.constant(0, dtype=tf.int32)
|
||||
outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
|
||||
|
||||
_, cache, _, outputs, attention_x, attention_h = tf.while_loop(
|
||||
_cond,
|
||||
_body,
|
||||
loop_vars=(step, init_cache, init_decoder_input, outputs,
|
||||
init_attn_x, init_attn_h),
|
||||
shape_invariants=(step.shape,
|
||||
compat.nest.map_structure(
|
||||
self._get_shape_invariants, init_cache),
|
||||
compat.nest.map_structure(
|
||||
self._get_shape_invariants,
|
||||
init_decoder_input), tf.TensorShape(None),
|
||||
compat.nest.map_structure(
|
||||
self._get_shape_invariants, init_attn_x),
|
||||
compat.nest.map_structure(
|
||||
self._get_shape_invariants, init_attn_h)),
|
||||
parallel_iterations=1,
|
||||
back_prop=False,
|
||||
maximum_iterations=maximum_iterations)
|
||||
# element of outputs: [N, 1, num_mels * r]
|
||||
outputs_stack = outputs.stack() # [T_out, N, 1, num_mels * r]
|
||||
outputs_stack = tf.transpose(
|
||||
outputs_stack, perm=[2, 1, 0, 3]) # [1, N, T_out, num_mels * r]
|
||||
outputs_stack = tf.squeeze(
|
||||
outputs_stack, axis=0) # [N, T_out, num_mels * r]
|
||||
|
||||
attention_x_stack = []
|
||||
for layer in range(len(attention_x)):
|
||||
attention_x_stack_tmp = attention_x[layer].stack(
|
||||
) # [T_out, N, H, 1, T_out]
|
||||
attention_x_stack_tmp = tf.transpose(
|
||||
attention_x_stack_tmp, perm=[3, 1, 2, 0,
|
||||
4]) # [1, N, H, T_out, T_out]
|
||||
attention_x_stack_tmp = tf.squeeze(
|
||||
attention_x_stack_tmp, axis=0) # [N, H, T_out, T_out]
|
||||
attention_x_stack.append(attention_x_stack_tmp)
|
||||
|
||||
attention_h_stack = []
|
||||
for layer in range(len(attention_h)):
|
||||
attention_h_stack_tmp = attention_h[layer].stack(
|
||||
) # [T_out, N, H, 1, T_out]
|
||||
attention_h_stack_tmp = tf.transpose(
|
||||
attention_h_stack_tmp, perm=[3, 1, 2, 0,
|
||||
4]) # [1, N, H, T_out, T_out]
|
||||
attention_h_stack_tmp = tf.squeeze(
|
||||
attention_h_stack_tmp, axis=0) # [N, H, T_out, T_out]
|
||||
attention_h_stack.append(attention_h_stack_tmp)
|
||||
|
||||
return outputs_stack, attention_x_stack, attention_h_stack, cache
|
||||
|
||||
def dynamic_decode_teacher_forcing(self,
|
||||
step_fn,
|
||||
decoder_input,
|
||||
init_cache=None,
|
||||
init_attn_x=None,
|
||||
init_attn_h=None,
|
||||
maximum_iterations=None,
|
||||
batch_size=None):
|
||||
|
||||
def _cond(step, cache, inputs, outputs, attention_x, attention_h): # pylint: disable=unused-argument
|
||||
return tf.less(step, maximum_iterations)
|
||||
|
||||
def _body(step, cache, inputs, outputs, attention_x, attention_h):
|
||||
# output: [1, 1, num_mels * r]
|
||||
# attn: [1, 1, T_out]
|
||||
output, cache, attn_x, attn_h = step_fn(
|
||||
step, inputs[:, step:step + 1, :],
|
||||
cache) # outputs, cache, attention, attns
|
||||
for layer in range(len(attention_x)):
|
||||
attention_x[layer] = attention_x[layer].write(
|
||||
step, tf.cast(attn_x[layer], tf.float32))
|
||||
|
||||
for layer in range(len(attention_h)):
|
||||
attention_h[layer] = attention_h[layer].write(
|
||||
step, tf.cast(attn_h[layer], tf.float32))
|
||||
outputs = outputs.write(step, tf.cast(output, tf.float32))
|
||||
return step + 1, cache, inputs, outputs, attention_x, attention_h
|
||||
|
||||
step = tf.constant(0, dtype=tf.int32)
|
||||
outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
|
||||
|
||||
_, cache, _, outputs, attention_x, attention_h = tf.while_loop(
|
||||
_cond,
|
||||
_body,
|
||||
loop_vars=(step, init_cache, decoder_input, outputs, init_attn_x,
|
||||
init_attn_h),
|
||||
shape_invariants=(step.shape,
|
||||
compat.nest.map_structure(
|
||||
self._get_shape_invariants,
|
||||
init_cache), decoder_input.shape,
|
||||
tf.TensorShape(None),
|
||||
compat.nest.map_structure(
|
||||
self._get_shape_invariants, init_attn_x),
|
||||
compat.nest.map_structure(
|
||||
self._get_shape_invariants, init_attn_h)),
|
||||
parallel_iterations=1,
|
||||
back_prop=False,
|
||||
maximum_iterations=maximum_iterations)
|
||||
# element of outputs: [N, 1, num_mels * r]
|
||||
outputs_stack = outputs.stack() # [T_out, N, 1, num_mels * r]
|
||||
outputs_stack = tf.transpose(
|
||||
outputs_stack, perm=[2, 1, 0, 3]) # [1, N, T_out, num_mels * r]
|
||||
outputs_stack = tf.squeeze(
|
||||
outputs_stack, axis=0) # [N, T_out, num_mels * r]
|
||||
|
||||
attention_x_stack = []
|
||||
for layer in range(len(attention_x)):
|
||||
attention_x_stack_tmp = attention_x[layer].stack(
|
||||
) # [T_out, N, H, 1, T_out]
|
||||
attention_x_stack_tmp = tf.transpose(
|
||||
attention_x_stack_tmp, perm=[3, 1, 2, 0,
|
||||
4]) # [1, N, H, T_out, T_out]
|
||||
attention_x_stack_tmp = tf.squeeze(
|
||||
attention_x_stack_tmp, axis=0) # [N, H, T_out, T_out]
|
||||
attention_x_stack.append(attention_x_stack_tmp)
|
||||
|
||||
attention_h_stack = []
|
||||
for layer in range(len(attention_h)):
|
||||
attention_h_stack_tmp = attention_h[layer].stack(
|
||||
) # [T_out, N, H, 1, T_out]
|
||||
attention_h_stack_tmp = tf.transpose(
|
||||
attention_h_stack_tmp, perm=[3, 1, 2, 0,
|
||||
4]) # [1, N, H, T_out, T_out]
|
||||
attention_h_stack_tmp = tf.squeeze(
|
||||
attention_h_stack_tmp, axis=0) # [N, H, T_out, T_out]
|
||||
attention_h_stack.append(attention_h_stack_tmp)
|
||||
|
||||
return outputs_stack, attention_x_stack, attention_h_stack, cache
|
||||
|
||||
def _get_shape_invariants(self, tensor):
|
||||
"""Returns the shape of the tensor but sets middle dims to None."""
|
||||
if isinstance(tensor, tf.TensorArray):
|
||||
shape = None
|
||||
else:
|
||||
shape = tensor.shape.as_list()
|
||||
for i in range(1, len(shape) - 1):
|
||||
shape[i] = None
|
||||
return tf.TensorShape(shape)
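A minimal standalone sketch of the invariant this helper produces (TF1-style; the tensor and shapes below are illustrative):

```python
import tensorflow as tf

# A cached projection grows along its time axis inside tf.while_loop,
# so every dim except the first and last is relaxed to None.
t = tf.zeros([8, 4, 0, 64])           # e.g. [batch, heads, time, depth]
shape = t.shape.as_list()             # [8, 4, 0, 64]
for i in range(1, len(shape) - 1):
    shape[i] = None
invariant = tf.TensorShape(shape)     # TensorShape([8, None, None, 64])
```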
|
||||
|
||||
|
||||
class SelfAttentionDecoderOri():
|
||||
"""Decoder using self-attention as described in
|
||||
https://arxiv.org/abs/1706.03762.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
num_layers,
|
||||
num_units=512,
|
||||
num_heads=8,
|
||||
ffn_inner_dim=2048,
|
||||
dropout=0.1,
|
||||
attention_dropout=0.1,
|
||||
relu_dropout=0.1,
|
||||
position_encoder=SinusoidalPositionEncoder(),
|
||||
self_attention_type='scaled_dot'):
|
||||
"""Initializes the parameters of the decoder.
|
||||
|
||||
Args:
|
||||
num_layers: The number of layers.
|
||||
num_units: The number of hidden units.
|
||||
num_heads: The number of heads in the multi-head attention.
|
||||
ffn_inner_dim: The number of units of the inner linear transformation
|
||||
in the feed forward layer.
|
||||
dropout: The probability to drop units from the outputs.
|
||||
attention_dropout: The probability to drop units from the attention.
|
||||
relu_dropout: The probability to drop units from the ReLU activation in
|
||||
the feed forward layer.
|
||||
position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to
|
||||
apply on inputs or ``None``.
|
||||
self_attention_type: Type of self attention, "scaled_dot" or "average" (case
|
||||
insensitive).
|
||||
|
||||
Raises:
|
||||
ValueError: if :obj:`self_attention_type` is invalid.
|
||||
"""
|
||||
super(SelfAttentionDecoderOri, self).__init__()
|
||||
self.num_layers = num_layers
|
||||
self.num_units = num_units
|
||||
self.num_heads = num_heads
|
||||
self.ffn_inner_dim = ffn_inner_dim
|
||||
self.dropout = dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.relu_dropout = relu_dropout
|
||||
self.position_encoder = position_encoder
|
||||
self.self_attention_type = self_attention_type.lower()
|
||||
if self.self_attention_type not in ('scaled_dot', 'average'):
|
||||
raise ValueError('invalid attention type %s'
|
||||
% self.self_attention_type)
|
||||
if self.self_attention_type == 'average':
|
||||
tf.logging.warning(
|
||||
'Support for average attention network is experimental '
|
||||
'and may change in future versions.')
|
||||
|
||||
@property
|
||||
def output_size(self):
|
||||
"""Returns the decoder output size."""
|
||||
return self.num_units
|
||||
|
||||
@property
|
||||
def support_alignment_history(self):
|
||||
return True
|
||||
|
||||
@property
|
||||
def support_multi_source(self):
|
||||
return True
|
||||
|
||||
def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1):
|
||||
cache = {}
|
||||
|
||||
for layer in range(self.num_layers):
|
||||
proj_cache_shape = [
|
||||
batch_size, self.num_heads, 0, self.num_units // self.num_heads
|
||||
]
|
||||
layer_cache = {}
|
||||
layer_cache['memory'] = [{
|
||||
'memory_keys':
|
||||
tf.zeros(proj_cache_shape, dtype=dtype),
|
||||
'memory_values':
|
||||
tf.zeros(proj_cache_shape, dtype=dtype)
|
||||
} for _ in range(num_sources)]
|
||||
if self.self_attention_type == 'scaled_dot':
|
||||
layer_cache['self_keys'] = tf.zeros(
|
||||
proj_cache_shape, dtype=dtype)
|
||||
layer_cache['self_values'] = tf.zeros(
|
||||
proj_cache_shape, dtype=dtype)
|
||||
elif self.self_attention_type == 'average':
|
||||
layer_cache['prev_g'] = tf.zeros(
|
||||
[batch_size, 1, self.num_units], dtype=dtype)
|
||||
cache['layer_{}'.format(layer)] = layer_cache
|
||||
|
||||
return cache
|
||||
|
||||
def _self_attention_stack(self,
|
||||
inputs,
|
||||
sequence_length=None,
|
||||
mode=True,
|
||||
cache=None,
|
||||
memory=None,
|
||||
memory_sequence_length=None,
|
||||
step=None):
|
||||
inputs *= self.num_units**0.5
|
||||
if self.position_encoder is not None:
|
||||
inputs = self.position_encoder(
|
||||
inputs, position=step + 1 if step is not None else None)
|
||||
|
||||
inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
|
||||
|
||||
decoder_mask = None
|
||||
memory_mask = None
|
||||
last_attention = None
|
||||
|
||||
if self.self_attention_type == 'scaled_dot':
|
||||
if sequence_length is not None:
|
||||
decoder_mask = transformer.build_future_mask(
|
||||
sequence_length,
|
||||
num_heads=self.num_heads,
|
||||
maximum_length=tf.shape(inputs)[1])
|
||||
elif self.self_attention_type == 'average':
|
||||
if cache is None:
|
||||
if sequence_length is None:
|
||||
sequence_length = tf.fill([tf.shape(inputs)[0]],
|
||||
tf.shape(inputs)[1])
|
||||
decoder_mask = transformer.cumulative_average_mask(
|
||||
sequence_length,
|
||||
maximum_length=tf.shape(inputs)[1],
|
||||
dtype=inputs.dtype)
|
||||
|
||||
if memory is not None and not tf.contrib.framework.nest.is_sequence(
|
||||
memory):
|
||||
memory = (memory, )
|
||||
if memory_sequence_length is not None:
|
||||
if not tf.contrib.framework.nest.is_sequence(
|
||||
memory_sequence_length):
|
||||
memory_sequence_length = (memory_sequence_length, )
|
||||
memory_mask = [
|
||||
transformer.build_sequence_mask(
|
||||
length,
|
||||
num_heads=self.num_heads,
|
||||
maximum_length=tf.shape(m)[1])
|
||||
for m, length in zip(memory, memory_sequence_length)
|
||||
]
|
||||
|
||||
for layer in range(self.num_layers):
|
||||
layer_name = 'layer_{}'.format(layer)
|
||||
layer_cache = cache[layer_name] if cache is not None else None
|
||||
with tf.variable_scope(layer_name):
|
||||
if self.self_attention_type == 'scaled_dot':
|
||||
with tf.variable_scope('masked_multi_head'):
|
||||
encoded = transformer.multi_head_attention(
|
||||
self.num_heads,
|
||||
transformer.norm(inputs),
|
||||
None,
|
||||
mode,
|
||||
num_units=self.num_units,
|
||||
mask=decoder_mask,
|
||||
cache=layer_cache,
|
||||
dropout=self.attention_dropout)
|
||||
last_context = transformer.drop_and_add(
|
||||
inputs, encoded, mode, dropout=self.dropout)
|
||||
elif self.self_attention_type == 'average':
|
||||
with tf.variable_scope('average_attention'):
|
||||
# Cumulative average.
|
||||
x = transformer.norm(inputs)
|
||||
y = transformer.cumulative_average(
|
||||
x,
|
||||
decoder_mask if cache is None else step,
|
||||
cache=layer_cache)
|
||||
# FFN.
|
||||
y = transformer.feed_forward(
|
||||
y,
|
||||
self.ffn_inner_dim,
|
||||
mode,
|
||||
dropout=self.relu_dropout)
|
||||
# Gating layer.
|
||||
z = tf.layers.dense(
|
||||
tf.concat([x, y], -1), self.num_units * 2)
|
||||
i, f = tf.split(z, 2, axis=-1)
|
||||
y = tf.sigmoid(i) * x + tf.sigmoid(f) * y
|
||||
last_context = transformer.drop_and_add(
|
||||
inputs, y, mode, dropout=self.dropout)
|
||||
|
||||
if memory is not None:
|
||||
for i, (mem, mask) in enumerate(zip(memory, memory_mask)):
|
||||
memory_cache = layer_cache['memory'][i] if layer_cache is not None else None # yapf:disable
|
||||
with tf.variable_scope('multi_head' if i
|
||||
== 0 else 'multi_head_%d' % i): # yapf:disable
|
||||
context, last_attention = transformer.multi_head_attention(
|
||||
self.num_heads,
|
||||
transformer.norm(last_context),
|
||||
mem,
|
||||
mode,
|
||||
mask=mask,
|
||||
cache=memory_cache,
|
||||
dropout=self.attention_dropout,
|
||||
return_attention=True)
|
||||
last_context = transformer.drop_and_add(
|
||||
last_context,
|
||||
context,
|
||||
mode,
|
||||
dropout=self.dropout)
|
||||
if i > 0: # Do not return attention in case of multi source.
|
||||
last_attention = None
|
||||
|
||||
with tf.variable_scope('ffn'):
|
||||
transformed = transformer.feed_forward_ori(
|
||||
transformer.norm(last_context),
|
||||
self.ffn_inner_dim,
|
||||
mode,
|
||||
dropout=self.relu_dropout)
|
||||
transformed = transformer.drop_and_add(
|
||||
last_context, transformed, mode, dropout=self.dropout)
|
||||
|
||||
inputs = transformed
|
||||
|
||||
if last_attention is not None:
|
||||
# The first head of the last layer is returned.
|
||||
first_head_attention = last_attention[:, 0]
|
||||
else:
|
||||
first_head_attention = None
|
||||
|
||||
outputs = transformer.norm(inputs)
|
||||
return outputs, first_head_attention
|
||||
|
||||
def decode_from_inputs(self,
|
||||
inputs,
|
||||
sequence_length,
|
||||
initial_state=None,
|
||||
mode=True,
|
||||
memory=None,
|
||||
memory_sequence_length=None):
|
||||
outputs, attention = self._self_attention_stack(
|
||||
inputs,
|
||||
sequence_length=sequence_length,
|
||||
mode=mode,
|
||||
memory=memory,
|
||||
memory_sequence_length=memory_sequence_length)
|
||||
return outputs, None, attention
|
||||
|
||||
def step_fn(self,
|
||||
mode,
|
||||
batch_size,
|
||||
initial_state=None,
|
||||
memory=None,
|
||||
memory_sequence_length=None,
|
||||
dtype=tf.float32):
|
||||
if memory is None:
|
||||
num_sources = 0
|
||||
elif tf.contrib.framework.nest.is_sequence(memory):
|
||||
num_sources = len(memory)
|
||||
else:
|
||||
num_sources = 1
|
||||
cache = self._init_cache(
|
||||
batch_size, dtype=dtype, num_sources=num_sources)
|
||||
|
||||
def _fn(step, inputs, cache, mode):
|
||||
inputs = tf.expand_dims(inputs, 1)
|
||||
outputs, attention = self._self_attention_stack(
|
||||
inputs,
|
||||
mode=mode,
|
||||
cache=cache,
|
||||
memory=memory,
|
||||
memory_sequence_length=memory_sequence_length,
|
||||
step=step)
|
||||
outputs = tf.squeeze(outputs, axis=1)
|
||||
if attention is not None:
|
||||
attention = tf.squeeze(attention, axis=1)
|
||||
return outputs, cache, attention
|
||||
|
||||
return _fn, cache
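The returned `_fn` is meant to be driven one frame at a time with the cache threaded through; a hedged sketch of the calling pattern (every name here is illustrative, not part of the diff):

```python
# Assumed setup: `decoder` is a SelfAttentionDecoderOri and
# `encoder_outputs` / `lengths` come from a matching encoder.
fn, cache = decoder.step_fn(
    mode=False,                       # inference: disables dropout
    batch_size=1,
    memory=encoder_outputs,           # [1, T_in, num_units]
    memory_sequence_length=lengths)   # [1]
outputs, cache, attention = fn(0, first_frame, cache, False)
# outputs: [1, num_units]; attention: first head of the last layer.
```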
|
||||
182
modelscope/models/audio/tts/am/models/self_attention_encoder.py
Executable file
@@ -0,0 +1,182 @@
|
||||
"""Define the self-attention encoder."""
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from . import transformer
|
||||
from .position import SinusoidalPositionEncoder
|
||||
|
||||
|
||||
class SelfAttentionEncoder():
|
||||
"""Encoder using self-attention as described in
|
||||
https://arxiv.org/abs/1706.03762.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
num_layers,
|
||||
num_units=512,
|
||||
num_heads=8,
|
||||
ffn_inner_dim=2048,
|
||||
dropout=0.1,
|
||||
attention_dropout=0.1,
|
||||
relu_dropout=0.1,
|
||||
position_encoder=SinusoidalPositionEncoder()):
|
||||
"""Initializes the parameters of the encoder.
|
||||
|
||||
Args:
|
||||
num_layers: The number of layers.
|
||||
num_units: The number of hidden units.
|
||||
num_heads: The number of heads in the multi-head attention.
|
||||
ffn_inner_dim: The number of units of the inner linear transformation
|
||||
in the feed forward layer.
|
||||
dropout: The probability to drop units from the outputs.
|
||||
attention_dropout: The probability to drop units from the attention.
|
||||
relu_dropout: The probability to drop units from the ReLU activation in
|
||||
the feed forward layer.
|
||||
position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
|
||||
apply on inputs or ``None``.
|
||||
"""
|
||||
super(SelfAttentionEncoder, self).__init__()
|
||||
self.num_layers = num_layers
|
||||
self.num_units = num_units
|
||||
self.num_heads = num_heads
|
||||
self.ffn_inner_dim = ffn_inner_dim
|
||||
self.dropout = dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.relu_dropout = relu_dropout
|
||||
self.position_encoder = position_encoder
|
||||
|
||||
def encode(self, inputs, sequence_length=None, mode=True):
|
||||
inputs *= self.num_units**0.5
|
||||
if self.position_encoder is not None:
|
||||
inputs = self.position_encoder(inputs)
|
||||
|
||||
inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
|
||||
mask = transformer.build_sequence_mask(
|
||||
sequence_length,
|
||||
num_heads=self.num_heads,
|
||||
maximum_length=tf.shape(inputs)[1])
|
||||
|
||||
mask_FF = tf.squeeze(
|
||||
transformer.build_sequence_mask(
|
||||
sequence_length, maximum_length=tf.shape(inputs)[1]),
|
||||
axis=1)
|
||||
|
||||
state = ()
|
||||
|
||||
attns = []
|
||||
for layer in range(self.num_layers):
|
||||
with tf.variable_scope('layer_{}'.format(layer)):
|
||||
with tf.variable_scope('multi_head'):
|
||||
context, attn = transformer.multi_head_attention(
|
||||
self.num_heads,
|
||||
transformer.norm(inputs),
|
||||
None,
|
||||
mode,
|
||||
num_units=self.num_units,
|
||||
mask=mask,
|
||||
dropout=self.attention_dropout,
|
||||
return_attention=True)
|
||||
attns.append(attn)
|
||||
context = transformer.drop_and_add(
|
||||
inputs, context, mode, dropout=self.dropout)
|
||||
|
||||
with tf.variable_scope('ffn'):
|
||||
transformed = transformer.feed_forward(
|
||||
transformer.norm(context),
|
||||
self.ffn_inner_dim,
|
||||
mode,
|
||||
dropout=self.relu_dropout,
|
||||
mask=mask_FF)
|
||||
transformed = transformer.drop_and_add(
|
||||
context, transformed, mode, dropout=self.dropout)
|
||||
|
||||
inputs = transformed
|
||||
state += (tf.reduce_mean(inputs, axis=1), )
|
||||
|
||||
outputs = transformer.norm(inputs)
|
||||
return (outputs, state, sequence_length, attns)
|
||||
|
||||
|
||||
class SelfAttentionEncoderOri():
|
||||
"""Encoder using self-attention as described in
|
||||
https://arxiv.org/abs/1706.03762.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
num_layers,
|
||||
num_units=512,
|
||||
num_heads=8,
|
||||
ffn_inner_dim=2048,
|
||||
dropout=0.1,
|
||||
attention_dropout=0.1,
|
||||
relu_dropout=0.1,
|
||||
position_encoder=SinusoidalPositionEncoder()):
|
||||
"""Initializes the parameters of the encoder.
|
||||
|
||||
Args:
|
||||
num_layers: The number of layers.
|
||||
num_units: The number of hidden units.
|
||||
num_heads: The number of heads in the multi-head attention.
|
||||
ffn_inner_dim: The number of units of the inner linear transformation
|
||||
in the feed forward layer.
|
||||
dropout: The probability to drop units from the outputs.
|
||||
attention_dropout: The probability to drop units from the attention.
|
||||
relu_dropout: The probability to drop units from the ReLU activation in
|
||||
the feed forward layer.
|
||||
position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
|
||||
apply on inputs or ``None``.
|
||||
"""
|
||||
super(SelfAttentionEncoderOri, self).__init__()
|
||||
self.num_layers = num_layers
|
||||
self.num_units = num_units
|
||||
self.num_heads = num_heads
|
||||
self.ffn_inner_dim = ffn_inner_dim
|
||||
self.dropout = dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.relu_dropout = relu_dropout
|
||||
self.position_encoder = position_encoder
|
||||
|
||||
def encode(self, inputs, sequence_length=None, mode=True):
|
||||
inputs *= self.num_units**0.5
|
||||
if self.position_encoder is not None:
|
||||
inputs = self.position_encoder(inputs)
|
||||
|
||||
inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
|
||||
mask = transformer.build_sequence_mask(
|
||||
sequence_length,
|
||||
num_heads=self.num_heads,
|
||||
maximum_length=tf.shape(inputs)[1]) # [N, 1, 1, T_out]
|
||||
|
||||
state = ()
|
||||
|
||||
attns = []
|
||||
for layer in range(self.num_layers):
|
||||
with tf.variable_scope('layer_{}'.format(layer)):
|
||||
with tf.variable_scope('multi_head'):
|
||||
context, attn = transformer.multi_head_attention(
|
||||
self.num_heads,
|
||||
transformer.norm(inputs),
|
||||
None,
|
||||
mode,
|
||||
num_units=self.num_units,
|
||||
mask=mask,
|
||||
dropout=self.attention_dropout,
|
||||
return_attention=True)
|
||||
attns.append(attn)
|
||||
context = transformer.drop_and_add(
|
||||
inputs, context, mode, dropout=self.dropout)
|
||||
|
||||
with tf.variable_scope('ffn'):
|
||||
transformed = transformer.feed_forward_ori(
|
||||
transformer.norm(context),
|
||||
self.ffn_inner_dim,
|
||||
mode,
|
||||
dropout=self.relu_dropout)
|
||||
transformed = transformer.drop_and_add(
|
||||
context, transformed, mode, dropout=self.dropout)
|
||||
|
||||
inputs = transformed
|
||||
state += (tf.reduce_mean(inputs, axis=1), )
|
||||
|
||||
outputs = transformer.norm(inputs)
|
||||
return (outputs, state, sequence_length, attns)
|
||||
1157
modelscope/models/audio/tts/am/models/transformer.py
Executable file
File diff suppressed because it is too large
255
modelscope/models/audio/tts/am/sambert_hifi_16k.py
Normal file
@@ -0,0 +1,255 @@
|
||||
import io
|
||||
import os
|
||||
from typing import Any, Dict, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from sklearn.preprocessing import MultiLabelBinarizer
|
||||
|
||||
from modelscope.models.base import Model
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.utils.constant import ModelFile, Tasks
|
||||
from .models import create_model
|
||||
from .text.symbols import load_symbols
|
||||
from .text.symbols_dict import SymbolsDict
|
||||
|
||||
__all__ = ['SambertNetHifi16k']
|
||||
|
||||
|
||||
def multi_label_symbol_to_sequence(my_classes, my_symbol):
|
||||
one_hot = MultiLabelBinarizer(my_classes)
|
||||
tokens = my_symbol.strip().split(' ')
|
||||
sequences = []
|
||||
for token in tokens:
|
||||
sequences.append(tuple(token.split('&')))
|
||||
# sequences.append(tuple(['~'])) # sequence length minus 1 to ignore EOS ~
|
||||
return one_hot.fit_transform(sequences)
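For instance, with `&`-joined labels the helper yields one multi-hot row per token (column order follows `my_classes`):

```python
classes = ['emo_happy', 'emo_neutral', 'emo_sad']
multi_label_symbol_to_sequence(classes, 'emo_neutral emo_happy&emo_sad')
# -> array([[0, 1, 0],
#           [1, 0, 1]])
```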
|
||||
|
||||
|
||||
@MODELS.register_module(Tasks.text_to_speech, module_name=r'sambert_hifi_16k')
|
||||
class SambertNetHifi16k(Model):
|
||||
|
||||
def __init__(self,
|
||||
model_dir,
|
||||
pitch_control_str='',
|
||||
duration_control_str='',
|
||||
energy_control_str='',
|
||||
*args,
|
||||
**kwargs):
|
||||
tf.reset_default_graph()
|
||||
local_ckpt_path = os.path.join(ModelFile.TF_CHECKPOINT_FOLDER, 'ckpt')
|
||||
self._ckpt_path = os.path.join(model_dir, local_ckpt_path)
|
||||
self._dict_path = os.path.join(model_dir, 'dicts')
|
||||
self._hparams = tf.contrib.training.HParams(**kwargs)
|
||||
values = self._hparams.values()
|
||||
hp = [' {}:{}'.format(name, values[name]) for name in sorted(values)]
|
||||
print('Hyperparameters:\n' + '\n'.join(hp))
|
||||
super().__init__(self._ckpt_path, *args, **kwargs)
|
||||
model_name = 'robutrans'
|
||||
self._lfeat_type_list = self._hparams.lfeat_type_list.strip().split(
|
||||
',')
|
||||
sy, tone, syllable_flag, word_segment, emo_category, speaker = load_symbols(
|
||||
self._dict_path)
|
||||
self._sy = sy
|
||||
self._tone = tone
|
||||
self._syllable_flag = syllable_flag
|
||||
self._word_segment = word_segment
|
||||
self._emo_category = emo_category
|
||||
self._speaker = speaker
|
||||
self._inputs_dim = dict()
|
||||
for lfeat_type in self._lfeat_type_list:
|
||||
if lfeat_type == 'sy':
|
||||
self._inputs_dim[lfeat_type] = len(sy)
|
||||
elif lfeat_type == 'tone':
|
||||
self._inputs_dim[lfeat_type] = len(tone)
|
||||
elif lfeat_type == 'syllable_flag':
|
||||
self._inputs_dim[lfeat_type] = len(syllable_flag)
|
||||
elif lfeat_type == 'word_segment':
|
||||
self._inputs_dim[lfeat_type] = len(word_segment)
|
||||
elif lfeat_type == 'emo_category':
|
||||
self._inputs_dim[lfeat_type] = len(emo_category)
|
||||
elif lfeat_type == 'speaker':
|
||||
self._inputs_dim[lfeat_type] = len(speaker)
|
||||
|
||||
self._symbols_dict = SymbolsDict(sy, tone, syllable_flag, word_segment,
|
||||
emo_category, speaker,
|
||||
self._inputs_dim,
|
||||
self._lfeat_type_list)
|
||||
dim_inputs = sum(self._inputs_dim.values(
|
||||
)) - self._inputs_dim['speaker'] - self._inputs_dim['emo_category']
|
||||
inputs = tf.placeholder(tf.float32, [1, None, dim_inputs], 'inputs')
|
||||
inputs_emotion = tf.placeholder(
|
||||
tf.float32, [1, None, self._inputs_dim['emo_category']],
|
||||
'inputs_emotion')
|
||||
inputs_speaker = tf.placeholder(tf.float32,
|
||||
[1, None, self._inputs_dim['speaker']],
|
||||
'inputs_speaker')
|
||||
|
||||
input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')
|
||||
pitch_contours_scale = tf.placeholder(tf.float32, [1, None],
|
||||
'pitch_contours_scale')
|
||||
energy_contours_scale = tf.placeholder(tf.float32, [1, None],
|
||||
'energy_contours_scale')
|
||||
duration_scale = tf.placeholder(tf.float32, [1, None],
|
||||
'duration_scale')
|
||||
|
||||
with tf.variable_scope('model') as _:
|
||||
self._model = create_model(model_name, self._hparams)
|
||||
self._model.initialize(
|
||||
inputs,
|
||||
inputs_emotion,
|
||||
inputs_speaker,
|
||||
input_lengths,
|
||||
duration_scales=duration_scale,
|
||||
pitch_scales=pitch_contours_scale,
|
||||
energy_scales=energy_contours_scale)
|
||||
self._mel_spec = self._model.mel_outputs[0]
|
||||
self._duration_outputs = self._model.duration_outputs[0]
|
||||
self._duration_outputs_ = self._model.duration_outputs_[0]
|
||||
self._pitch_contour_outputs = self._model.pitch_contour_outputs[0]
|
||||
self._energy_contour_outputs = self._model.energy_contour_outputs[
|
||||
0]
|
||||
self._embedded_inputs_emotion = self._model.embedded_inputs_emotion[
|
||||
0]
|
||||
self._embedding_fsmn_outputs = self._model.embedding_fsmn_outputs[
|
||||
0]
|
||||
self._encoder_outputs = self._model.encoder_outputs[0]
|
||||
self._pitch_embeddings = self._model.pitch_embeddings[0]
|
||||
self._energy_embeddings = self._model.energy_embeddings[0]
|
||||
self._LR_outputs = self._model.LR_outputs[0]
|
||||
self._postnet_fsmn_outputs = self._model.postnet_fsmn_outputs[0]
|
||||
self._attention_h = self._model.attention_h
|
||||
self._attention_x = self._model.attention_x
|
||||
|
||||
print('Loading checkpoint: %s' % self._ckpt_path)
|
||||
config = tf.ConfigProto()
|
||||
config.gpu_options.allow_growth = True
|
||||
self._session = tf.Session(config=config)
|
||||
self._session.run(tf.global_variables_initializer())
|
||||
|
||||
saver = tf.train.Saver()
|
||||
saver.restore(self._session, self._ckpt_path)
|
||||
|
||||
duration_cfg_lst = []
|
||||
if len(duration_control_str) != 0:
|
||||
for item in duration_control_str.strip().split('|'):
|
||||
percent, scale = item.lstrip('(').rstrip(')').split(',')
|
||||
duration_cfg_lst.append((float(percent), float(scale)))
|
||||
|
||||
self._duration_cfg_lst = duration_cfg_lst
|
||||
|
||||
pitch_contours_cfg_lst = []
|
||||
if len(pitch_control_str) != 0:
|
||||
for item in pitch_control_str.strip().split('|'):
|
||||
percent, scale = item.lstrip('(').rstrip(')').split(',')
|
||||
pitch_contours_cfg_lst.append(
|
||||
(float(percent), float(scale)))
|
||||
|
||||
self._pitch_contours_cfg_lst = pitch_contours_cfg_lst
|
||||
|
||||
energy_contours_cfg_lst = []
|
||||
if len(energy_control_str) != 0:
|
||||
for item in energy_control_str.strip().split('|'):
|
||||
percent, scale = item.lstrip('(').rstrip(')').split(',')
|
||||
energy_contours_cfg_lst.append(
|
||||
(float(percent), float(scale)))
|
||||
|
||||
self._energy_contours_cfg_lst = energy_contours_cfg_lst
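All three control strings share a `(percent,scale)|(percent,scale)` syntax; a minimal parsing sketch under that assumption:

```python
s = '(0.3,1.2)|(0.7,0.9)'   # first 30% of symbols scaled by 1.2, the rest by 0.9
cfg = [tuple(map(float, item.lstrip('(').rstrip(')').split(',')))
       for item in s.strip().split('|')]
# cfg -> [(0.3, 1.2), (0.7, 0.9)]
```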
|
||||
|
||||
def forward(self, text):
|
||||
cleaner_names = [x.strip() for x in self._hparams.cleaners.split(',')]
|
||||
|
||||
lfeat_symbol = text.strip().split(' ')
|
||||
lfeat_symbol_separate = [''] * len(self._lfeat_type_list)
|
||||
for this_lfeat_symbol in lfeat_symbol:
|
||||
this_lfeat_symbol = this_lfeat_symbol.strip('{').strip('}').split(
|
||||
'$')
|
||||
if len(this_lfeat_symbol) != len(self._lfeat_type_list):
|
||||
raise Exception(
|
||||
'Length of this_lfeat_symbol in training data'
|
||||
+ ' is not equal to the length of lfeat_type_list, '
|
||||
+ str(len(this_lfeat_symbol)) + ' VS. '
|
||||
+ str(len(self._lfeat_type_list)))
|
||||
index = 0
|
||||
while index < len(lfeat_symbol_separate):
|
||||
lfeat_symbol_separate[index] = lfeat_symbol_separate[
|
||||
index] + this_lfeat_symbol[index] + ' '
|
||||
index = index + 1
|
||||
|
||||
index = 0
|
||||
lfeat_type = self._lfeat_type_list[index]
|
||||
sequence = self._symbols_dict.symbol_to_sequence(
|
||||
lfeat_symbol_separate[index].strip(), lfeat_type, cleaner_names)
|
||||
sequence_array = np.asarray(
|
||||
sequence[:-1],
|
||||
dtype=np.int32) # sequence length minus 1 to ignore EOS ~
|
||||
inputs = np.eye(
|
||||
self._inputs_dim[lfeat_type], dtype=np.float32)[sequence_array]
|
||||
index = index + 1
|
||||
while index < len(self._lfeat_type_list) - 2:
|
||||
lfeat_type = self._lfeat_type_list[index]
|
||||
sequence = self._symbols_dict.symbol_to_sequence(
|
||||
lfeat_symbol_separate[index].strip(), lfeat_type,
|
||||
cleaner_names)
|
||||
sequence_array = np.asarray(
|
||||
sequence[:-1],
|
||||
dtype=np.int32) # sequence length minus 1 to ignore EOS ~
|
||||
inputs_temp = np.eye(
|
||||
self._inputs_dim[lfeat_type], dtype=np.float32)[sequence_array]
|
||||
inputs = np.concatenate((inputs, inputs_temp), axis=1)
|
||||
index = index + 1
|
||||
seq = inputs
|
||||
|
||||
lfeat_type = 'emo_category'
|
||||
inputs_emotion = multi_label_symbol_to_sequence(
|
||||
self._emo_category, lfeat_symbol_separate[index].strip())
|
||||
# inputs_emotion = inputs_emotion * 1.5
|
||||
index = index + 1
|
||||
|
||||
lfeat_type = 'speaker'
|
||||
inputs_speaker = multi_label_symbol_to_sequence(
|
||||
self._speaker, lfeat_symbol_separate[index].strip())
|
||||
|
||||
duration_scale = np.ones((len(seq), ), dtype=np.float32)
|
||||
start_idx = 0
|
||||
for (percent, scale) in self._duration_cfg_lst:
|
||||
duration_scale[start_idx:start_idx
|
||||
+ int(percent * len(seq))] = scale
|
||||
start_idx += int(percent * len(seq))
|
||||
|
||||
pitch_contours_scale = np.ones((len(seq), ), dtype=np.float32)
|
||||
start_idx = 0
|
||||
for (percent, scale) in self._pitch_contours_cfg_lst:
|
||||
pitch_contours_scale[start_idx:start_idx
|
||||
+ int(percent * len(seq))] = scale
|
||||
start_idx += int(percent * len(seq))
|
||||
|
||||
energy_contours_scale = np.ones((len(seq), ), dtype=np.float32)
|
||||
start_idx = 0
|
||||
for (percent, scale) in self._energy_contours_cfg_lst:
|
||||
energy_contours_scale[start_idx:start_idx
|
||||
+ int(percent * len(seq))] = scale
|
||||
start_idx += int(percent * len(seq))
|
||||
|
||||
feed_dict = {
|
||||
self._model.inputs: [np.asarray(seq, dtype=np.float32)],
|
||||
self._model.inputs_emotion:
|
||||
[np.asarray(inputs_emotion, dtype=np.float32)],
|
||||
self._model.inputs_speaker:
|
||||
[np.asarray(inputs_speaker, dtype=np.float32)],
|
||||
self._model.input_lengths:
|
||||
np.asarray([len(seq)], dtype=np.int32),
|
||||
self._model.duration_scales: [duration_scale],
|
||||
self._model.pitch_scales: [pitch_contours_scale],
|
||||
self._model.energy_scales: [energy_contours_scale]
|
||||
}
|
||||
|
||||
result = self._session.run([
|
||||
self._mel_spec, self._duration_outputs, self._duration_outputs_,
|
||||
self._pitch_contour_outputs, self._embedded_inputs_emotion,
|
||||
self._embedding_fsmn_outputs, self._encoder_outputs,
|
||||
self._pitch_embeddings, self._LR_outputs,
|
||||
self._postnet_fsmn_outputs, self._energy_contour_outputs,
|
||||
self._energy_embeddings, self._attention_x, self._attention_h
|
||||
], feed_dict=feed_dict) # yapf:disable
|
||||
return result[0]
|
||||
0
modelscope/models/audio/tts/am/text/__init__.py
Executable file
89
modelscope/models/audio/tts/am/text/cleaners.py
Executable file
@@ -0,0 +1,89 @@
|
||||
'''
|
||||
Cleaners are transformations that run over the input text at both training and eval time.
|
||||
|
||||
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
|
||||
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
|
||||
1. "english_cleaners" for English text
|
||||
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
|
||||
the Unidecode library (https://pypi.python.org/pypi/Unidecode)
|
||||
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
|
||||
the symbols in symbols.py to match your data).
|
||||
'''
|
||||
|
||||
import re
|
||||
|
||||
from unidecode import unidecode
|
||||
|
||||
from .numbers import normalize_numbers
|
||||
|
||||
# Regular expression matching whitespace:
|
||||
_whitespace_re = re.compile(r'\s+')
|
||||
|
||||
# List of (regular expression, replacement) pairs for abbreviations:
|
||||
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
|
||||
for x in [
|
||||
('mrs', 'misess'),
|
||||
('mr', 'mister'),
|
||||
('dr', 'doctor'),
|
||||
('st', 'saint'),
|
||||
('co', 'company'),
|
||||
('jr', 'junior'),
|
||||
('maj', 'major'),
|
||||
('gen', 'general'),
|
||||
('drs', 'doctors'),
|
||||
('rev', 'reverend'),
|
||||
('lt', 'lieutenant'),
|
||||
('hon', 'honorable'),
|
||||
('sgt', 'sergeant'),
|
||||
('capt', 'captain'),
|
||||
('esq', 'esquire'),
|
||||
('ltd', 'limited'),
|
||||
('col', 'colonel'),
|
||||
('ft', 'fort'), ]] # yapf:disable
|
||||
|
||||
|
||||
def expand_abbreviations(text):
|
||||
for regex, replacement in _abbreviations:
|
||||
text = re.sub(regex, replacement, text)
|
||||
return text
|
||||
|
||||
|
||||
def expand_numbers(text):
|
||||
return normalize_numbers(text)
|
||||
|
||||
|
||||
def lowercase(text):
|
||||
return text.lower()
|
||||
|
||||
|
||||
def collapse_whitespace(text):
|
||||
return re.sub(_whitespace_re, ' ', text)
|
||||
|
||||
|
||||
def convert_to_ascii(text):
|
||||
return unidecode(text)
|
||||
|
||||
|
||||
def basic_cleaners(text):
|
||||
'''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
|
||||
text = lowercase(text)
|
||||
text = collapse_whitespace(text)
|
||||
return text
|
||||
|
||||
|
||||
def transliteration_cleaners(text):
|
||||
'''Pipeline for non-English text that transliterates to ASCII.'''
|
||||
text = convert_to_ascii(text)
|
||||
text = lowercase(text)
|
||||
text = collapse_whitespace(text)
|
||||
return text
|
||||
|
||||
|
||||
def english_cleaners(text):
|
||||
'''Pipeline for English text, including number and abbreviation expansion.'''
|
||||
text = convert_to_ascii(text)
|
||||
text = lowercase(text)
|
||||
text = expand_numbers(text)
|
||||
text = expand_abbreviations(text)
|
||||
text = collapse_whitespace(text)
|
||||
return text
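A quick trace of the full English pipeline (requires `unidecode` and `inflect`):

```python
english_cleaners('Mr. Jones bought 16 apples.')
# -> 'mister jones bought sixteen apples.'
```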
|
||||
64
modelscope/models/audio/tts/am/text/cmudict.py
Executable file
@@ -0,0 +1,64 @@
|
||||
import re
|
||||
|
||||
valid_symbols = [
|
||||
'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
|
||||
'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0',
|
||||
'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0',
|
||||
'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0',
|
||||
'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG',
|
||||
'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH',
|
||||
'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W',
|
||||
'Y', 'Z', 'ZH'
|
||||
]
|
||||
|
||||
_valid_symbol_set = set(valid_symbols)
|
||||
|
||||
|
||||
class CMUDict:
|
||||
'''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''
|
||||
|
||||
def __init__(self, file_or_path, keep_ambiguous=True):
|
||||
if isinstance(file_or_path, str):
|
||||
with open(file_or_path, encoding='latin-1') as f:
|
||||
entries = _parse_cmudict(f)
|
||||
else:
|
||||
entries = _parse_cmudict(file_or_path)
|
||||
if not keep_ambiguous:
|
||||
entries = {
|
||||
word: pron
|
||||
for word, pron in entries.items() if len(pron) == 1
|
||||
}
|
||||
self._entries = entries
|
||||
|
||||
def __len__(self):
|
||||
return len(self._entries)
|
||||
|
||||
def lookup(self, word):
|
||||
'''Returns list of ARPAbet pronunciations of the given word.'''
|
||||
return self._entries.get(word.upper())
|
||||
|
||||
|
||||
_alt_re = re.compile(r'\([0-9]+\)')
|
||||
|
||||
|
||||
def _parse_cmudict(file):
|
||||
cmudict = {}
|
||||
for line in file:
|
||||
if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
|
||||
parts = line.split('  ')  # word and pronunciation are separated by two spaces
|
||||
word = re.sub(_alt_re, '', parts[0])
|
||||
pronunciation = _get_pronunciation(parts[1])
|
||||
if pronunciation:
|
||||
if word in cmudict:
|
||||
cmudict[word].append(pronunciation)
|
||||
else:
|
||||
cmudict[word] = [pronunciation]
|
||||
return cmudict
|
||||
|
||||
|
||||
def _get_pronunciation(s):
|
||||
parts = s.strip().split(' ')
|
||||
for part in parts:
|
||||
if part not in _valid_symbol_set:
|
||||
return None
|
||||
return ' '.join(parts)
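Hypothetical usage, assuming a local CMUDict data file in the format this parser expects (the path is illustrative):

```python
cmu = CMUDict('/path/to/cmudict-0.7b')
cmu.lookup('street')   # -> ['S T R IY1 T']
cmu.lookup('zzzz')     # -> None for unknown words
```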
|
||||
70
modelscope/models/audio/tts/am/text/numbers.py
Executable file
@@ -0,0 +1,70 @@
|
||||
import re
|
||||
|
||||
import inflect
|
||||
|
||||
_inflect = inflect.engine()
|
||||
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
|
||||
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
|
||||
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
|
||||
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
|
||||
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
|
||||
_number_re = re.compile(r'[0-9]+')
|
||||
|
||||
|
||||
def _remove_commas(m):
|
||||
return m.group(1).replace(',', '')
|
||||
|
||||
|
||||
def _expand_decimal_point(m):
|
||||
return m.group(1).replace('.', ' point ')
|
||||
|
||||
|
||||
def _expand_dollars(m):
|
||||
match = m.group(1)
|
||||
parts = match.split('.')
|
||||
if len(parts) > 2:
|
||||
return match + ' dollars' # Unexpected format
|
||||
dollars = int(parts[0]) if parts[0] else 0
|
||||
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
|
||||
if dollars and cents:
|
||||
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
|
||||
cent_unit = 'cent' if cents == 1 else 'cents'
|
||||
return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
|
||||
elif dollars:
|
||||
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
|
||||
return '%s %s' % (dollars, dollar_unit)
|
||||
elif cents:
|
||||
cent_unit = 'cent' if cents == 1 else 'cents'
|
||||
return '%s %s' % (cents, cent_unit)
|
||||
else:
|
||||
return 'zero dollars'
|
||||
|
||||
|
||||
def _expand_ordinal(m):
|
||||
return _inflect.number_to_words(m.group(0))
|
||||
|
||||
|
||||
def _expand_number(m):
|
||||
num = int(m.group(0))
|
||||
if num > 1000 and num < 3000:
|
||||
if num == 2000:
|
||||
return 'two thousand'
|
||||
elif num > 2000 and num < 2010:
|
||||
return 'two thousand ' + _inflect.number_to_words(num % 100)
|
||||
elif num % 100 == 0:
|
||||
return _inflect.number_to_words(num // 100) + ' hundred'
|
||||
else:
|
||||
return _inflect.number_to_words(
|
||||
num, andword='', zero='oh', group=2).replace(', ', ' ')
|
||||
else:
|
||||
return _inflect.number_to_words(num, andword='')
|
||||
|
||||
|
||||
def normalize_numbers(text):
|
||||
text = re.sub(_comma_number_re, _remove_commas, text)
|
||||
text = re.sub(_pounds_re, r'\1 pounds', text)
|
||||
text = re.sub(_dollars_re, _expand_dollars, text)
|
||||
text = re.sub(_decimal_number_re, _expand_decimal_point, text)
|
||||
text = re.sub(_ordinal_re, _expand_ordinal, text)
|
||||
text = re.sub(_number_re, _expand_number, text)
|
||||
return text
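Tracing the substitution order above (commas, pounds, dollars, decimals, ordinals, plain numbers):

```python
normalize_numbers('I owe $3.50 for 2,000 items.')
# -> 'I owe three dollars, fifty cents for two thousand items.'
```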
|
||||
95
modelscope/models/audio/tts/am/text/symbols.py
Normal file
@@ -0,0 +1,95 @@
|
||||
'''
|
||||
Defines the set of symbols used in text input to the model.
|
||||
|
||||
The default is a set of ASCII characters that works well for English or text that has been run
|
||||
through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
|
||||
'''
|
||||
import codecs
|
||||
import os
|
||||
|
||||
_pad = '_'
|
||||
_eos = '~'
|
||||
_mask = '@[MASK]'
|
||||
|
||||
|
||||
def load_symbols(dict_path):
|
||||
_characters = ''
|
||||
_ch_symbols = []
|
||||
sy_dict_name = 'sy_dict.txt'
|
||||
sy_dict_path = os.path.join(dict_path, sy_dict_name)
|
||||
f = codecs.open(sy_dict_path, 'r')
|
||||
for line in f:
|
||||
line = line.strip('\r\n')
|
||||
_ch_symbols.append(line)
|
||||
|
||||
_arpabet = ['@' + s for s in _ch_symbols]
|
||||
|
||||
# Export all symbols:
|
||||
sy = list(_characters) + _arpabet + [_pad, _eos, _mask]
|
||||
|
||||
_characters = ''
|
||||
|
||||
_ch_tones = []
|
||||
tone_dict_name = 'tone_dict.txt'
|
||||
tone_dict_path = os.path.join(dict_path, tone_dict_name)
|
||||
f = codecs.open(tone_dict_path, 'r')
|
||||
for line in f:
|
||||
line = line.strip('\r\n')
|
||||
_ch_tones.append(line)
|
||||
|
||||
# Export all tones:
|
||||
tone = list(_characters) + _ch_tones + [_pad, _eos, _mask]
|
||||
|
||||
_characters = ''
|
||||
|
||||
_ch_syllable_flags = []
|
||||
syllable_flag_name = 'syllable_flag_dict.txt'
|
||||
syllable_flag_path = os.path.join(dict_path, syllable_flag_name)
|
||||
f = codecs.open(syllable_flag_path, 'r')
|
||||
for line in f:
|
||||
line = line.strip('\r\n')
|
||||
_ch_syllable_flags.append(line)
|
||||
|
||||
# Export all syllable_flags:
|
||||
syllable_flag = list(_characters) + _ch_syllable_flags + [
|
||||
_pad, _eos, _mask
|
||||
]
|
||||
|
||||
_characters = ''
|
||||
|
||||
_ch_word_segments = []
|
||||
word_segment_name = 'word_segment_dict.txt'
|
||||
word_segment_path = os.path.join(dict_path, word_segment_name)
|
||||
f = codecs.open(word_segment_path, 'r')
|
||||
for line in f:
|
||||
line = line.strip('\r\n')
|
||||
_ch_word_segments.append(line)
|
||||
|
||||
# Export all word segments:
|
||||
word_segment = list(_characters) + _ch_word_segments + [_pad, _eos, _mask]
|
||||
|
||||
_characters = ''
|
||||
|
||||
_ch_emo_types = []
|
||||
emo_category_name = 'emo_category_dict.txt'
|
||||
emo_category_path = os.path.join(dict_path, emo_category_name)
|
||||
f = codecs.open(emo_category_path, 'r')
|
||||
for line in f:
|
||||
line = line.strip('\r\n')
|
||||
_ch_emo_types.append(line)
|
||||
|
||||
emo_category = list(_characters) + _ch_emo_types + [_pad, _eos, _mask]
|
||||
|
||||
_characters = ''
|
||||
|
||||
_ch_speakers = []
|
||||
speaker_name = 'speaker_dict.txt'
|
||||
speaker_path = os.path.join(dict_path, speaker_name)
|
||||
f = codecs.open(speaker_path, 'r')
|
||||
for line in f:
|
||||
line = line.strip('\r\n')
|
||||
_ch_speakers.append(line)
|
||||
|
||||
# Export all speakers:
|
||||
speaker = list(_characters) + _ch_speakers + [_pad, _eos, _mask]
|
||||
return sy, tone, syllable_flag, word_segment, emo_category, speaker
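Hypothetical usage; `dict_path` must hold the six dictionaries read above (`sy_dict.txt` through `speaker_dict.txt`), one symbol per line:

```python
sy, tone, syllable_flag, word_segment, emo_category, speaker = load_symbols('dicts')
# Each list ends with the shared specials ['_', '~', '@[MASK]'];
# entries of `sy` are additionally prefixed with '@'.
```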
|
||||
200
modelscope/models/audio/tts/am/text/symbols_dict.py
Normal file
@@ -0,0 +1,200 @@
|
||||
import re
|
||||
import sys
|
||||
|
||||
from .cleaners import (basic_cleaners, english_cleaners,
|
||||
transliteration_cleaners)
|
||||
|
||||
|
||||
class SymbolsDict:
|
||||
|
||||
def __init__(self, sy, tone, syllable_flag, word_segment, emo_category,
|
||||
speaker, inputs_dim, lfeat_type_list):
|
||||
self._inputs_dim = inputs_dim
|
||||
self._lfeat_type_list = lfeat_type_list
|
||||
self._sy_to_id = {s: i for i, s in enumerate(sy)}
|
||||
self._id_to_sy = {i: s for i, s in enumerate(sy)}
|
||||
self._tone_to_id = {s: i for i, s in enumerate(tone)}
|
||||
self._id_to_tone = {i: s for i, s in enumerate(tone)}
|
||||
self._syllable_flag_to_id = {s: i for i, s in enumerate(syllable_flag)}
|
||||
self._id_to_syllable_flag = {i: s for i, s in enumerate(syllable_flag)}
|
||||
self._word_segment_to_id = {s: i for i, s in enumerate(word_segment)}
|
||||
self._id_to_word_segment = {i: s for i, s in enumerate(word_segment)}
|
||||
self._emo_category_to_id = {s: i for i, s in enumerate(emo_category)}
|
||||
self._id_to_emo_category = {i: s for i, s in enumerate(emo_category)}
|
||||
self._speaker_to_id = {s: i for i, s in enumerate(speaker)}
|
||||
self._id_to_speaker = {i: s for i, s in enumerate(speaker)}
|
||||
print('_sy_to_id: ')
|
||||
print(self._sy_to_id)
|
||||
print('_tone_to_id: ')
|
||||
print(self._tone_to_id)
|
||||
print('_syllable_flag_to_id: ')
|
||||
print(self._syllable_flag_to_id)
|
||||
print('_word_segment_to_id: ')
|
||||
print(self._word_segment_to_id)
|
||||
print('_emo_category_to_id: ')
|
||||
print(self._emo_category_to_id)
|
||||
print('_speaker_to_id: ')
|
||||
print(self._speaker_to_id)
|
||||
self._curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
|
||||
self._cleaners = {
|
||||
basic_cleaners.__name__: basic_cleaners,
|
||||
transliteration_cleaners.__name__: transliteration_cleaners,
|
||||
english_cleaners.__name__: english_cleaners
|
||||
}
|
||||
|
||||
def _clean_text(self, text, cleaner_names):
|
||||
for name in cleaner_names:
|
||||
cleaner = self._cleaners.get(name)
|
||||
if not cleaner:
|
||||
raise Exception('Unknown cleaner: %s' % name)
|
||||
text = cleaner(text)
|
||||
return text
|
||||
|
||||
def _sy_to_sequence(self, sy):
|
||||
return [self._sy_to_id[s] for s in sy if self._should_keep_sy(s)]
|
||||
|
||||
def _arpabet_to_sequence(self, text):
|
||||
return self._sy_to_sequence(['@' + s for s in text.split()])
|
||||
|
||||
def _should_keep_sy(self, s):
|
||||
return s in self._sy_to_id and s != '_' and s != '~'
|
||||
|
||||
def symbol_to_sequence(self, this_lfeat_symbol, lfeat_type, cleaner_names):
|
||||
sequence = []
|
||||
if lfeat_type == 'sy':
|
||||
this_lfeat_symbol = this_lfeat_symbol.strip().split(' ')
|
||||
this_lfeat_symbol_format = ''
|
||||
index = 0
|
||||
while index < len(this_lfeat_symbol):
|
||||
this_lfeat_symbol_format = this_lfeat_symbol_format + '{' + this_lfeat_symbol[
|
||||
index] + '}' + ' '
|
||||
index = index + 1
|
||||
sequence = self.text_to_sequence(this_lfeat_symbol_format,
|
||||
cleaner_names)
|
||||
elif lfeat_type == 'tone':
|
||||
sequence = self.tone_to_sequence(this_lfeat_symbol)
|
||||
elif lfeat_type == 'syllable_flag':
|
||||
sequence = self.syllable_flag_to_sequence(this_lfeat_symbol)
|
||||
elif lfeat_type == 'word_segment':
|
||||
sequence = self.word_segment_to_sequence(this_lfeat_symbol)
|
||||
elif lfeat_type == 'emo_category':
|
||||
sequence = self.emo_category_to_sequence(this_lfeat_symbol)
|
||||
elif lfeat_type == 'speaker':
|
||||
sequence = self.speaker_to_sequence(this_lfeat_symbol)
|
||||
else:
|
||||
raise Exception('Unknown lfeat type: %s' % lfeat_type)
|
||||
|
||||
return sequence
|
||||
|
||||
def text_to_sequence(self, text, cleaner_names):
|
||||
'''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
|
||||
|
||||
The text can optionally have ARPAbet sequences enclosed in curly braces embedded
|
||||
in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
|
||||
|
||||
Args:
|
||||
text: string to convert to a sequence
|
||||
cleaner_names: names of the cleaner functions to run the text through
|
||||
|
||||
Returns:
|
||||
List of integers corresponding to the symbols in the text
|
||||
'''
|
||||
sequence = []
|
||||
|
||||
# Check for curly braces and treat their contents as ARPAbet:
|
||||
while len(text):
|
||||
m = self._curly_re.match(text)
|
||||
if not m:
|
||||
sequence += self._sy_to_sequence(
|
||||
self._clean_text(text, cleaner_names))
|
||||
break
|
||||
sequence += self._sy_to_sequence(
|
||||
self._clean_text(m.group(1), cleaner_names))
|
||||
sequence += self._arpabet_to_sequence(m.group(2))
|
||||
text = m.group(3)
|
||||
|
||||
# Append EOS token
|
||||
sequence.append(self._sy_to_id['~'])
|
||||
return sequence
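The curly-brace handling relies on `_curly_re` peeling off one `{...}` span at a time; in isolation:

```python
import re

curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
m = curly_re.match('Turn left on {HH AW1 S S T AH0 N} Street.')
m.groups()
# -> ('Turn left on ', 'HH AW1 S S T AH0 N', ' Street.')
```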
|
||||
|
||||
def tone_to_sequence(self, tone):
|
||||
tones = tone.strip().split(' ')
|
||||
sequence = []
|
||||
for this_tone in tones:
|
||||
sequence.append(self._tone_to_id[this_tone])
|
||||
sequence.append(self._tone_to_id['~'])
|
||||
return sequence
|
||||
|
||||
def syllable_flag_to_sequence(self, syllable_flag):
|
||||
syllable_flags = syllable_flag.strip().split(' ')
|
||||
sequence = []
|
||||
for this_syllable_flag in syllable_flags:
|
||||
sequence.append(self._syllable_flag_to_id[this_syllable_flag])
|
||||
sequence.append(self._syllable_flag_to_id['~'])
|
||||
return sequence
|
||||
|
||||
def word_segment_to_sequence(self, word_segment):
|
||||
word_segments = word_segment.strip().split(' ')
|
||||
sequence = []
|
||||
for this_word_segment in word_segments:
|
||||
sequence.append(self._word_segment_to_id[this_word_segment])
|
||||
sequence.append(self._word_segment_to_id['~'])
|
||||
return sequence
|
||||
|
||||
def emo_category_to_sequence(self, emo_type):
|
||||
emo_categories = emo_type.strip().split(' ')
|
||||
sequence = []
|
||||
for this_category in emo_categories:
|
||||
sequence.append(self._emo_category_to_id[this_category])
|
||||
sequence.append(self._emo_category_to_id['~'])
|
||||
return sequence
|
||||
|
||||
def speaker_to_sequence(self, speaker):
|
||||
speakers = speaker.strip().split(' ')
|
||||
sequence = []
|
||||
for this_speaker in speakers:
|
||||
sequence.append(self._speaker_to_id[this_speaker])
|
||||
sequence.append(self._speaker_to_id['~'])
|
||||
return sequence
|
||||
|
||||
def sequence_to_symbol(self, sequence):
|
||||
result = ''
|
||||
pre_lfeat_dim = 0
|
||||
for lfeat_type in self._lfeat_type_list:
|
||||
current_one_hot_sequence = sequence[:, pre_lfeat_dim:pre_lfeat_dim
|
||||
+ self._inputs_dim[lfeat_type]]
|
||||
current_sequence = current_one_hot_sequence.argmax(1)
|
||||
length = current_sequence.shape[0]
|
||||
|
||||
index = 0
|
||||
while index < length:
|
||||
this_sequence = current_sequence[index]
|
||||
s = ''
|
||||
if lfeat_type == 'sy':
|
||||
s = self._id_to_sy[this_sequence]
|
||||
if len(s) > 1 and s[0] == '@':
|
||||
s = s[1:]
|
||||
elif lfeat_type == 'tone':
|
||||
s = self._id_to_tone[this_sequence]
|
||||
elif lfeat_type == 'syllable_flag':
|
||||
s = self._id_to_syllable_flag[this_sequence]
|
||||
elif lfeat_type == 'word_segment':
|
||||
s = self._id_to_word_segment[this_sequence]
|
||||
elif lfeat_type == 'emo_category':
|
||||
s = self._id_to_emo_category[this_sequence]
|
||||
elif lfeat_type == 'speaker':
|
||||
s = self._id_to_speaker[this_sequence]
|
||||
else:
|
||||
raise Exception('Unknown lfeat type: %s' % lfeat_type)
|
||||
|
||||
if index == 0:
|
||||
result = result + lfeat_type + ': '
|
||||
|
||||
result = result + '{' + s + '}'
|
||||
|
||||
if index == length - 1:
|
||||
result = result + '; '
|
||||
|
||||
index = index + 1
|
||||
pre_lfeat_dim = pre_lfeat_dim + self._inputs_dim[lfeat_type]
|
||||
return result
|
||||
1
modelscope/models/audio/tts/frontend/__init__.py
Normal file
@@ -0,0 +1 @@
from .generic_text_to_speech_frontend import * # noqa F403
39
modelscope/models/audio/tts/frontend/generic_text_to_speech_frontend.py
Normal file
@@ -0,0 +1,39 @@
|
||||
import os
|
||||
import zipfile
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from modelscope.models.base import Model
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.utils.audio.tts_exceptions import (
|
||||
TtsFrontendInitializeFailedException,
|
||||
TtsFrontendLanguageTypeInvalidException)
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
__all__ = ['GenericTtsFrontend']
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
Tasks.text_to_speech, module_name=r'generic_tts_frontend')
|
||||
class GenericTtsFrontend(Model):
|
||||
|
||||
def __init__(self, model_dir='.', lang_type='pinyin', *args, **kwargs):
|
||||
super().__init__(model_dir, *args, **kwargs)
|
||||
import ttsfrd
|
||||
|
||||
frontend = ttsfrd.TtsFrontendEngine()
|
||||
zip_file = os.path.join(model_dir, 'resource.zip')
|
||||
self._res_path = os.path.join(model_dir, 'resource')
|
||||
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
|
||||
zip_ref.extractall(model_dir)
|
||||
if not frontend.initialize(self._res_path):
|
||||
raise TtsFrontendInitializeFailedException(
|
||||
'resource invalid: {}'.format(self._res_path))
|
||||
if not frontend.set_lang_type(lang_type):
|
||||
raise TtsFrontendLanguageTypeInvalidException(
|
||||
'language type invalid: {}, valid is pinyin and chenmix'.
|
||||
format(lang_type))
|
||||
self._frontend = frontend
|
||||
|
||||
def forward(self, data: str) -> Dict[str, List]:
|
||||
result = self._frontend.gen_tacotron_symbols(data)
|
||||
return {'texts': [s for s in result.splitlines() if s != '']}
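Hypothetical usage (requires the `ttsfrd` package and a model directory containing `resource.zip`; the path is illustrative):

```python
frontend = GenericTtsFrontend(model_dir='/path/to/tts_frontend', lang_type='pinyin')
out = frontend.forward('你好')
# out['texts'] is the list of non-empty tacotron symbol lines.
```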
|
||||
1
modelscope/models/audio/tts/vocoder/__init__.py
Normal file
@@ -0,0 +1 @@
from .hifigan16k import * # noqa F403
73
modelscope/models/audio/tts/vocoder/hifigan16k.py
Normal file
@@ -0,0 +1,73 @@
|
||||
from __future__ import (absolute_import, division, print_function,
|
||||
unicode_literals)
|
||||
import argparse
|
||||
import glob
|
||||
import os
|
||||
import time
|
||||
|
||||
import json
|
||||
import numpy as np
|
||||
import torch
|
||||
from scipy.io.wavfile import write
|
||||
|
||||
from modelscope.models.base import Model
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.utils.audio.tts_exceptions import \
|
||||
TtsVocoderMelspecShapeMismatchException
|
||||
from modelscope.utils.constant import ModelFile, Tasks
|
||||
from .models import Generator
|
||||
|
||||
__all__ = ['Hifigan16k', 'AttrDict']
|
||||
MAX_WAV_VALUE = 32768.0
|
||||
|
||||
|
||||
def load_checkpoint(filepath, device):
|
||||
assert os.path.isfile(filepath)
|
||||
print("Loading '{}'".format(filepath))
|
||||
checkpoint_dict = torch.load(filepath, map_location=device)
|
||||
print('Complete.')
|
||||
return checkpoint_dict
|
||||
|
||||
|
||||
class AttrDict(dict):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(AttrDict, self).__init__(*args, **kwargs)
|
||||
self.__dict__ = self
|
||||
|
||||
|
||||
@MODELS.register_module(Tasks.text_to_speech, module_name=r'hifigan16k')
|
||||
class Hifigan16k(Model):
|
||||
|
||||
def __init__(self, model_dir, *args, **kwargs):
|
||||
self._ckpt_path = os.path.join(model_dir,
|
||||
ModelFile.TORCH_MODEL_BIN_FILE)
|
||||
self._config = AttrDict(**kwargs)
|
||||
|
||||
super().__init__(self._ckpt_path, *args, **kwargs)
|
||||
if torch.cuda.is_available():
|
||||
torch.manual_seed(self._config.seed)
|
||||
self._device = torch.device('cuda')
|
||||
else:
|
||||
self._device = torch.device('cpu')
|
||||
self._generator = Generator(self._config).to(self._device)
|
||||
state_dict_g = load_checkpoint(self._ckpt_path, self._device)
|
||||
self._generator.load_state_dict(state_dict_g['generator'])
|
||||
self._generator.eval()
|
||||
self._generator.remove_weight_norm()
|
||||
|
||||
def forward(self, melspec):
|
||||
dim0 = list(melspec.shape)[-1]
|
||||
if dim0 != 80:
|
||||
raise TtsVocoderMelspecShapeMismatchException(
|
||||
'input melspec last dim must be 80 but got {}'.format(dim0))
|
||||
with torch.no_grad():
|
||||
x = melspec.T
|
||||
x = torch.FloatTensor(x).to(self._device)
|
||||
if len(x.shape) == 2:
|
||||
x = x.unsqueeze(0)
|
||||
y_g_hat = self._generator(x)
|
||||
audio = y_g_hat.squeeze()
|
||||
audio = audio * MAX_WAV_VALUE
|
||||
audio = audio.cpu().numpy().astype('int16')
|
||||
return audio
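Hypothetical usage; `forward` expects a `[T, 80]` mel spectrogram (frames by mel bins), and the kwargs must carry the HiFi-GAN hyper-parameters (`seed`, `resblock`, the upsample settings, ...):

```python
import numpy as np

vocoder = Hifigan16k('/path/to/model_dir', **hifigan_config)  # config dict assumed
wav = vocoder.forward(np.zeros((100, 80), dtype=np.float32))
# wav: int16 waveform at 16 kHz, scaled by MAX_WAV_VALUE.
```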
|
||||
1
modelscope/models/audio/tts/vocoder/models/__init__.py
Normal file
@@ -0,0 +1 @@
from .models import Generator
516
modelscope/models/audio/tts/vocoder/models/models.py
Executable file
@@ -0,0 +1,516 @@
|
||||
from distutils.version import LooseVersion
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
|
||||
from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
|
||||
|
||||
from .utils import get_padding, init_weights
|
||||
|
||||
is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion('1.7')
|
||||
|
||||
|
||||
def stft(x, fft_size, hop_size, win_length, window):
|
||||
"""Perform STFT and convert to magnitude spectrogram.
|
||||
|
||||
Args:
|
||||
x (Tensor): Input signal tensor (B, T).
|
||||
fft_size (int): FFT size.
|
||||
hop_size (int): Hop size.
|
||||
win_length (int): Window length.
|
||||
window (str): Window function type.
|
||||
|
||||
Returns:
|
||||
Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
|
||||
|
||||
"""
|
||||
if is_pytorch_17plus:
|
||||
x_stft = torch.stft(
|
||||
x, fft_size, hop_size, win_length, window, return_complex=False)
|
||||
else:
|
||||
x_stft = torch.stft(x, fft_size, hop_size, win_length, window)
|
||||
real = x_stft[..., 0]
|
||||
imag = x_stft[..., 1]
|
||||
|
||||
# NOTE(kan-bayashi): clamp is needed to avoid nan or inf
|
||||
return torch.sqrt(torch.clamp(real**2 + imag**2, min=1e-7)).transpose(2, 1)
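Shape check for the helper above (one second of 16 kHz audio, default centered STFT):

```python
import torch

x = torch.randn(2, 16000)              # (B, T)
window = torch.hann_window(1024)
mag = stft(x, fft_size=1024, hop_size=256, win_length=1024, window=window)
mag.shape                              # -> torch.Size([2, 63, 513])
```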
|
||||
|
||||
|
||||
LRELU_SLOPE = 0.1
|
||||
|
||||
|
||||
def get_padding_casual(kernel_size, dilation=1):
|
||||
return int(kernel_size * dilation - dilation)
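Left-only padding keeps the convolution causal: with `pad = (kernel_size - 1) * dilation` applied on the left, `output[t]` never sees `input[t+1:]`. For example:

```python
get_padding_casual(kernel_size=3, dilation=1)   # -> 2
get_padding_casual(kernel_size=3, dilation=5)   # -> 10
```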
|
||||
|
||||
|
||||
class Conv1dCasual(torch.nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=1,
|
||||
padding=0,
|
||||
dilation=1,
|
||||
groups=1,
|
||||
bias=True,
|
||||
padding_mode='zeros'):
|
||||
super(Conv1dCasual, self).__init__()
|
||||
self.pad = padding
|
||||
self.conv1d = weight_norm(
|
||||
Conv1d(
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride,
|
||||
padding=0,
|
||||
dilation=dilation,
|
||||
groups=groups,
|
||||
bias=bias,
|
||||
padding_mode=padding_mode))
|
||||
self.conv1d.apply(init_weights)
|
||||
|
||||
def forward(self, x):  # x: (B, D, T)
|
||||
# F.pad pad widths are described starting from the last dimension and moving forward.
|
||||
x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), 'constant')
|
||||
x = self.conv1d(x)
|
||||
return x
|
||||
|
||||
def remove_weight_norm(self):
|
||||
remove_weight_norm(self.conv1d)
|
||||
|
||||
|
||||
class ConvTranspose1dCausal(torch.nn.Module):
    """CausalConvTranspose1d module with customized initialization."""

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding=0):
        """Initialize CausalConvTranspose1d module."""
        super(ConvTranspose1dCausal, self).__init__()
        self.deconv = weight_norm(
            ConvTranspose1d(in_channels, out_channels, kernel_size, stride))
        self.stride = stride
        self.deconv.apply(init_weights)
        self.pad = kernel_size - stride

    def forward(self, x):
        """Calculate forward propagation.

        Args:
            x (Tensor): Input tensor (B, in_channels, T_in).

        Returns:
            Tensor: Output tensor (B, out_channels, T_out).
        """
        # x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), "constant")
        return self.deconv(x)[:, :, :-self.pad]

    def remove_weight_norm(self):
        remove_weight_norm(self.deconv)


class ResBlock1(torch.nn.Module):

    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock1, self).__init__()
        self.h = h
        self.convs1 = nn.ModuleList([
            Conv1dCasual(
                channels,
                channels,
                kernel_size,
                1,
                dilation=dilation[i],
                padding=get_padding_casual(kernel_size, dilation[i]))
            for i in range(len(dilation))
        ])

        self.convs2 = nn.ModuleList([
            Conv1dCasual(
                channels,
                channels,
                kernel_size,
                1,
                dilation=1,
                padding=get_padding_casual(kernel_size, 1))
            for i in range(len(dilation))
        ])

    def forward(self, x):
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, LRELU_SLOPE)
            xt = c1(xt)
            xt = F.leaky_relu(xt, LRELU_SLOPE)
            xt = c2(xt)
            x = xt + x
        return x

    def remove_weight_norm(self):
        for layer in self.convs1:
            layer.remove_weight_norm()
        for layer in self.convs2:
            layer.remove_weight_norm()


class Generator(torch.nn.Module):

    def __init__(self, h):
        super(Generator, self).__init__()
        self.h = h
        self.num_kernels = len(h.resblock_kernel_sizes)
        self.num_upsamples = len(h.upsample_rates)
        print('num_kernels={}, num_upsamples={}'.format(
            self.num_kernels, self.num_upsamples))
        self.conv_pre = Conv1dCasual(
            80, h.upsample_initial_channel, 7, 1, padding=7 - 1)
        resblock = ResBlock1 if h.resblock == '1' else ResBlock2

        self.ups = nn.ModuleList()
        self.repeat_ups = nn.ModuleList()
        for i, (u, k) in enumerate(
                zip(h.upsample_rates, h.upsample_kernel_sizes)):
            upsample = nn.Sequential(
                nn.Upsample(mode='nearest', scale_factor=u),
                nn.LeakyReLU(LRELU_SLOPE),
                Conv1dCasual(
                    h.upsample_initial_channel // (2**i),
                    h.upsample_initial_channel // (2**(i + 1)),
                    kernel_size=7,
                    stride=1,
                    padding=7 - 1))
            self.repeat_ups.append(upsample)
            self.ups.append(
                ConvTranspose1dCausal(
                    h.upsample_initial_channel // (2**i),
                    h.upsample_initial_channel // (2**(i + 1)),
                    k,
                    u,
                    padding=(k - u) // 2))

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = h.upsample_initial_channel // (2**(i + 1))
            for j, (k, d) in enumerate(
                    zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
                self.resblocks.append(resblock(h, ch, k, d))

        self.conv_post = Conv1dCasual(ch, 1, 7, 1, padding=7 - 1)

    def forward(self, x):
        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            x = torch.sin(x) + x
            # transposed-conv upsampling path
            x1 = F.leaky_relu(x, LRELU_SLOPE)
            x1 = self.ups[i](x1)
            # repeat (nearest-neighbor upsampling) path
            x2 = self.repeat_ups[i](x)
            x = x1 + x2
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)
        return x

    def remove_weight_norm(self):
        print('Removing weight norm...')
        for layer in self.ups:
            layer.remove_weight_norm()
        for layer in self.repeat_ups:
            layer[-1].remove_weight_norm()
        for layer in self.resblocks:
            layer.remove_weight_norm()
        self.conv_pre.remove_weight_norm()
        self.conv_post.remove_weight_norm()

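# Hypothetical construction sketch; the field names follow the HiFi-GAN style
# config object (h.*) read above, but the concrete values here are
# illustrative, not taken from a released model config.
#
#   from types import SimpleNamespace
#   h = SimpleNamespace(
#       resblock='1',
#       resblock_kernel_sizes=[3, 7, 11],
#       resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
#       upsample_rates=[5, 5, 4, 2],
#       upsample_kernel_sizes=[10, 10, 8, 4],
#       upsample_initial_channel=512)
#   g = Generator(h)
#   wav = g(torch.randn(1, 80, 50))           # (1, 1, 50 * 5*5*4*2)
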
class DiscriminatorP(torch.nn.Module):

    def __init__(self,
                 period,
                 kernel_size=5,
                 stride=3,
                 use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
        self.convs = nn.ModuleList([
            norm_f(
                Conv2d(
                    1,
                    32, (kernel_size, 1), (stride, 1),
                    padding=(get_padding(5, 1), 0))),
            norm_f(
                Conv2d(
                    32,
                    128, (kernel_size, 1), (stride, 1),
                    padding=(get_padding(5, 1), 0))),
            norm_f(
                Conv2d(
                    128,
                    512, (kernel_size, 1), (stride, 1),
                    padding=(get_padding(5, 1), 0))),
            norm_f(
                Conv2d(
                    512,
                    1024, (kernel_size, 1), (stride, 1),
                    padding=(get_padding(5, 1), 0))),
            norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
        ])
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        fmap = []

        # 1d to 2d
        b, c, t = x.shape
        if t % self.period != 0:  # pad first
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), 'reflect')
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for layer in self.convs:
            x = layer(x)
            x = F.leaky_relu(x, LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class MultiPeriodDiscriminator(torch.nn.Module):

    def __init__(self):
        super(MultiPeriodDiscriminator, self).__init__()
        self.discriminators = nn.ModuleList([
            DiscriminatorP(2),
            DiscriminatorP(3),
            DiscriminatorP(5),
            DiscriminatorP(7),
            DiscriminatorP(11),
        ])

    def forward(self, y, y_hat):
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for i, d in enumerate(self.discriminators):
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            fmap_rs.append(fmap_r)
            y_d_gs.append(y_d_g)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class DiscriminatorS(torch.nn.Module):

    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
        self.convs = nn.ModuleList([
            norm_f(Conv1d(1, 128, 15, 1, padding=7)),
            norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
            norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
            norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
            norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
            norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
            norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
        ])
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        fmap = []
        for layer in self.convs:
            x = layer(x)
            x = F.leaky_relu(x, LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class MultiScaleDiscriminator(torch.nn.Module):

    def __init__(self):
        super(MultiScaleDiscriminator, self).__init__()
        self.discriminators = nn.ModuleList([
            DiscriminatorS(use_spectral_norm=True),
            DiscriminatorS(),
            DiscriminatorS(),
        ])
        from pytorch_wavelets import DWT1DForward
        self.meanpools = nn.ModuleList(
            [DWT1DForward(wave='db3', J=1),
             DWT1DForward(wave='db3', J=1)])
        self.convs = nn.ModuleList([
            weight_norm(Conv1d(2, 1, 15, 1, padding=7)),
            weight_norm(Conv1d(2, 1, 15, 1, padding=7))
        ])

    def forward(self, y, y_hat):
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for i, d in enumerate(self.discriminators):
            if i != 0:
                yl, yh = self.meanpools[i - 1](y)
                y = torch.cat([yl, yh[0]], dim=1)
                y = self.convs[i - 1](y)
                y = F.leaky_relu(y, LRELU_SLOPE)

                yl_hat, yh_hat = self.meanpools[i - 1](y_hat)
                y_hat = torch.cat([yl_hat, yh_hat[0]], dim=1)
                y_hat = self.convs[i - 1](y_hat)
                y_hat = F.leaky_relu(y_hat, LRELU_SLOPE)

            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            fmap_rs.append(fmap_r)
            y_d_gs.append(y_d_g)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class DiscriminatorSTFT(torch.nn.Module):

    def __init__(self,
                 kernel_size=11,
                 stride=2,
                 use_spectral_norm=False,
                 fft_size=1024,
                 shift_size=120,
                 win_length=600,
                 window='hann_window'):
        super(DiscriminatorSTFT, self).__init__()
        self.fft_size = fft_size
        self.shift_size = shift_size
        self.win_length = win_length
        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
        self.convs = nn.ModuleList([
            norm_f(
                Conv2d(
                    fft_size // 2 + 1,
                    32, (15, 1), (1, 1),
                    padding=(get_padding(15, 1), 0))),
            norm_f(
                Conv2d(
                    32,
                    32, (kernel_size, 1), (stride, 1),
                    padding=(get_padding(9, 1), 0))),
            norm_f(
                Conv2d(
                    32,
                    32, (kernel_size, 1), (stride, 1),
                    padding=(get_padding(9, 1), 0))),
            norm_f(
                Conv2d(
                    32,
                    32, (kernel_size, 1), (stride, 1),
                    padding=(get_padding(9, 1), 0))),
            norm_f(Conv2d(32, 32, (5, 1), (1, 1), padding=(2, 0))),
        ])
        self.conv_post = norm_f(Conv2d(32, 1, (3, 1), (1, 1), padding=(1, 0)))
        self.register_buffer('window', getattr(torch, window)(win_length))

    def forward(self, wav):
        wav = torch.squeeze(wav, 1)
        x_mag = stft(wav, self.fft_size, self.shift_size, self.win_length,
                     self.window)
        x = torch.transpose(x_mag, 2, 1).unsqueeze(-1)
        fmap = []
        for layer in self.convs:
            x = layer(x)
            x = F.leaky_relu(x, LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = x.squeeze(-1)

        return x, fmap


class MultiSTFTDiscriminator(torch.nn.Module):

    def __init__(
            self,
            fft_sizes=[1024, 2048, 512],
            hop_sizes=[120, 240, 50],
            win_lengths=[600, 1200, 240],
            window='hann_window',
    ):
        super(MultiSTFTDiscriminator, self).__init__()
        self.discriminators = nn.ModuleList()
        for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths):
            self.discriminators += [
                DiscriminatorSTFT(fft_size=fs, shift_size=ss, win_length=wl)
            ]

    def forward(self, y, y_hat):
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for i, d in enumerate(self.discriminators):
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            fmap_rs.append(fmap_r)
            y_d_gs.append(y_d_g)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


def feature_loss(fmap_r, fmap_g):
    loss = 0
    for dr, dg in zip(fmap_r, fmap_g):
        for rl, gl in zip(dr, dg):
            loss += torch.mean(torch.abs(rl - gl))

    return loss * 2


def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    loss = 0
    r_losses = []
    g_losses = []
    for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
        r_loss = torch.mean((1 - dr)**2)
        g_loss = torch.mean(dg**2)
        loss += (r_loss + g_loss)
        r_losses.append(r_loss.item())
        g_losses.append(g_loss.item())

    return loss, r_losses, g_losses


def generator_loss(disc_outputs):
    loss = 0
    gen_losses = []
    for dg in disc_outputs:
        temp_loss = torch.mean((1 - dg)**2)
        gen_losses.append(temp_loss)
        loss += temp_loss

    return loss, gen_losses
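# Shape-level sketch of the loss helpers on dummy tensors (illustrative only):
#
#   mpd = MultiPeriodDiscriminator()
#   y = torch.randn(1, 1, 8000)               # real waveform
#   y_hat = torch.randn(1, 1, 8000)           # generated waveform
#   y_d_rs, y_d_gs, fmap_rs, fmap_gs = mpd(y, y_hat)
#   loss_disc, _, _ = discriminator_loss(y_d_rs, y_d_gs)
#   loss_gen, _ = generator_loss(y_d_gs)
#   loss_fm = feature_loss(fmap_rs, fmap_gs)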
59
modelscope/models/audio/tts/vocoder/models/utils.py
Executable file
@@ -0,0 +1,59 @@
import glob
import os

import matplotlib
import matplotlib.pylab as plt
import torch
from torch.nn.utils import weight_norm

matplotlib.use('Agg')


def plot_spectrogram(spectrogram):
    fig, ax = plt.subplots(figsize=(10, 2))
    im = ax.imshow(
        spectrogram, aspect='auto', origin='lower', interpolation='none')
    plt.colorbar(im, ax=ax)

    fig.canvas.draw()
    plt.close()

    return fig


def init_weights(m, mean=0.0, std=0.01):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        m.weight.data.normal_(mean, std)


def apply_weight_norm(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        weight_norm(m)


def get_padding(kernel_size, dilation=1):
    return int((kernel_size * dilation - dilation) / 2)


def load_checkpoint(filepath, device):
    assert os.path.isfile(filepath)
    print("Loading '{}'".format(filepath))
    checkpoint_dict = torch.load(filepath, map_location=device)
    print('Complete.')
    return checkpoint_dict


def save_checkpoint(filepath, obj):
    print('Saving checkpoint to {}'.format(filepath))
    torch.save(obj, filepath)
    print('Complete.')


def scan_checkpoint(cp_dir, prefix):
    pattern = os.path.join(cp_dir, prefix + '????????')
    cp_list = glob.glob(pattern)
    if len(cp_list) == 0:
        return None
    return sorted(cp_list)[-1]
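# Usage sketch (hypothetical directory and prefix): scan_checkpoint() returns
# the lexicographically newest file matching '<prefix>' + 8 characters, or
# None when no checkpoint exists yet.
#
#   latest = scan_checkpoint('/your/ckpt_dir', 'g_')
#   if latest is not None:
#       state = load_checkpoint(latest, device='cpu')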
@@ -62,4 +62,6 @@ class Model(ABC):
        if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'):
            model_cfg.type = model_cfg.model_type
        model_cfg.model_dir = local_model_dir
        for k, v in kwargs.items():
            # use setattr so each kwarg lands on its own attribute;
            # `model_cfg.k = v` would repeatedly overwrite a literal
            # attribute named 'k'
            setattr(model_cfg, k, v)
        return build_model(model_cfg, task_name)

1
modelscope/models/multi_model/__init__.py
Normal file
@@ -0,0 +1 @@
from .image_captioning_model import OfaForImageCaptioning
80
modelscope/models/multi_model/image_captioning_model.py
Normal file
@@ -0,0 +1,80 @@
import os.path as osp
from typing import Any, Dict

from PIL import Image

from modelscope.utils.constant import ModelFile, Tasks
from ..base import Model
from ..builder import MODELS

__all__ = ['OfaForImageCaptioning']


@MODELS.register_module(
    Tasks.image_captioning, module_name=r'ofa-image-captioning')
class OfaForImageCaptioning(Model):

    def __init__(self, model_dir, *args, **kwargs):
        super().__init__(model_dir=model_dir, *args, **kwargs)
        ckpt_name = ModelFile.TORCH_MODEL_FILE
        local_model = osp.join(model_dir, ckpt_name)
        bpe_dir = model_dir
        # turn on cuda if GPU is available
        from fairseq import checkpoint_utils, tasks, utils
        from ofa.tasks.mm_tasks import CaptionTask
        from ofa.utils.eval_utils import eval_caption
        self.eval_caption = eval_caption

        tasks.register_task('caption', CaptionTask)
        use_cuda = kwargs['use_cuda'] if 'use_cuda' in kwargs else False
        use_fp16 = kwargs[
            'use_fp16'] if 'use_fp16' in kwargs and use_cuda else False
        overrides = {
            'bpe_dir': bpe_dir,
            'eval_cider': False,
            'beam': 5,
            'max_len_b': 16,
            'no_repeat_ngram_size': 3,
            'seed': 7
        }
        models, cfg, task = checkpoint_utils.load_model_ensemble_and_task(
            utils.split_paths(local_model), arg_overrides=overrides)

        # Move models to GPU
        for model in models:
            model.eval()
            if use_cuda:
                model.cuda()
            if use_fp16:
                model.half()
            model.prepare_for_inference_(cfg)
        self.models = models
        # Initialize generator
        self.generator = task.build_generator(models, cfg.generation)

        # Initialize transform
        from torchvision import transforms
        mean = [0.5, 0.5, 0.5]
        std = [0.5, 0.5, 0.5]

        self.patch_resize_transform = transforms.Compose([
            lambda image: image.convert('RGB'),
            transforms.Resize(
                (cfg.task.patch_image_size, cfg.task.patch_image_size),
                interpolation=Image.BICUBIC),
            transforms.ToTensor(),
            transforms.Normalize(mean=mean, std=std),
        ])
        self.task = task

    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
        results, _ = self.eval_caption(self.task, self.generator, self.models,
                                       input)
        return {
            'image_id': results[0]['image_id'],
            'caption': results[0]['caption']
        }

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        # forward() already returns the final caption result, so no extra
        # post-processing is needed here.
        return inputs
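# Hedged usage sketch via the pipeline factory registered elsewhere in this
# diff; the model id matches the DEFAULT_MODEL_FOR_PIPELINE entry below, but
# the exact factory signature is an assumption and may differ by version.
#
#   from modelscope.pipelines import pipeline
#   captioner = pipeline(Tasks.image_captioning,
#                        model='damo/ofa_image-caption_coco_large_en')
#   print(captioner('/your/image.png'))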
@@ -1,4 +1,5 @@
from .sentence_similarity_model import *  # noqa F403
from .sequence_classification_model import *  # noqa F403
from .text_generation_model import *  # noqa F403
from .bert_for_sequence_classification import *  # noqa F403
from .palm_for_text_generation import *  # noqa F403
from .sbert_for_sentence_similarity import *  # noqa F403
from .sbert_for_token_classification import *  # noqa F403
from .zero_shot_classification_model import *  # noqa F403

43
modelscope/models/nlp/palm_for_text_generation.py
Normal file
@@ -0,0 +1,43 @@
from typing import Dict

from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['PalmForTextGeneration']


@MODELS.register_module(Tasks.text_generation, module_name=r'palm2.0')
class PalmForTextGeneration(Model):

    def __init__(self, model_dir: str, *args, **kwargs):
        """initialize the text generation model from the `model_dir` path.

        Args:
            model_dir (str): the model path.
            model_cls (Optional[Any], optional): model loader, if None, use the
                default loader to load model weights, by default None.
        """
        super().__init__(model_dir, *args, **kwargs)
        self.model_dir = model_dir

        from sofa.models.palm_v2 import PalmForConditionalGeneration, Translator
        model = PalmForConditionalGeneration.from_pretrained(model_dir)
        self.tokenizer = model.tokenizer
        self.generator = Translator(model)

    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
        """return the result by the model

        Args:
            input (Dict[str, Tensor]): the preprocessed data

        Returns:
            Dict[str, Tensor]: results
            Example:
                {
                    'predictions': Tensor([[1377, 4959, 2785, 6392...]]),  # token ids to be decoded by the tokenizer
                }
        """
        return self.generator(**input)
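# Hedged decoding sketch: forward() returns token ids under 'predictions';
# decoding them with self.tokenizer is an assumption about the sofa tokenizer
# API, not confirmed by this diff.
#
#   out = model.forward(preprocessed_input)
#   ids = out['predictions'][0].tolist()
#   text = model.tokenizer.decode(ids, skip_special_tokens=True)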
56
modelscope/models/nlp/sbert_for_token_classification.py
Normal file
@@ -0,0 +1,56 @@
from typing import Any, Dict, Union

import numpy as np
import torch
from sofa import SbertConfig, SbertForTokenClassification

from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['StructBertForTokenClassification']


@MODELS.register_module(
    Tasks.word_segmentation,
    module_name=r'structbert-chinese-word-segmentation')
class StructBertForTokenClassification(Model):

    def __init__(self, model_dir: str, *args, **kwargs):
        """initialize the word segmentation model from the `model_dir` path.

        Args:
            model_dir (str): the model path.
            model_cls (Optional[Any], optional): model loader, if None, use the
                default loader to load model weights, by default None.
        """
        super().__init__(model_dir, *args, **kwargs)
        self.model_dir = model_dir
        self.model = SbertForTokenClassification.from_pretrained(
            self.model_dir)
        self.config = SbertConfig.from_pretrained(self.model_dir)

    def forward(self, input: Dict[str,
                                  Any]) -> Dict[str, Union[str, np.ndarray]]:
        """return the result by the model

        Args:
            input (Dict[str, Any]): the preprocessed data

        Returns:
            Dict[str, Union[str, np.ndarray]]: results
            Example:
                {
                    'predictions': array([1, 4]),  # per-token label ids
                    'logits': array([[-0.53860897, 1.5029076]], dtype=float32),  # raw logits
                    'text': '今天',
                }
        """
        input_ids = torch.tensor(input['input_ids']).unsqueeze(0)
        output = self.model(input_ids)
        logits = output.logits
        pred = torch.argmax(logits[0], dim=-1)
        pred = pred.numpy()

        rst = {'predictions': pred, 'logits': logits, 'text': input['text']}
        return rst
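# Hedged usage sketch (assumes the preprocessor supplies 'input_ids' and the
# original 'text'; the concrete ids below are illustrative placeholders):
#
#   inputs = {'input_ids': [101, 791, 1921, 102], 'text': '今天'}
#   rst = model.forward(inputs)
#   print(rst['predictions'])                 # array of per-token label ids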
@@ -1,52 +0,0 @@
from typing import Any, Dict

from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['PalmForTextGenerationModel']


@MODELS.register_module(Tasks.text_generation, module_name=r'palm')
class PalmForTextGenerationModel(Model):

    def __init__(self, model_dir: str, *args, **kwargs):
        """initialize the text generation model from the `model_dir` path.

        Args:
            model_dir (str): the model path.
            model_cls (Optional[Any], optional): model loader, if None, use the
                default loader to load model weights, by default None.
        """
        from sofa import PalmTokenizer

        super().__init__(model_dir, *args, **kwargs)
        self.model_dir = model_dir

        from sofa.models.palm import PalmForConditionalGeneration, TextGenerator
        tokenizer = kwargs.pop('tokenizer',
                               PalmTokenizer.from_pretrained(model_dir))
        model = PalmForConditionalGeneration.from_pretrained(model_dir)
        self.generator = TextGenerator(model, tokenizer)

    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
        """return the result by the model

        Args:
            input (Dict[str, Any]): the preprocessed data

        Returns:
            Dict[str, np.ndarray]: results
            Example:
                {
                    'predictions': array([1]),  # label 0-negative 1-positive
                    'probabilities': array([[0.11491239, 0.8850876]], dtype=float32),
                    'logits': array([[-0.53860897, 1.5029076]], dtype=float32)  # raw logits
                }
        """
        encoder_inputs = [
            input['input_ids'], input['token_type_ids'],
            input['attention_mask']
        ]
        return self.generator(encoder_inputs)
@@ -1,4 +1,4 @@
from .audio import *  # noqa F403
from .audio import LinearAECPipeline
from .base import Pipeline
from .builder import pipeline
from .cv import *  # noqa F403
@@ -0,0 +1,2 @@
from .linear_aec_pipeline import LinearAECPipeline
from .text_to_speech_pipeline import *  # noqa F403

160
modelscope/pipelines/audio/linear_aec_pipeline.py
Normal file
@@ -0,0 +1,160 @@
import importlib
import os
from typing import Any, Dict

import numpy as np
import scipy.io.wavfile as wav
import torch
import yaml

from modelscope.preprocessors.audio import LinearAECAndFbank
from modelscope.utils.constant import ModelFile, Tasks
from ..base import Pipeline
from ..builder import PIPELINES

FEATURE_MVN = 'feature.DEY.mvn.txt'

CONFIG_YAML = 'dey_mini.yaml'


def initialize_config(module_cfg):
    r"""According to config items, load a specific module dynamically with params.

    1. Load the module corresponding to the "module" param.
    2. Call the function (or instantiate the class) corresponding to the "main" param.
    3. Send the params (in "args") into the function (or class) when calling (or instantiating).

    Args:
        module_cfg (dict): config items, e.g.:
            {
                "module": "models.model",
                "main": "Model",
                "args": {...}
            }

    Returns:
        the module loaded.
    """
    module = importlib.import_module(module_cfg['module'])
    return getattr(module, module_cfg['main'])(**module_cfg['args'])


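# Usage sketch of initialize_config(): it dynamically builds the object
# described by a config dict, mirroring the docstring above.
#
#   cfg = {'module': 'torch.nn', 'main': 'Linear',
#          'args': {'in_features': 4, 'out_features': 2}}
#   layer = initialize_config(cfg)            # equivalent to torch.nn.Linear(4, 2)
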
@PIPELINES.register_module(
    Tasks.speech_signal_process, module_name=r'speech_dfsmn_aec_psm_16k')
class LinearAECPipeline(Pipeline):
    r"""AEC inference pipeline; only a 16000 Hz sample rate is supported.

    When invoking the class with pipeline.__call__(), you should provide two params:
        Dict[str, Any]
            the paths of the wav files, e.g. {
                "nearend_mic": "/your/data/near_end_mic_audio.wav",
                "farend_speech": "/your/data/far_end_speech_audio.wav"}
        output_path (str, optional): "/your/output/audio_after_aec.wav"
            the file path to write the generated audio to.
    """

    def __init__(self, model):
        r"""
        Args:
            model: model id on the modelscope hub.
        """
        super().__init__(model=model)
        self.use_cuda = torch.cuda.is_available()
        with open(
                os.path.join(self.model, CONFIG_YAML), encoding='utf-8') as f:
            self.config = yaml.full_load(f.read())
        self.config['io']['mvn'] = os.path.join(self.model, FEATURE_MVN)
        self._init_model()
        self.preprocessor = LinearAECAndFbank(self.config['io'])

        n_fft = self.config['loss']['args']['n_fft']
        hop_length = self.config['loss']['args']['hop_length']
        winlen = n_fft
        window = torch.hamming_window(winlen, periodic=False)

        def stft(x):
            return torch.stft(
                x,
                n_fft,
                hop_length,
                winlen,
                center=False,
                window=window.to(x.device),
                return_complex=False)

        def istft(x, slen):
            return torch.istft(
                x,
                n_fft,
                hop_length,
                winlen,
                window=window.to(x.device),
                center=False,
                length=slen)

        self.stft = stft
        self.istft = istft

    def _init_model(self):
        checkpoint = torch.load(
            os.path.join(self.model, ModelFile.TORCH_MODEL_BIN_FILE),
            map_location='cpu')
        self.model = initialize_config(self.config['nnet'])
        if self.use_cuda:
            self.model = self.model.cuda()
        self.model.load_state_dict(checkpoint)

    def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        r"""The AEC process.

        Args:
            inputs: dict={'feature': Tensor, 'base': Tensor}
                'feature': feature of the input audio.
                'base': the base audio to mask.

        Returns:
            dict:
                {
                    'output_pcm': generated audio array
                }
        """
        output_data = self._process(inputs['feature'], inputs['base'])
        return {'output_pcm': output_data}

    def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
        r"""The post process. Saves the audio to file if output_path is given.

        Args:
            inputs: dict:
                {
                    'output_pcm': generated audio array
                }
            kwargs: accepts 'output_path', the path to write the generated audio to.

        Returns:
            dict:
                {
                    'output_pcm': generated audio array
                }
        """
        if 'output_path' in kwargs.keys():
            wav.write(kwargs['output_path'], self.preprocessor.SAMPLE_RATE,
                      inputs['output_pcm'].astype(np.int16))
        inputs['output_pcm'] = inputs['output_pcm'] / 32768.0
        return inputs

    def _process(self, fbanks, mixture):
        if self.use_cuda:
            fbanks = fbanks.cuda()
            mixture = mixture.cuda()
        with torch.no_grad():
            if self.model.vad:
                masks, vad = self.model(fbanks.unsqueeze(0))
            else:
                masks = self.model(fbanks.unsqueeze(0))
            masks = masks.permute([2, 1, 0])
        spectrum = self.stft(mixture)
        masked_spec = spectrum * masks
        masked_sig = self.istft(masked_spec, len(mixture)).cpu().numpy()
        return masked_sig
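# Hedged end-to-end sketch matching the class docstring (paths and model id
# are placeholders; the factory call signature is an assumption):
#
#   from modelscope.pipelines import pipeline
#   aec = pipeline(Tasks.speech_signal_process, model='your/aec_model_id')
#   result = aec({'nearend_mic': '/your/near.wav',
#                 'farend_speech': '/your/far.wav'},
#                output_path='/your/out.wav')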
46
modelscope/pipelines/audio/text_to_speech_pipeline.py
Normal file
@@ -0,0 +1,46 @@
import time
from typing import Any, Dict, List

import numpy as np

from modelscope.models import Model
from modelscope.models.audio.tts.am import SambertNetHifi16k
from modelscope.models.audio.tts.vocoder import Hifigan16k
from modelscope.pipelines.base import Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import TextToTacotronSymbols, build_preprocessor
from modelscope.utils.constant import Fields, Tasks

__all__ = ['TextToSpeechSambertHifigan16kPipeline']


@PIPELINES.register_module(
    Tasks.text_to_speech, module_name=r'tts-sambert-hifigan-16k')
class TextToSpeechSambertHifigan16kPipeline(Pipeline):

    def __init__(self,
                 config_file: str = None,
                 model: List[Model] = None,
                 preprocessor: TextToTacotronSymbols = None,
                 **kwargs):
        super().__init__(
            config_file=config_file,
            model=model,
            preprocessor=preprocessor,
            **kwargs)
        assert len(model) == 2, 'model number should be 2'
        self._am = model[0]
        self._vocoder = model[1]
        self._preprocessor = preprocessor

    def forward(self, inputs: Dict[str, Any]) -> Dict[str, np.ndarray]:
        texts = inputs['texts']
        audio_total = np.empty((0), dtype='int16')
        for line in texts:
            line = line.strip().split('\t')
            audio = self._vocoder.forward(self._am.forward(line[1]))
            audio_total = np.append(audio_total, audio, axis=0)
        return {'output': audio_total}

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        return inputs
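# Hedged usage sketch: the pipeline expects an acoustic model and a vocoder
# (in that order) plus a text preprocessor; each line in inputs['texts'] is
# assumed to be '<id>\t<symbol sequence>'. The instances below (am, vocoder,
# text_preprocessor) are placeholders for SambertNetHifi16k / Hifigan16k /
# TextToTacotronSymbols objects.
#
#   tts = TextToSpeechSambertHifigan16kPipeline(
#       model=[am, vocoder], preprocessor=text_preprocessor)
#   audio = tts.forward({'texts': ['0\t<symbol sequence>']})['output']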
@@ -13,20 +13,26 @@ PIPELINES = Registry('pipelines')

DEFAULT_MODEL_FOR_PIPELINE = {
    # TaskName: (pipeline_module_name, model_repo)
    Tasks.word_segmentation:
    ('structbert-chinese-word-segmentation',
     'damo/nlp_structbert_word-segmentation_chinese-base'),
    Tasks.sentence_similarity:
    ('sbert-base-chinese-sentence-similarity',
     'damo/nlp_structbert_sentence-similarity_chinese-base'),
    Tasks.image_matting: ('image-matting', 'damo/cv_unet_image-matting_damo'),
    Tasks.text_classification:
    ('bert-sentiment-analysis', 'damo/bert-base-sst2'),
    Tasks.image_matting: ('image-matting', 'damo/cv_unet_image-matting'),
    Tasks.text_classification: ('bert-sentiment-analysis',
                                'damo/bert-base-sst2'),
    Tasks.zero_shot_classification:
    ('bert-zero-shot-classification',
     'damo/nlp_structbert_zero-shot-classification_chinese-base'),
    Tasks.text_generation: ('palm', 'damo/nlp_palm_text-generation_chinese'),
    Tasks.image_captioning: ('ofa', None),
    Tasks.text_generation: ('palm2.0',
                            'damo/nlp_palm2.0_text-generation_chinese-base'),
    Tasks.image_captioning: ('ofa', 'damo/ofa_image-caption_coco_large_en'),
    Tasks.image_generation:
    ('person-image-cartoon',
     'damo/cv_unet_person-image-cartoon_compound-models'),
    Tasks.ocr_detection: ('ocr-detection',
                          'damo/cv_resnet18_ocr-detection-line-level_damo'),
}

@@ -1,2 +1,3 @@
from .image_cartoon_pipeline import ImageCartoonPipeline
from .image_matting_pipeline import ImageMattingPipeline
from .ocr_detection_pipeline import OCRDetectionPipeline

167
modelscope/pipelines/cv/ocr_detection_pipeline.py
Normal file
@@ -0,0 +1,167 @@
import math
import os
import os.path as osp
import sys
from typing import Any, Dict, List, Tuple, Union

import cv2
import numpy as np
import PIL
import tensorflow as tf
import tf_slim as slim

from modelscope.pipelines.base import Input
from modelscope.preprocessors import load_image
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger
from ..base import Pipeline
from ..builder import PIPELINES
from .ocr_utils import model_resnet_mutex_v4_linewithchar, ops, utils

if tf.__version__ >= '2.0':
    tf = tf.compat.v1
    tf.compat.v1.disable_eager_execution()

logger = get_logger()

# constant
RBOX_DIM = 5
OFFSET_DIM = 6
WORD_POLYGON_DIM = 8
OFFSET_VARIANCE = [0.1, 0.1, 0.1, 0.1, 0.1, 0.1]

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_float('node_threshold', 0.4,
                          'Confidence threshold for nodes')
tf.app.flags.DEFINE_float('link_threshold', 0.6,
                          'Confidence threshold for links')


@PIPELINES.register_module(
    Tasks.ocr_detection, module_name=Tasks.ocr_detection)
class OCRDetectionPipeline(Pipeline):

    def __init__(self, model: str):
        super().__init__(model=model)
        model_path = osp.join(
            osp.join(self.model, ModelFile.TF_CHECKPOINT_FOLDER),
            'checkpoint-80000')

        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self._session = tf.Session(config=config)
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0),
            dtype=tf.int64,
            trainable=False)
        variable_averages = tf.train.ExponentialMovingAverage(
            0.997, global_step)
        self.input_images = tf.placeholder(
            tf.float32, shape=[1, 1024, 1024, 3], name='input_images')
        self.output = {}

        # detector
        detector = model_resnet_mutex_v4_linewithchar.SegLinkDetector()
        all_maps = detector.build_model(self.input_images, is_training=False)

        # decode local predictions
        all_nodes, all_links, all_reg = [], [], []
        for i, maps in enumerate(all_maps):
            cls_maps, lnk_maps, reg_maps = maps[0], maps[1], maps[2]
            reg_maps = tf.multiply(reg_maps, OFFSET_VARIANCE)

            cls_prob = tf.nn.softmax(tf.reshape(cls_maps, [-1, 2]))

            lnk_prob_pos = tf.nn.softmax(tf.reshape(lnk_maps, [-1, 4])[:, :2])
            lnk_prob_mut = tf.nn.softmax(tf.reshape(lnk_maps, [-1, 4])[:, 2:])
            lnk_prob = tf.concat([lnk_prob_pos, lnk_prob_mut], axis=1)

            all_nodes.append(cls_prob)
            all_links.append(lnk_prob)
            all_reg.append(reg_maps)

        # decode segments and links
        image_size = tf.shape(self.input_images)[1:3]
        segments, group_indices, segment_counts, _ = ops.decode_segments_links_python(
            image_size,
            all_nodes,
            all_links,
            all_reg,
            anchor_sizes=list(detector.anchor_sizes))

        # combine segments
        combined_rboxes, combined_counts = ops.combine_segments_python(
            segments, group_indices, segment_counts)
        self.output['combined_rboxes'] = combined_rboxes
        self.output['combined_counts'] = combined_counts

        with self._session.as_default() as sess:
            logger.info(f'loading model from {model_path}')
            # load model
            model_loader = tf.train.Saver(
                variable_averages.variables_to_restore())
            model_loader.restore(sess, model_path)

    def preprocess(self, input: Input) -> Dict[str, Any]:
        if isinstance(input, str):
            img = np.array(load_image(input))
        elif isinstance(input, PIL.Image.Image):
            img = np.array(input.convert('RGB'))
        elif isinstance(input, np.ndarray):
            if len(input.shape) == 2:
                # convert single-channel input to 3 channels before the
                # channel-reversing index below (the original code indexed
                # `img` here before it was assigned)
                input = cv2.cvtColor(input, cv2.COLOR_GRAY2BGR)
            img = input[:, :, ::-1]  # in rgb order
        else:
            raise TypeError(f'input should be either str, PIL.Image,'
                            f' np.array, but got {type(input)}')
        h, w, c = img.shape
        img_pad = np.zeros((max(h, w), max(h, w), 3), dtype=np.float32)
        img_pad[:h, :w, :] = img

        resize_size = 1024
        img_pad_resize = cv2.resize(img_pad, (resize_size, resize_size))
        img_pad_resize = cv2.cvtColor(img_pad_resize, cv2.COLOR_RGB2BGR)
        img_pad_resize = img_pad_resize - np.array([123.68, 116.78, 103.94],
                                                   dtype=np.float32)

        resize_size = tf.stack([resize_size, resize_size])
        orig_size = tf.stack([max(h, w), max(h, w)])
        self.output['orig_size'] = orig_size
        self.output['resize_size'] = resize_size

        result = {'img': np.expand_dims(img_pad_resize, axis=0)}
        return result

    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
        with self._session.as_default():
            feed_dict = {self.input_images: input['img']}
            sess_outputs = self._session.run(self.output, feed_dict=feed_dict)
            return sess_outputs

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        rboxes = inputs['combined_rboxes'][0]
        count = inputs['combined_counts'][0]
        rboxes = rboxes[:count, :]

        # convert rboxes to polygons and find their coordinates on the original image
        orig_h, orig_w = inputs['orig_size']
        resize_h, resize_w = inputs['resize_size']
        polygons = utils.rboxes_to_polygons(rboxes)
        scale_y = float(orig_h) / float(resize_h)
        scale_x = float(orig_w) / float(resize_w)

        # confine polygons inside image
        polygons[:, ::2] = np.maximum(
            0, np.minimum(polygons[:, ::2] * scale_x, orig_w - 1))
        polygons[:, 1::2] = np.maximum(
            0, np.minimum(polygons[:, 1::2] * scale_y, orig_h - 1))
        polygons = np.round(polygons).astype(np.int32)

        # nms
        dt_n9 = [o + [utils.cal_width(o)] for o in polygons.tolist()]
        dt_nms = utils.nms_python(dt_n9)
        dt_polygons = np.array([o[:8] for o in dt_nms])

        result = {'det_polygons': dt_polygons}
        return result
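# Hedged usage sketch: preprocess -> forward -> postprocess returns
# 'det_polygons', an (N, 8) int32 array of quadrilateral corner coordinates.
# The model id matches the DEFAULT_MODEL_FOR_PIPELINE entry above; in
# practice the pipeline factory resolves it to a local model directory.
#
#   from modelscope.pipelines import pipeline
#   ocr = pipeline(Tasks.ocr_detection,
#                  model='damo/cv_resnet18_ocr-detection-line-level_damo')
#   result = ocr('/your/document.jpg')
#   print(result['det_polygons'].shape)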
0
modelscope/pipelines/cv/ocr_utils/__init__.py
Normal file
158
modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py
Normal file
@@ -0,0 +1,158 @@
import tensorflow as tf
import tf_slim as slim

from . import ops, resnet18_v1, resnet_utils

if tf.__version__ >= '2.0':
    tf = tf.compat.v1

# constants
OFFSET_DIM = 6

N_LOCAL_LINKS = 8
N_CROSS_LINKS = 4
N_SEG_CLASSES = 2
N_LNK_CLASSES = 4

POS_LABEL = 1
NEG_LABEL = 0


class SegLinkDetector():

    def __init__(self):
        self.anchor_sizes = [6., 11.84210526, 23.68421053, 45., 90., 150.]

    def _detection_classifier(self,
                              maps,
                              ksize,
                              weight_decay,
                              cross_links=False,
                              scope=None):

        with tf.variable_scope(scope):
            seg_depth = N_SEG_CLASSES
            if cross_links:
                lnk_depth = N_LNK_CLASSES * (N_LOCAL_LINKS + N_CROSS_LINKS)
            else:
                lnk_depth = N_LNK_CLASSES * N_LOCAL_LINKS
            reg_depth = OFFSET_DIM
            map_depth = maps.get_shape()[3]
            inter_maps, inter_relu = ops.conv2d(
                maps, map_depth, 256, 1, 1, 'SAME', scope='conv_inter')

            dir_maps, dir_relu = ops.conv2d(
                inter_relu, 256, 2, ksize, 1, 'SAME', scope='conv_dir')
            cen_maps, cen_relu = ops.conv2d(
                inter_relu, 256, 2, ksize, 1, 'SAME', scope='conv_cen')
            pol_maps, pol_relu = ops.conv2d(
                inter_relu, 256, 8, ksize, 1, 'SAME', scope='conv_pol')
            concat_relu = tf.concat([dir_relu, cen_relu, pol_relu], axis=-1)
            _, lnk_embedding = ops.conv_relu(
                concat_relu, 12, 256, 1, 1, scope='lnk_embedding')
            lnk_maps, lnk_relu = ops.conv2d(
                inter_relu + lnk_embedding,
                256,
                lnk_depth,
                ksize,
                1,
                'SAME',
                scope='conv_lnk')

            char_seg_maps, char_seg_relu = ops.conv2d(
                inter_relu,
                256,
                seg_depth,
                ksize,
                1,
                'SAME',
                scope='conv_char_cls')
            char_reg_maps, char_reg_relu = ops.conv2d(
                inter_relu,
                256,
                reg_depth,
                ksize,
                1,
                'SAME',
                scope='conv_char_reg')
            concat_char_relu = tf.concat([char_seg_relu, char_reg_relu],
                                         axis=-1)
            _, char_embedding = ops.conv_relu(
                concat_char_relu, 8, 256, 1, 1, scope='conv_char_embedding')
            seg_maps, seg_relu = ops.conv2d(
                inter_relu + char_embedding,
                256,
                seg_depth,
                ksize,
                1,
                'SAME',
                scope='conv_cls')
            reg_maps, reg_relu = ops.conv2d(
                inter_relu + char_embedding,
                256,
                reg_depth,
                ksize,
                1,
                'SAME',
                scope='conv_reg')

            return seg_relu, lnk_relu, reg_relu

    def _build_cnn(self, images, weight_decay, is_training):
        with slim.arg_scope(
                resnet18_v1.resnet_arg_scope(weight_decay=weight_decay)):
            logits, end_points = resnet18_v1.resnet_v1_18(
                images, is_training=is_training, scope='resnet_v1_18')

        outputs = {
            'conv3_3': end_points['pool1'],
            'conv4_3': end_points['pool2'],
            'fc7': end_points['pool3'],
            'conv8_2': end_points['pool4'],
            'conv9_2': end_points['pool5'],
            'conv10_2': end_points['pool6'],
        }
        return outputs

    def build_model(self, images, is_training=True, scope=None):

        weight_decay = 5e-4  # FLAGS.weight_decay
        cnn_outputs = self._build_cnn(images, weight_decay, is_training)
        det_0 = self._detection_classifier(
            cnn_outputs['conv3_3'],
            3,
            weight_decay,
            cross_links=False,
            scope='dete_0')
        det_1 = self._detection_classifier(
            cnn_outputs['conv4_3'],
            3,
            weight_decay,
            cross_links=True,
            scope='dete_1')
        det_2 = self._detection_classifier(
            cnn_outputs['fc7'],
            3,
            weight_decay,
            cross_links=True,
            scope='dete_2')
        det_3 = self._detection_classifier(
            cnn_outputs['conv8_2'],
            3,
            weight_decay,
            cross_links=True,
            scope='dete_3')
        det_4 = self._detection_classifier(
            cnn_outputs['conv9_2'],
            3,
            weight_decay,
            cross_links=True,
            scope='dete_4')
        det_5 = self._detection_classifier(
            cnn_outputs['conv10_2'],
            3,
            weight_decay,
            cross_links=True,
            scope='dete_5')
        outputs = [det_0, det_1, det_2, det_3, det_4, det_5]
        return outputs
1098
modelscope/pipelines/cv/ocr_utils/ops.py
Normal file
File diff suppressed because it is too large
432
modelscope/pipelines/cv/ocr_utils/resnet18_v1.py
Normal file
@@ -0,0 +1,432 @@
"""Contains definitions for the original form of Residual Networks.

The 'v1' residual networks (ResNets) implemented in this module were proposed
by:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
    Deep Residual Learning for Image Recognition. arXiv:1512.03385
Other variants were introduced in:
[2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
    Identity Mappings in Deep Residual Networks. arXiv: 1603.05027

The networks defined in this module utilize the bottleneck building block of
[1] with projection shortcuts only for increasing depths. They employ batch
normalization *after* every weight layer. This is the architecture used by
MSRA in the Imagenet and MSCOCO 2016 competition models ResNet-101 and
ResNet-152. See [2; Fig. 1a] for a comparison between the current 'v1'
architecture and the alternative 'v2' architecture of [2] which uses batch
normalization *before* every weight layer in the so-called full pre-activation
units.

Typical use:
    from tensorflow.contrib.slim.nets import resnet_v1

ResNet-101 for image classification into 1000 classes:
    # inputs has shape [batch, 224, 224, 3]
    with slim.arg_scope(resnet_v1.resnet_arg_scope()):
        net, end_points = resnet_v1.resnet_v1_101(inputs, 1000, is_training=False)

ResNet-101 for semantic segmentation into 21 classes:
    # inputs has shape [batch, 513, 513, 3]
    with slim.arg_scope(resnet_v1.resnet_arg_scope()):
        net, end_points = resnet_v1.resnet_v1_101(inputs,
                                                  21,
                                                  is_training=False,
                                                  global_pool=False,
                                                  output_stride=16)
"""
import tensorflow as tf
import tf_slim as slim

from . import resnet_utils

if tf.__version__ >= '2.0':
    tf = tf.compat.v1

resnet_arg_scope = resnet_utils.resnet_arg_scope


@slim.add_arg_scope
def basicblock(inputs,
               depth,
               depth_bottleneck,
               stride,
               rate=1,
               outputs_collections=None,
               scope=None):
    """Basic residual unit variant with BN after convolutions.

    This is the two-layer (3x3 + 3x3) unit used by ResNet-18/34; unlike
    bottleneck() below, it has no extra bottleneck layer. See [1].
    When putting together two consecutive ResNet blocks that use this unit, one
    should use stride = 2 in the last unit of the first block.

    Args:
        inputs: A tensor of size [batch, height, width, channels].
        depth: The depth of the ResNet unit output.
        depth_bottleneck: The depth of the bottleneck layers.
        stride: The ResNet unit's stride. Determines the amount of downsampling
            of the unit's output compared to its input.
        rate: An integer, rate for atrous convolution.
        outputs_collections: Collection to add the ResNet unit output.
        scope: Optional variable_scope.

    Returns:
        The ResNet unit's output.
    """
    with tf.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc:
        depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4)
        if depth == depth_in:
            shortcut = resnet_utils.subsample(inputs, stride, 'shortcut')
        else:
            shortcut = slim.conv2d(
                inputs,
                depth, [1, 1],
                stride=stride,
                activation_fn=None,
                scope='shortcut')

        residual = resnet_utils.conv2d_same(
            inputs, depth, 3, stride, rate=rate, scope='conv1')
        residual = resnet_utils.conv2d_same(
            residual, depth, 3, 1, rate=rate, scope='conv2')

        output = tf.nn.relu(residual + shortcut)

        return slim.utils.collect_named_outputs(outputs_collections,
                                                sc.original_name_scope, output)


@slim.add_arg_scope
def bottleneck(inputs,
               depth,
               depth_bottleneck,
               stride,
               rate=1,
               outputs_collections=None,
               scope=None):
    """Bottleneck residual unit variant with BN after convolutions.

    This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for
    its definition. Note that we use here the bottleneck variant which has an
    extra bottleneck layer.
    When putting together two consecutive ResNet blocks that use this unit, one
    should use stride = 2 in the last unit of the first block.

    Args:
        inputs: A tensor of size [batch, height, width, channels].
        depth: The depth of the ResNet unit output.
        depth_bottleneck: The depth of the bottleneck layers.
        stride: The ResNet unit's stride. Determines the amount of downsampling
            of the unit's output compared to its input.
        rate: An integer, rate for atrous convolution.
        outputs_collections: Collection to add the ResNet unit output.
        scope: Optional variable_scope.

    Returns:
        The ResNet unit's output.
    """
    with tf.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc:
        depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4)
        if depth == depth_in:
            shortcut = resnet_utils.subsample(inputs, stride, 'shortcut')
        else:
            shortcut = slim.conv2d(
                inputs,
                depth, [1, 1],
                stride=stride,
                activation_fn=None,
                scope='shortcut')

        residual = slim.conv2d(
            inputs, depth_bottleneck, [1, 1], stride=1, scope='conv1')
        residual = resnet_utils.conv2d_same(
            residual, depth_bottleneck, 3, stride, rate=rate, scope='conv2')
        residual = slim.conv2d(
            residual,
            depth, [1, 1],
            stride=1,
            activation_fn=None,
            scope='conv3')

        output = tf.nn.relu(shortcut + residual)

        return slim.utils.collect_named_outputs(outputs_collections,
                                                sc.original_name_scope, output)


def resnet_v1(inputs,
              blocks,
              num_classes=None,
              is_training=True,
              global_pool=True,
              output_stride=None,
              include_root_block=True,
              spatial_squeeze=True,
              reuse=None,
              scope=None):
    """Generator for v1 ResNet models.

    This function generates a family of ResNet v1 models. See the resnet_v1_*()
    methods for specific model instantiations, obtained by selecting different
    block instantiations that produce ResNets of various depths.

    Training for image classification on Imagenet is usually done with [224, 224]
    inputs, resulting in [7, 7] feature maps at the output of the last ResNet
    block for the ResNets defined in [1] that have nominal stride equal to 32.
    However, for dense prediction tasks we advise that one uses inputs with
    spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In
    this case the feature maps at the ResNet output will have spatial shape
    [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1]
    and corners exactly aligned with the input image corners, which greatly
    facilitates alignment of the features to the image. Using as input [225, 225]
    images results in [8, 8] feature maps at the output of the last ResNet block.

    For dense prediction tasks, the ResNet needs to run in fully-convolutional
    (FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all
    have nominal stride equal to 32 and a good choice in FCN mode is to use
    output_stride=16 in order to increase the density of the computed features at
    small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915.

    Args:
        inputs: A tensor of size [batch, height_in, width_in, channels].
        blocks: A list of length equal to the number of ResNet blocks. Each element
            is a resnet_utils.Block object describing the units in the block.
        num_classes: Number of predicted classes for classification tasks. If None
            we return the features before the logit layer.
        is_training: whether is training or not.
        global_pool: If True, we perform global average pooling before computing the
            logits. Set to True for image classification, False for dense prediction.
        output_stride: If None, then the output will be computed at the nominal
            network stride. If output_stride is not None, it specifies the requested
            ratio of input to output spatial resolution.
        include_root_block: If True, include the initial convolution followed by
            max-pooling, if False excludes it.
        spatial_squeeze: if True, logits is of shape [B, C], if false logits is
            of shape [B, 1, 1, C], where B is batch_size and C is number of classes.
        reuse: whether or not the network and its variables should be reused. To be
            able to reuse 'scope' must be given.
        scope: Optional variable_scope.

    Returns:
        net: A rank-4 tensor of size [batch, height_out, width_out, channels_out].
            If global_pool is False, then height_out and width_out are reduced by a
            factor of output_stride compared to the respective height_in and width_in,
            else both height_out and width_out equal one. If num_classes is None, then
            net is the output of the last ResNet block, potentially after global
            average pooling. If num_classes is not None, net contains the pre-softmax
            activations.
        end_points: A dictionary from components of the network to the corresponding
            activation.

    Raises:
        ValueError: If the target output_stride is not valid.
    """
    with tf.variable_scope(scope, 'resnet_v1', [inputs], reuse=reuse) as sc:
        end_points_collection = sc.name + '_end_points'
        with slim.arg_scope(
                [slim.conv2d, bottleneck, resnet_utils.stack_blocks_dense],
                outputs_collections=end_points_collection):
            with slim.arg_scope([slim.batch_norm], is_training=is_training):
                net = inputs
                if include_root_block:
                    if output_stride is not None:
                        if output_stride % 4 != 0:
                            raise ValueError(
                                'The output_stride needs to be a multiple of 4.'
                            )
                        output_stride /= 4
                    net = resnet_utils.conv2d_same(
                        net, 64, 7, stride=2, scope='conv1')
                    net = tf.pad(net, [[0, 0], [1, 1], [1, 1], [0, 0]])
                    net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1')

                    net = slim.utils.collect_named_outputs(
                        end_points_collection, 'pool2', net)

                net = resnet_utils.stack_blocks_dense(net, blocks,
                                                      output_stride)

                end_points = slim.utils.convert_collection_to_dict(
                    end_points_collection)

                end_points['pool1'] = end_points['resnet_v1_18/block2/unit_2']
                end_points['pool2'] = end_points['resnet_v1_18/block3/unit_2']
                end_points['pool3'] = end_points['resnet_v1_18/block4/unit_2']
                end_points['pool4'] = end_points['resnet_v1_18/block5/unit_2']
                end_points['pool5'] = end_points['resnet_v1_18/block6/unit_2']
                end_points['pool6'] = net

                return net, end_points


resnet_v1.default_image_size = 224


def resnet_v1_18(inputs,
                 num_classes=None,
                 is_training=True,
                 global_pool=True,
                 output_stride=None,
                 spatial_squeeze=True,
                 reuse=None,
                 scope='resnet_v1_18'):
    """ResNet-18 model of [1]. See resnet_v1() for arg and return description."""
    blocks = [
        resnet_utils.Block('block1', basicblock,
                           [(64, 64, 1)] + [(64, 64, 1)]),
        resnet_utils.Block('block2', basicblock,
                           [(128, 128, 1)] + [(128, 128, 1)]),
        resnet_utils.Block('block3', basicblock,
                           [(256, 256, 2)] + [(256, 256, 1)]),
        resnet_utils.Block('block4', basicblock,
                           [(512, 512, 2)] + [(512, 512, 1)]),
        resnet_utils.Block('block5', basicblock,
                           [(256, 256, 2)] + [(256, 256, 1)]),
        resnet_utils.Block('block6', basicblock,
                           [(256, 256, 2)] + [(256, 256, 1)]),
        resnet_utils.Block('block7', basicblock,
                           [(256, 256, 2)] + [(256, 256, 1)]),
    ]
    return resnet_v1(
        inputs,
        blocks,
        num_classes,
        is_training,
        global_pool=global_pool,
        output_stride=output_stride,
        include_root_block=True,
        spatial_squeeze=spatial_squeeze,
        reuse=reuse,
        scope=scope)


resnet_v1_18.default_image_size = resnet_v1.default_image_size


def resnet_v1_50(inputs,
                 num_classes=None,
                 is_training=True,
                 global_pool=True,
                 output_stride=None,
                 spatial_squeeze=True,
                 reuse=None,
                 scope='resnet_v1_50'):
    """ResNet-50 model of [1]. See resnet_v1() for arg and return description."""
    blocks = [
        resnet_utils.Block('block1', bottleneck,
                           [(256, 64, 1)] * 2 + [(256, 64, 2)]),
        resnet_utils.Block('block2', bottleneck,
                           [(512, 128, 1)] * 3 + [(512, 128, 2)]),
        resnet_utils.Block('block3', bottleneck,
                           [(1024, 256, 1)] * 5 + [(1024, 256, 2)]),
        resnet_utils.Block('block4', bottleneck,
                           [(2048, 512, 1)] * 3 + [(2048, 512, 2)]),
        resnet_utils.Block('block5', bottleneck,
                           [(1024, 256, 1)] * 2 + [(1024, 256, 2)]),
        resnet_utils.Block('block6', bottleneck, [(1024, 256, 1)] * 2),
    ]
    return resnet_v1(
        inputs,
        blocks,
        num_classes,
        is_training,
        global_pool=global_pool,
        output_stride=output_stride,
        include_root_block=True,
        spatial_squeeze=spatial_squeeze,
        reuse=reuse,
        scope=scope)


resnet_v1_50.default_image_size = resnet_v1.default_image_size


def resnet_v1_101(inputs,
                  num_classes=None,
                  is_training=True,
                  global_pool=True,
                  output_stride=None,
                  spatial_squeeze=True,
                  reuse=None,
                  scope='resnet_v1_101'):
    """ResNet-101 model of [1]. See resnet_v1() for arg and return description."""
    blocks = [
        resnet_utils.Block('block1', bottleneck,
                           [(256, 64, 1)] * 2 + [(256, 64, 2)]),
        resnet_utils.Block('block2', bottleneck,
                           [(512, 128, 1)] * 3 + [(512, 128, 2)]),
        resnet_utils.Block('block3', bottleneck,
                           [(1024, 256, 1)] * 22 + [(1024, 256, 2)]),
        resnet_utils.Block('block4', bottleneck, [(2048, 512, 1)] * 3)
    ]
    return resnet_v1(
        inputs,
        blocks,
        num_classes,
        is_training,
        global_pool=global_pool,
        output_stride=output_stride,
        include_root_block=True,
        spatial_squeeze=spatial_squeeze,
        reuse=reuse,
        scope=scope)


resnet_v1_101.default_image_size = resnet_v1.default_image_size


def resnet_v1_152(inputs,
                  num_classes=None,
                  is_training=True,
                  global_pool=True,
                  output_stride=None,
                  spatial_squeeze=True,
                  reuse=None,
                  scope='resnet_v1_152'):
    """ResNet-152 model of [1]. See resnet_v1() for arg and return description."""
    blocks = [
        resnet_utils.Block('block1', bottleneck,
                           [(256, 64, 1)] * 2 + [(256, 64, 2)]),
        resnet_utils.Block('block2', bottleneck,
                           [(512, 128, 1)] * 7 + [(512, 128, 2)]),
        resnet_utils.Block('block3', bottleneck,
                           [(1024, 256, 1)] * 35 + [(1024, 256, 2)]),
        resnet_utils.Block('block4', bottleneck, [(2048, 512, 1)] * 3)
    ]
    return resnet_v1(
        inputs,
        blocks,
        num_classes,
        is_training,
        global_pool=global_pool,
        output_stride=output_stride,
        include_root_block=True,
        spatial_squeeze=spatial_squeeze,
        reuse=reuse,
        scope=scope)


resnet_v1_152.default_image_size = resnet_v1.default_image_size


def resnet_v1_200(inputs,
                  num_classes=None,
                  is_training=True,
                  global_pool=True,
                  output_stride=None,
                  spatial_squeeze=True,
                  reuse=None,
                  scope='resnet_v1_200'):
    """ResNet-200 model of [2]. See resnet_v1() for arg and return description."""
    blocks = [
        resnet_utils.Block('block1', bottleneck,
                           [(256, 64, 1)] * 2 + [(256, 64, 2)]),
        resnet_utils.Block('block2', bottleneck,
                           [(512, 128, 1)] * 23 + [(512, 128, 2)]),
        resnet_utils.Block('block3', bottleneck,
                           [(1024, 256, 1)] * 35 + [(1024, 256, 2)]),
        resnet_utils.Block('block4', bottleneck, [(2048, 512, 1)] * 3)
    ]
    return resnet_v1(
        inputs,
        blocks,
        num_classes,
        is_training,
        global_pool=global_pool,
        output_stride=output_stride,
        include_root_block=True,
        spatial_squeeze=spatial_squeeze,
        reuse=reuse,
        scope=scope)


resnet_v1_200.default_image_size = resnet_v1.default_image_size

if __name__ == '__main__':
    input = tf.placeholder(tf.float32, shape=(None, 224, 224, 3), name='input')
|
||||
with slim.arg_scope(resnet_arg_scope()) as sc:
|
||||
logits = resnet_v1_50(input)
|
||||
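For orientation, a minimal usage sketch of the backbone above; it is an illustration, not part of the diff. It assumes tf_slim is installed and that `resnet_arg_scope` and the block functions are importable from the surrounding modules:

import tensorflow as tf
import tf_slim as slim

if tf.__version__ >= '2.0':
    tf = tf.compat.v1

images = tf.placeholder(tf.float32, shape=(None, 224, 224, 3), name='images')
with slim.arg_scope(resnet_arg_scope()):
    # resnet_v1_18 returns both the final tensor and the endpoint dict
    net, end_points = resnet_v1_18(images, is_training=False)
for name in ('pool1', 'pool3', 'pool6'):
    print(name, end_points[name].shape)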
modelscope/pipelines/cv/ocr_utils/resnet_utils.py (new file, 231 lines)
@@ -0,0 +1,231 @@
"""Contains building blocks for various versions of Residual Networks.
|
||||
Residual networks (ResNets) were proposed in:
|
||||
Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
|
||||
Deep Residual Learning for Image Recognition. arXiv:1512.03385, 2015
|
||||
More variants were introduced in:
|
||||
Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
|
||||
Identity Mappings in Deep Residual Networks. arXiv: 1603.05027, 2016
|
||||
We can obtain different ResNet variants by changing the network depth, width,
|
||||
and form of residual unit. This module implements the infrastructure for
|
||||
building them. Concrete ResNet units and full ResNet networks are implemented in
|
||||
the accompanying resnet_v1.py and resnet_v2.py modules.
|
||||
Compared to https://github.com/KaimingHe/deep-residual-networks, in the current
|
||||
implementation we subsample the output activations in the last residual unit of
|
||||
each block, instead of subsampling the input activations in the first residual
|
||||
unit of each block. The two implementations give identical results but our
|
||||
implementation is more memory efficient.
|
||||
"""
|
||||
|
||||
import collections
|
||||
|
||||
import tensorflow as tf
|
||||
import tf_slim as slim
|
||||
|
||||
if tf.__version__ >= '2.0':
|
||||
tf = tf.compat.v1
|
||||
|
||||
|
||||
class Block(collections.namedtuple('Block', ['scope', 'unit_fn', 'args'])):
|
||||
"""A named tuple describing a ResNet block.
|
||||
Its parts are:
|
||||
scope: The scope of the `Block`.
|
||||
unit_fn: The ResNet unit function which takes as input a `Tensor` and
|
||||
returns another `Tensor` with the output of the ResNet unit.
|
||||
args: A list of length equal to the number of units in the `Block`. The list
|
||||
contains one (depth, depth_bottleneck, stride) tuple for each unit in the
|
||||
block to serve as argument to unit_fn.
|
||||
"""
|
||||
|
||||
|
||||
def subsample(inputs, factor, scope=None):
|
||||
"""Subsamples the input along the spatial dimensions.
|
||||
Args:
|
||||
inputs: A `Tensor` of size [batch, height_in, width_in, channels].
|
||||
factor: The subsampling factor.
|
||||
scope: Optional variable_scope.
|
||||
Returns:
|
||||
output: A `Tensor` of size [batch, height_out, width_out, channels] with the
|
||||
input, either intact (if factor == 1) or subsampled (if factor > 1).
|
||||
"""
|
||||
if factor == 1:
|
||||
return inputs
|
||||
else:
|
||||
return slim.max_pool2d(inputs, [1, 1], stride=factor, scope=scope)
|
||||
|
||||
|
||||
def conv2d_same(inputs, num_outputs, kernel_size, stride, rate=1, scope=None):
|
||||
"""Strided 2-D convolution with 'SAME' padding.
|
||||
When stride > 1, then we do explicit zero-padding, followed by conv2d with
|
||||
'VALID' padding.
|
||||
Note that
|
||||
net = conv2d_same(inputs, num_outputs, 3, stride=stride)
|
||||
is equivalent to
|
||||
net = slim.conv2d(inputs, num_outputs, 3, stride=1, padding='SAME')
|
||||
net = subsample(net, factor=stride)
|
||||
whereas
|
||||
net = slim.conv2d(inputs, num_outputs, 3, stride=stride, padding='SAME')
|
||||
is different when the input's height or width is even, which is why we add the
|
||||
current function. For more details, see ResnetUtilsTest.testConv2DSameEven().
|
||||
Args:
|
||||
inputs: A 4-D tensor of size [batch, height_in, width_in, channels].
|
||||
num_outputs: An integer, the number of output filters.
|
||||
kernel_size: An int with the kernel_size of the filters.
|
||||
stride: An integer, the output stride.
|
||||
rate: An integer, rate for atrous convolution.
|
||||
scope: Scope.
|
||||
Returns:
|
||||
output: A 4-D tensor of size [batch, height_out, width_out, channels] with
|
||||
the convolution output.
|
||||
"""
|
||||
if stride == 1:
|
||||
return slim.conv2d(
|
||||
inputs,
|
||||
num_outputs,
|
||||
kernel_size,
|
||||
stride=1,
|
||||
rate=rate,
|
||||
padding='SAME',
|
||||
scope=scope)
|
||||
else:
|
||||
kernel_size_effective = kernel_size + (kernel_size - 1) * (rate - 1)
|
||||
pad_total = kernel_size_effective - 1
|
||||
pad_beg = pad_total // 2
|
||||
pad_end = pad_total - pad_beg
|
||||
inputs = tf.pad(
|
||||
inputs, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]])
|
||||
return slim.conv2d(
|
||||
inputs,
|
||||
num_outputs,
|
||||
kernel_size,
|
||||
stride=stride,
|
||||
rate=rate,
|
||||
padding='VALID',
|
||||
scope=scope)
|
||||
|
||||
|
||||
@slim.add_arg_scope
|
||||
def stack_blocks_dense(net,
|
||||
blocks,
|
||||
output_stride=None,
|
||||
outputs_collections=None):
|
||||
"""Stacks ResNet `Blocks` and controls output feature density.
|
||||
First, this function creates scopes for the ResNet in the form of
|
||||
'block_name/unit_1', 'block_name/unit_2', etc.
|
||||
Second, this function allows the user to explicitly control the ResNet
|
||||
output_stride, which is the ratio of the input to output spatial resolution.
|
||||
This is useful for dense prediction tasks such as semantic segmentation or
|
||||
object detection.
|
||||
Most ResNets consist of 4 ResNet blocks and subsample the activations by a
|
||||
factor of 2 when transitioning between consecutive ResNet blocks. This results
|
||||
to a nominal ResNet output_stride equal to 8. If we set the output_stride to
|
||||
half the nominal network stride (e.g., output_stride=4), then we compute
|
||||
responses twice.
|
||||
Control of the output feature density is implemented by atrous convolution.
|
||||
Args:
|
||||
net: A `Tensor` of size [batch, height, width, channels].
|
||||
blocks: A list of length equal to the number of ResNet `Blocks`. Each
|
||||
element is a ResNet `Block` object describing the units in the `Block`.
|
||||
output_stride: If `None`, then the output will be computed at the nominal
|
||||
network stride. If output_stride is not `None`, it specifies the requested
|
||||
ratio of input to output spatial resolution, which needs to be equal to
|
||||
the product of unit strides from the start up to some level of the ResNet.
|
||||
For example, if the ResNet employs units with strides 1, 2, 1, 3, 4, 1,
|
||||
then valid values for the output_stride are 1, 2, 6, 24 or None (which
|
||||
is equivalent to output_stride=24).
|
||||
outputs_collections: Collection to add the ResNet block outputs.
|
||||
Returns:
|
||||
net: Output tensor with stride equal to the specified output_stride.
|
||||
Raises:
|
||||
ValueError: If the target output_stride is not valid.
|
||||
"""
|
||||
# The current_stride variable keeps track of the effective stride of the
|
||||
# activations. This allows us to invoke atrous convolution whenever applying
|
||||
# the next residual unit would result in the activations having stride larger
|
||||
# than the target output_stride.
|
||||
current_stride = 1
|
||||
|
||||
# The atrous convolution rate parameter.
|
||||
rate = 1
|
||||
|
||||
for block in blocks:
|
||||
with tf.variable_scope(block.scope, 'block', [net]):
|
||||
for i, unit in enumerate(block.args):
|
||||
if output_stride is not None and current_stride > output_stride:
|
||||
raise ValueError(
|
||||
'The target output_stride cannot be reached.')
|
||||
|
||||
with tf.variable_scope(
|
||||
'unit_%d' % (i + 1), values=[net]) as sc:
|
||||
unit_depth, unit_depth_bottleneck, unit_stride = unit
|
||||
# If we have reached the target output_stride, then we need to employ
|
||||
# atrous convolution with stride=1 and multiply the atrous rate by the
|
||||
# current unit's stride for use in subsequent layers.
|
||||
if output_stride is not None and current_stride == output_stride:
|
||||
net = block.unit_fn(
|
||||
net,
|
||||
depth=unit_depth,
|
||||
depth_bottleneck=unit_depth_bottleneck,
|
||||
stride=1,
|
||||
rate=rate)
|
||||
rate *= unit_stride
|
||||
|
||||
else:
|
||||
net = block.unit_fn(
|
||||
net,
|
||||
depth=unit_depth,
|
||||
depth_bottleneck=unit_depth_bottleneck,
|
||||
stride=unit_stride,
|
||||
rate=1)
|
||||
current_stride *= unit_stride
|
||||
net = slim.utils.collect_named_outputs(
|
||||
outputs_collections, sc.name, net)
|
||||
|
||||
if output_stride is not None and current_stride != output_stride:
|
||||
raise ValueError('The target output_stride cannot be reached.')
|
||||
|
||||
return net
|
||||
|
||||
|
||||
def resnet_arg_scope(weight_decay=0.0001,
|
||||
batch_norm_decay=0.997,
|
||||
batch_norm_epsilon=1e-5,
|
||||
batch_norm_scale=True):
|
||||
"""Defines the default ResNet arg scope.
|
||||
TODO(gpapan): The batch-normalization related default values above are
|
||||
appropriate for use in conjunction with the reference ResNet models
|
||||
released at https://github.com/KaimingHe/deep-residual-networks. When
|
||||
training ResNets from scratch, they might need to be tuned.
|
||||
Args:
|
||||
weight_decay: The weight decay to use for regularizing the model.
|
||||
batch_norm_decay: The moving average decay when estimating layer activation
|
||||
statistics in batch normalization.
|
||||
batch_norm_epsilon: Small constant to prevent division by zero when
|
||||
normalizing activations by their variance in batch normalization.
|
||||
batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the
|
||||
activations in the batch normalization layer.
|
||||
Returns:
|
||||
An `arg_scope` to use for the resnet models.
|
||||
"""
|
||||
batch_norm_params = {
|
||||
'decay': batch_norm_decay,
|
||||
'epsilon': batch_norm_epsilon,
|
||||
'scale': batch_norm_scale,
|
||||
'updates_collections': tf.GraphKeys.UPDATE_OPS,
|
||||
}
|
||||
|
||||
with slim.arg_scope(
|
||||
[slim.conv2d],
|
||||
weights_regularizer=slim.l2_regularizer(weight_decay),
|
||||
weights_initializer=slim.variance_scaling_initializer(),
|
||||
activation_fn=tf.nn.relu,
|
||||
normalizer_fn=slim.batch_norm,
|
||||
normalizer_params=batch_norm_params):
|
||||
with slim.arg_scope([slim.batch_norm], **batch_norm_params):
|
||||
# The following implies padding='SAME' for pool1, which makes feature
|
||||
# alignment easier for dense prediction tasks. This is also used in
|
||||
# https://github.com/facebook/fb.resnet.torch. However the accompanying
|
||||
# code of 'Deep Residual Learning for Image Recognition' uses
|
||||
# padding='VALID' for pool1. You can switch to that choice by setting
|
||||
# slim.arg_scope([slim.max_pool2d], padding='VALID').
|
||||
with slim.arg_scope([slim.max_pool2d], padding='VALID') as arg_sc:
|
||||
return arg_sc
|
||||
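A quick, self-contained check of the padding arithmetic in conv2d_same (a sketch of the rule, not part of the diff): for an effective kernel of size k, the total padding is k - 1, split as evenly as possible, which is what makes the strided 'VALID' convolution line up with a subsampled 'SAME' one.

def same_padding(kernel_size, rate=1):
    # mirrors the else-branch of conv2d_same above
    kernel_size_effective = kernel_size + (kernel_size - 1) * (rate - 1)
    pad_total = kernel_size_effective - 1
    pad_beg = pad_total // 2
    return pad_beg, pad_total - pad_beg

assert same_padding(3) == (1, 1)
assert same_padding(7) == (3, 3)
assert same_padding(3, rate=2) == (2, 2)  # atrous conv widens the kernel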
modelscope/pipelines/cv/ocr_utils/utils.py (new file, 108 lines)
@@ -0,0 +1,108 @@
import cv2
import numpy as np


def rboxes_to_polygons(rboxes):
    """Convert rboxes to polygons.

    ARGS
        `rboxes`: [n, 5]
    RETURN
        `polygons`: [n, 8]
    """
    theta = rboxes[:, 4:5]
    cxcy = rboxes[:, :2]
    half_w = rboxes[:, 2:3] / 2.
    half_h = rboxes[:, 3:4] / 2.
    v1 = np.hstack([np.cos(theta) * half_w, np.sin(theta) * half_w])
    v2 = np.hstack([-np.sin(theta) * half_h, np.cos(theta) * half_h])
    p1 = cxcy - v1 - v2
    p2 = cxcy + v1 - v2
    p3 = cxcy + v1 + v2
    p4 = cxcy - v1 + v2
    polygons = np.hstack([p1, p2, p3, p4])
    return polygons


def cal_width(box):
    pd1 = point_dist(box[0], box[1], box[2], box[3])
    pd2 = point_dist(box[4], box[5], box[6], box[7])
    return (pd1 + pd2) / 2


def point_dist(x1, y1, x2, y2):
    return np.sqrt((x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1))


def draw_polygons(img, polygons):
    for p in polygons.tolist():
        p = [int(o) for o in p]
        cv2.line(img, (p[0], p[1]), (p[2], p[3]), (0, 255, 0), 1)
        cv2.line(img, (p[2], p[3]), (p[4], p[5]), (0, 255, 0), 1)
        cv2.line(img, (p[4], p[5]), (p[6], p[7]), (0, 255, 0), 1)
        cv2.line(img, (p[6], p[7]), (p[0], p[1]), (0, 255, 0), 1)
    return img


def nms_python(boxes):
    boxes = sorted(boxes, key=lambda x: -x[8])
    nms_flag = [True] * len(boxes)
    for i, a in enumerate(boxes):
        if not nms_flag[i]:
            continue
        else:
            for j, b in enumerate(boxes):
                if not j > i:
                    continue
                if not nms_flag[j]:
                    continue
                score_a = a[8]
                score_b = b[8]
                rbox_a = polygon2rbox(a[:8])
                rbox_b = polygon2rbox(b[:8])
                if point_in_rbox(rbox_a[:2], rbox_b) or point_in_rbox(
                        rbox_b[:2], rbox_a):
                    if score_a > score_b:
                        nms_flag[j] = False
    boxes_nms = []
    for i, box in enumerate(boxes):
        if nms_flag[i]:
            boxes_nms.append(box)
    return boxes_nms


def point_in_rbox(c, rbox):
    cx0, cy0 = c[0], c[1]
    cx1, cy1 = rbox[0], rbox[1]
    w, h = rbox[2], rbox[3]
    theta = rbox[4]
    dist_x = np.abs((cx1 - cx0) * np.cos(theta) + (cy1 - cy0) * np.sin(theta))
    dist_y = np.abs(-(cx1 - cx0) * np.sin(theta) + (cy1 - cy0) * np.cos(theta))
    return (dist_x < w / 2.0) and (dist_y < h / 2.0)


def polygon2rbox(polygon):
    x1, x2, x3, x4 = polygon[0], polygon[2], polygon[4], polygon[6]
    y1, y2, y3, y4 = polygon[1], polygon[3], polygon[5], polygon[7]
    c_x = (x1 + x2 + x3 + x4) / 4
    c_y = (y1 + y2 + y3 + y4) / 4
    w1 = point_dist(x1, y1, x2, y2)
    w2 = point_dist(x3, y3, x4, y4)
    h1 = point_line_dist(c_x, c_y, x1, y1, x2, y2)
    h2 = point_line_dist(c_x, c_y, x3, y3, x4, y4)
    h = h1 + h2
    w = (w1 + w2) / 2
    theta1 = np.arctan2(y2 - y1, x2 - x1)
    theta2 = np.arctan2(y3 - y4, x3 - x4)
    theta = (theta1 + theta2) / 2.0
    return [c_x, c_y, w, h, theta]


def point_line_dist(px, py, x1, y1, x2, y2):
    eps = 1e-6
    dx = x2 - x1
    dy = y2 - y1
    div = np.sqrt(dx * dx + dy * dy) + eps
    dist = np.abs(px * dy - py * dx + x2 * y1 - y2 * x1) / div
    return dist
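A small worked example of the geometry (illustration only, not part of the diff): an axis-aligned rbox [cx, cy, w, h, theta] with theta = 0 unfolds into its four corners, clockwise from the top-left.

import numpy as np

rboxes = np.array([[10., 10., 4., 2., 0.]])  # cx, cy, w, h, theta
print(rboxes_to_polygons(rboxes))
# [[ 8.  9. 12.  9. 12. 11.  8. 11.]]  -> (x1, y1) ... (x4, y4)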
@@ -1 +1 @@
-from .image_captioning import ImageCaptionPipeline
+from .image_captioning_pipeline import ImageCaptionPipeline

@@ -0,0 +1,33 @@
from typing import Any, Dict, Optional, Union

from modelscope.preprocessors import OfaImageCaptionPreprocessor, Preprocessor
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
from ..base import Model, Pipeline
from ..builder import PIPELINES

logger = get_logger()


@PIPELINES.register_module(Tasks.image_captioning, module_name='ofa')
class ImageCaptionPipeline(Pipeline):

    def __init__(self,
                 model: Union[Model, str],
                 preprocessor: Optional[Preprocessor] = None,
                 **kwargs):
        assert isinstance(model, str) or isinstance(model, Model), \
            'model must be a single str or OfaForImageCaptioning'
        if isinstance(model, str):
            pipe_model = Model.from_pretrained(model)
        elif isinstance(model, Model):
            pipe_model = model
        else:
            raise NotImplementedError
        if preprocessor is None and pipe_model:
            preprocessor = OfaImageCaptionPreprocessor(model_dir=model)
        super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        return inputs
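Usage then looks roughly like the sketch below (illustration only): the model id is a placeholder, and Pipeline.__call__ is assumed to chain preprocess, forward, and postprocess as elsewhere in modelscope.

pipe = ImageCaptionPipeline(model='<model-id-or-local-model-dir>')
result = pipe('data/test/images/image_captioning.png')
print(result)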
@@ -1,4 +1,5 @@
from .sentence_similarity_pipeline import *  # noqa F403
from .sequence_classification_pipeline import *  # noqa F403
from .text_generation_pipeline import *  # noqa F403
+from .word_segmentation_pipeline import *  # noqa F403
from .zero_shot_classification_pipeline import *  # noqa F403
@@ -1,8 +1,5 @@
-import os
-import uuid
from typing import Any, Dict, Union

import json
import numpy as np

from modelscope.models.nlp import SbertForSentenceSimilarity
@@ -1,8 +1,5 @@
-import os
-import uuid
from typing import Any, Dict, Union

import json
import numpy as np

from modelscope.models.nlp import BertForSequenceClassification
@@ -1,7 +1,7 @@
from typing import Dict, Optional, Union

from modelscope.models import Model
-from modelscope.models.nlp import PalmForTextGenerationModel
+from modelscope.models.nlp import PalmForTextGeneration
from modelscope.preprocessors import TextGenerationPreprocessor
from modelscope.utils.constant import Tasks
from ..base import Pipeline, Tensor
@@ -10,11 +10,11 @@ from ..builder import PIPELINES

__all__ = ['TextGenerationPipeline']


-@PIPELINES.register_module(Tasks.text_generation, module_name=r'palm')
+@PIPELINES.register_module(Tasks.text_generation, module_name=r'palm2.0')
class TextGenerationPipeline(Pipeline):

    def __init__(self,
-                 model: Union[PalmForTextGenerationModel, str],
+                 model: Union[PalmForTextGeneration, str],
                 preprocessor: Optional[TextGenerationPreprocessor] = None,
                 **kwargs):
        """use `model` and `preprocessor` to create a nlp text generation pipeline for prediction
@@ -23,16 +23,16 @@ class TextGenerationPipeline(Pipeline):
            model (PalmForTextGeneration): a model instance
            preprocessor (TextGenerationPreprocessor): a preprocessor instance
        """
-        sc_model = model if isinstance(
-            model,
-            PalmForTextGenerationModel) else Model.from_pretrained(model)
+        model = model if isinstance(
+            model, PalmForTextGeneration) else Model.from_pretrained(model)
        if preprocessor is None:
            preprocessor = TextGenerationPreprocessor(
-                sc_model.model_dir,
+                model.model_dir,
+                model.tokenizer,
                first_sequence='sentence',
                second_sequence=None)
-        super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)
-        self.tokenizer = preprocessor.tokenizer
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+        self.tokenizer = model.tokenizer

    def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, str]:
        """process the prediction results
@@ -43,17 +43,20 @@ class TextGenerationPipeline(Pipeline):

        Returns:
            Dict[str, str]: the prediction results
        """
+        replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''),
+                               ('[unused1]', ''), (r' +', ' '), ('[SEP]', ''),
+                               ('[unused2]', ''), ('[CLS]', ''), ('[UNK]', ''))
+        replace_tokens_roberta = ((r' +', ' '), ('<mask>', '<q>'),
+                                  ('<pad>', ''), ('<s>', ''), ('</s>', ''),
+                                  ('<unk>', ' '))
+
        vocab_size = len(self.tokenizer.vocab)
        pred_list = inputs['predictions']
        pred_ids = pred_list[0][0].cpu().numpy().tolist()
        for j in range(len(pred_ids)):
            if pred_ids[j] >= vocab_size:
                pred_ids[j] = 100
-        pred = self.tokenizer.convert_ids_to_tokens(pred_ids)
-        pred_string = ''.join(pred).replace(
-            '##',
-            '').split('[SEP]')[0].replace('[CLS]',
-                                          '').replace('[SEP]',
-                                                      '').replace('[UNK]', '')
+        pred_string = self.tokenizer.decode(pred_ids)
+        for _old, _new in replace_tokens_bert:
+            pred_string = pred_string.replace(_old, _new)
+        pred_string = pred_string.strip()
+        for _old, _new in replace_tokens_roberta:
+            pred_string = pred_string.replace(_old, _new)
+        pred_string = pred_string.strip()
        return {'text': pred_string}
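The effect of the new postprocess cleanup can be seen on a plain string (illustration only). Note that str.replace is literal, so the (r' +', ' ') entry only removes the two-character sequence "space plus"; collapsing runs of spaces would need re.sub(r' +', ' ', s).

replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''), ('[unused1]', ''),
                       ('[SEP]', ''), ('[unused2]', ''), ('[CLS]', ''),
                       ('[UNK]', ''))
raw = '[CLS] 今天 天气 不错 [SEP] [PAD] [PAD]'
for _old, _new in replace_tokens_bert:
    raw = raw.replace(_old, _new)
print(raw.strip())  # '今天 天气 不错'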
modelscope/pipelines/nlp/word_segmentation_pipeline.py (new file, 69 lines)
@@ -0,0 +1,69 @@
from typing import Any, Dict, Optional, Union

from modelscope.models import Model
from modelscope.models.nlp import StructBertForTokenClassification
from modelscope.preprocessors import TokenClassifcationPreprocessor
from modelscope.utils.constant import Tasks
from ..base import Pipeline, Tensor
from ..builder import PIPELINES

__all__ = ['WordSegmentationPipeline']


@PIPELINES.register_module(
    Tasks.word_segmentation,
    module_name=r'structbert-chinese-word-segmentation')
class WordSegmentationPipeline(Pipeline):

    def __init__(self,
                 model: Union[StructBertForTokenClassification, str],
                 preprocessor: Optional[TokenClassifcationPreprocessor] = None,
                 **kwargs):
        """use `model` and `preprocessor` to create a nlp word segmentation pipeline for prediction

        Args:
            model (StructBertForTokenClassification): a model instance
            preprocessor (TokenClassifcationPreprocessor): a preprocessor instance
        """
        model = model if isinstance(
            model,
            StructBertForTokenClassification) else Model.from_pretrained(model)
        if preprocessor is None:
            preprocessor = TokenClassifcationPreprocessor(model.model_dir)
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
        self.tokenizer = preprocessor.tokenizer
        self.config = model.config
        self.id2label = self.config.id2label

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
        """process the prediction results

        Args:
            inputs (Dict[str, Any]): the model predictions and the input text

        Returns:
            Dict[str, str]: the prediction results
        """
        pred_list = inputs['predictions']
        labels = []
        for pre in pred_list:
            labels.append(self.id2label[pre])
        labels = labels[1:-1]
        chunks = []
        chunk = ''
        assert len(inputs['text']) == len(labels)
        for token, label in zip(inputs['text'], labels):
            if label[0] == 'B' or label[0] == 'I':
                chunk += token
            else:
                chunk += token
                chunks.append(chunk)
                chunk = ''
        if chunk:
            chunks.append(chunk)
        seg_result = ' '.join(chunks)
        rst = {
            'output': seg_result,
        }
        return rst
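The label-to-chunk rule in postprocess amounts to: B and I extend the current word, while anything else (E or S under a BIES scheme) also consumes the character and then closes the word. A toy run with hypothetical labels:

text = '今天天气不错'
labels = ['B', 'E', 'B', 'E', 'B', 'E']  # hypothetical tags, one per character
chunks, chunk = [], ''
for token, label in zip(text, labels):
    chunk += token
    if label[0] not in ('B', 'I'):  # closing tag ends the current word
        chunks.append(chunk)
        chunk = ''
if chunk:
    chunks.append(chunk)
print(' '.join(chunks))  # 今天 天气 不错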
@@ -54,6 +54,13 @@ TASK_OUTPUTS = {
    # }
    Tasks.pose_estimation: ['poses', 'boxes'],

+    # ocr detection result for single sample
+    # {
+    #   "det_polygons": np.array with shape [num_text, 8], each box is
+    #       [x1, y1, x2, y2, x3, y3, x4, y4]
+    # }
+    Tasks.ocr_detection: ['det_polygons'],
+
    # ============ nlp tasks ===================

    # text classification result for single sample
@@ -69,8 +76,27 @@ TASK_OUTPUTS = {
    # }
    Tasks.text_generation: ['text'],

+    # word segmentation result for single sample
+    # {
+    #   "output": "今天 天气 不错 , 适合 出去 游玩"
+    # }
+    Tasks.word_segmentation: ['output'],
+
+    # sentence similarity result for single sample
+    # {
+    #   "labels": "1",
+    #   "scores": 0.9
+    # }
+    Tasks.sentence_similarity: ['scores', 'labels'],
+
+    # ============ audio tasks ===================
+
+    # audio processed for single file in PCM format
+    # {
+    #   "output_pcm": np.array with shape(samples,) and dtype float32
+    # }
+    Tasks.speech_signal_process: ['output_pcm'],
+
    # ============ multi-modal tasks ===================

    # image caption result for single sample
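TASK_OUTPUTS is effectively a per-task output schema. A sketch of the kind of check a pipeline test can run against it (the import locations of TASK_OUTPUTS and Tasks are assumptions):

def check_output_keys(task, output: dict):
    # every key the schema names must be present in the pipeline output
    missing = [key for key in TASK_OUTPUTS[task] if key not in output]
    assert not missing, f'{task} output is missing keys: {missing}'

check_output_keys(Tasks.word_segmentation, {'output': '今天 天气 不错'})
check_output_keys(Tasks.ocr_detection, {'det_polygons': []})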
@@ -1,7 +1,10 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

+from .audio import LinearAECAndFbank
from .base import Preprocessor
from .builder import PREPROCESSORS, build_preprocessor
from .common import Compose
from .image import LoadImage, load_image
+from .multi_model import OfaImageCaptionPreprocessor
from .nlp import *  # noqa F403
+from .text_to_speech import *  # noqa F403
modelscope/preprocessors/audio.py (new file, 231 lines)
@@ -0,0 +1,231 @@
import ctypes
import os
from typing import Any, Dict

import numpy as np
import scipy.io.wavfile as wav
import torch
from numpy.ctypeslib import ndpointer

from modelscope.utils.constant import Fields
from .builder import PREPROCESSORS


def load_wav(path):
    samp_rate, data = wav.read(path)
    return np.float32(data), samp_rate


def load_library(libaec):
    libaec_in_cwd = os.path.join('.', libaec)
    if os.path.exists(libaec_in_cwd):
        libaec = libaec_in_cwd
    mitaec = ctypes.cdll.LoadLibrary(libaec)
    fe_process = mitaec.fe_process_inst
    fe_process.argtypes = [
        ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
        ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'), ctypes.c_int,
        ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
        ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
        ndpointer(ctypes.c_float, flags='C_CONTIGUOUS')
    ]
    return fe_process


def do_linear_aec(fe_process, mic, ref, int16range=True):
    mic = np.float32(mic)
    ref = np.float32(ref)
    if len(mic) > len(ref):
        mic = mic[:len(ref)]
    out_mic = np.zeros_like(mic)
    out_linear = np.zeros_like(mic)
    out_echo = np.zeros_like(mic)
    out_ref = np.zeros_like(mic)
    if int16range:
        mic /= 32768
        ref /= 32768
    fe_process(mic, ref, len(mic), out_mic, out_linear, out_echo)
    # out_ref not in use here
    if int16range:
        out_mic *= 32768
        out_linear *= 32768
        out_echo *= 32768
    return out_mic, out_ref, out_linear, out_echo


def load_kaldi_feature_transform(filename):
    fp = open(filename, 'r')
    all_str = fp.read()
    pos1 = all_str.find('AddShift')
    pos2 = all_str.find('[', pos1)
    pos3 = all_str.find(']', pos2)
    mean = np.fromstring(all_str[pos2 + 1:pos3], dtype=np.float32, sep=' ')
    pos1 = all_str.find('Rescale')
    pos2 = all_str.find('[', pos1)
    pos3 = all_str.find(']', pos2)
    scale = np.fromstring(all_str[pos2 + 1:pos3], dtype=np.float32, sep=' ')
    fp.close()
    return mean, scale


class Feature:
    r"""Extract feat from one utterance."""

    def __init__(self,
                 fbank_config,
                 feat_type='spec',
                 mvn_file=None,
                 cuda=False):
        r"""
        Args:
            fbank_config (dict): config passed to kaldi.fbank
            feat_type (str):
                raw: do nothing
                fbank: use kaldi.fbank
                spec: Real/Imag
                logpow: log(1+|x|^2)
            mvn_file (str): the path of data file for mean variance normalization
            cuda: move the window and mvn tensors to GPU
        """
        self.fbank_config = fbank_config
        self.feat_type = feat_type
        self.n_fft = fbank_config['frame_length'] * fbank_config[
            'sample_frequency'] // 1000
        self.hop_length = fbank_config['frame_shift'] * fbank_config[
            'sample_frequency'] // 1000
        self.window = torch.hamming_window(self.n_fft, periodic=False)

        self.mvn = False
        if mvn_file is not None and os.path.exists(mvn_file):
            print(f'loading mvn file: {mvn_file}')
            shift, scale = load_kaldi_feature_transform(mvn_file)
            self.shift = torch.from_numpy(shift)
            self.scale = torch.from_numpy(scale)
            self.mvn = True
        if cuda:
            self.window = self.window.cuda()
            if self.mvn:
                self.shift = self.shift.cuda()
                self.scale = self.scale.cuda()

    def compute(self, utt):
        r"""
        Args:
            utt: in [-32768, 32767] range

        Returns:
            [..., T, F]
        """
        if self.feat_type == 'raw':
            return utt
        elif self.feat_type == 'fbank':
            # have to use local import before modelscope framework supports lazy loading
            import torchaudio.compliance.kaldi as kaldi
            if len(utt.shape) == 1:
                utt = utt.unsqueeze(0)
            feat = kaldi.fbank(utt, **self.fbank_config)
        elif self.feat_type == 'spec':
            spec = torch.stft(
                utt / 32768,
                self.n_fft,
                self.hop_length,
                self.n_fft,
                self.window,
                center=False,
                return_complex=True)
            feat = torch.cat([spec.real, spec.imag], dim=-2).permute(-1, -2)
        elif self.feat_type == 'logpow':
            spec = torch.stft(
                utt,
                self.n_fft,
                self.hop_length,
                self.n_fft,
                self.window,
                center=False,
                return_complex=True)
            abspow = torch.abs(spec)**2
            feat = torch.log(1 + abspow).permute(-1, -2)
        return feat

    def normalize(self, feat):
        if self.mvn:
            feat = feat + self.shift
            feat = feat * self.scale
        return feat


@PREPROCESSORS.register_module(Fields.audio)
class LinearAECAndFbank:
    SAMPLE_RATE = 16000

    def __init__(self, io_config):
        self.trunc_length = 7200 * self.SAMPLE_RATE
        self.linear_aec_delay = io_config['linear_aec_delay']
        self.feature = Feature(io_config['fbank_config'],
                               io_config['feat_type'], io_config['mvn'])
        self.mitaec = load_library(io_config['mitaec_library'])
        self.mask_on_mic = io_config['mask_on'] == 'nearend_mic'

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Linearly filter the near-end mic and far-end audio, then extract the feature.

        :param data: dict with the corresponding audios under keys "nearend_mic" and "farend_speech"
        :return: dict with Tensor values: "base" (linearly filtered audio), "target" and "feature"
        """
        # read files
        nearend_mic, fs = load_wav(data['nearend_mic'])
        assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
        farend_speech, fs = load_wav(data['farend_speech'])
        assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
        if 'nearend_speech' in data:
            nearend_speech, fs = load_wav(data['nearend_speech'])
            assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
        else:
            nearend_speech = np.zeros_like(nearend_mic)

        out_mic, out_ref, out_linear, out_echo = do_linear_aec(
            self.mitaec, nearend_mic, farend_speech)
        # fix 20ms linear aec delay by delaying the target speech
        extra_zeros = np.zeros([int(self.linear_aec_delay * fs)])
        nearend_speech = np.concatenate([extra_zeros, nearend_speech])
        # truncate files to the same length
        flen = min(
            len(out_mic), len(out_ref), len(out_linear), len(out_echo),
            len(nearend_speech))
        fstart = 0
        flen = min(flen, self.trunc_length)
        nearend_mic, out_ref, out_linear, out_echo, nearend_speech = (
            out_mic[fstart:flen], out_ref[fstart:flen],
            out_linear[fstart:flen], out_echo[fstart:flen],
            nearend_speech[fstart:flen])

        # extract features (frames, [mic, linear, ref, aes?])
        feat = torch.FloatTensor()

        nearend_mic = torch.from_numpy(np.float32(nearend_mic))
        fbank_nearend_mic = self.feature.compute(nearend_mic)
        feat = torch.cat([feat, fbank_nearend_mic], dim=1)

        out_linear = torch.from_numpy(np.float32(out_linear))
        fbank_out_linear = self.feature.compute(out_linear)
        feat = torch.cat([feat, fbank_out_linear], dim=1)

        out_echo = torch.from_numpy(np.float32(out_echo))
        fbank_out_echo = self.feature.compute(out_echo)
        feat = torch.cat([feat, fbank_out_echo], dim=1)

        # feature transform
        feat = self.feature.normalize(feat)

        # prepare target
        if nearend_speech is not None:
            nearend_speech = torch.from_numpy(np.float32(nearend_speech))

        if self.mask_on_mic:
            base = nearend_mic
        else:
            base = out_linear
        out_data = {'base': base, 'target': nearend_speech, 'feature': feat}
        return out_data
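A self-contained sketch of the 'spec' branch of Feature.compute (not part of the diff; the fbank_config values are hypothetical but typical for 16 kHz audio, and only torch is required):

import torch

sample_rate, frame_length_ms, frame_shift_ms = 16000, 32, 16
n_fft = frame_length_ms * sample_rate // 1000      # 512
hop_length = frame_shift_ms * sample_rate // 1000  # 256
window = torch.hamming_window(n_fft, periodic=False)

utt = torch.zeros(sample_rate)  # one second of silence, int16-range convention
spec = torch.stft(utt / 32768, n_fft, hop_length, n_fft, window,
                  center=False, return_complex=True)
# stack real and imaginary parts along frequency, then put time first
feat = torch.cat([spec.real, spec.imag], dim=-2).permute(-1, -2)
print(feat.shape)  # (frames, 2 * (n_fft // 2 + 1)) == (61, 514)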
@@ -1,32 +1,50 @@
-from typing import Any, Dict
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+from typing import Any, Dict, Union

import numpy as np
import torch
from maas_hub.snapshot_download import snapshot_download
from PIL import Image

-from modelscope.pipelines.base import Input
-from modelscope.preprocessors import load_image
-from modelscope.utils.constant import Tasks
-from modelscope.utils.logger import get_logger
-from ..base import Pipeline
-from ..builder import PIPELINES
-
-logger = get_logger()
+from modelscope.utils.constant import Fields, ModelFile
+from modelscope.utils.hub import get_model_cache_dir
+from modelscope.utils.type_assert import type_assert
+from .base import Preprocessor
+from .builder import PREPROCESSORS
+from .image import load_image

+__all__ = [
+    'OfaImageCaptionPreprocessor',
+]

-@PIPELINES.register_module(Tasks.image_captioning, module_name='ofa')
-class ImageCaptionPipeline(Pipeline):
-    # TODO: refine using modelhub
-    def __init__(self, model: str, bpe_dir: str):
-        super().__init__()
-        # turn on cuda if GPU is available
+
+@PREPROCESSORS.register_module(
+    Fields.multi_modal, module_name=r'ofa-image-caption')
+class OfaImageCaptionPreprocessor(Preprocessor):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """preprocess the data via the vocab.txt from the `model_dir` path
+
+        Args:
+            model_dir (str): model path
+        """
+        super().__init__(*args, **kwargs)
+
+        if osp.exists(model_dir):
+            local_model_dir = model_dir
+        else:
+            cache_path = get_model_cache_dir(model_dir)
+            local_model_dir = cache_path if osp.exists(
+                cache_path) else snapshot_download(model_dir)
+        local_model = osp.join(local_model_dir, ModelFile.TORCH_MODEL_FILE)
+        bpe_dir = local_model_dir

        from fairseq import checkpoint_utils, tasks, utils
        from ofa.tasks.mm_tasks import CaptionTask

        tasks.register_task('caption', CaptionTask)
        use_cuda = False
        # use fp16 only when GPU is available
        use_fp16 = False

        overrides = {
            'bpe_dir': bpe_dir,
            'eval_cider': False,
@@ -35,21 +53,9 @@ class ImageCaptionPipeline(Pipeline):
            'no_repeat_ngram_size': 3,
            'seed': 7
        }
-        models, cfg, task = checkpoint_utils.load_model_ensemble_and_task(
-            utils.split_paths(model), arg_overrides=overrides)
-
-        # Move models to GPU
-        for model in models:
-            model.eval()
-            if use_cuda:
-                model.cuda()
-            if use_fp16:
-                model.half()
-            model.prepare_for_inference_(cfg)
-        self.models = models
-        # Initialize generator
-        self.generator = task.build_generator(models, cfg.generation)
-
+        model, cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+            utils.split_paths(local_model), arg_overrides=overrides)
+        del model
        # Initialize transform
        from torchvision import transforms
        mean = [0.5, 0.5, 0.5]
@@ -69,7 +75,8 @@ class ImageCaptionPipeline(Pipeline):
        self.eos_item = torch.LongTensor([task.src_dict.eos()])
        self.pad_idx = task.src_dict.pad()

-    def preprocess(self, input: Input) -> Dict[str, Any]:
+    @type_assert(object, (str, tuple))
+    def __call__(self, data: Union[str, tuple]) -> Dict[str, Any]:

        def encode_text(text, length=None, append_bos=False, append_eos=False):
            s = self.task.tgt_dict.encode_line(
@@ -88,7 +95,7 @@ class ImageCaptionPipeline(Pipeline):
            patch_image = self.patch_resize_transform(input).unsqueeze(0)
        else:
            patch_image = self.patch_resize_transform(
-                load_image(input)).unsqueeze(0)
+                load_image(data)).unsqueeze(0)
        patch_mask = torch.tensor([True])
        text = 'what does the image describe?'
        src_text = encode_text(
@@ -105,17 +112,3 @@ class ImageCaptionPipeline(Pipeline):
            }
        }
        return sample
-
-    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
-        from ofa.utils.eval_utils import eval_caption
-
-        results, _ = eval_caption(self.task, self.generator, self.models,
-                                  input)
-        return {
-            'image_id': results[0]['image_id'],
-            'caption': results[0]['caption']
-        }
-
-    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
-        # What should we do here ?
-        return inputs
@@ -12,7 +12,8 @@ from .builder import PREPROCESSORS

__all__ = [
    'Tokenize', 'SequenceClassificationPreprocessor',
-    'TextGenerationPreprocessor', 'ZeroShotClassificationPreprocessor'
+    'TextGenerationPreprocessor', 'ZeroShotClassificationPreprocessor',
+    'TokenClassifcationPreprocessor'
]

@@ -53,12 +54,12 @@ class SequenceClassificationPreprocessor(Preprocessor):
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
        print(f'this is the tokenizer {self.tokenizer}')

-    @type_assert(object, (str, tuple))
-    def __call__(self, data: Union[str, tuple]) -> Dict[str, Any]:
+    @type_assert(object, (str, tuple, Dict))
+    def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]:
        """process the raw input data

        Args:
-            data (str or tuple):
+            data (str, tuple or Dict):
                sentence1 (str): a sentence
                    Example:
                        'you are so handsome.'
@@ -70,22 +71,31 @@ class SequenceClassificationPreprocessor(Preprocessor):
                sentence2 (str): a sentence
                    Example:
                        'you are so beautiful.'
+                or
+                {field1: field_value1, field2: field_value2}
+                    field1 (str): field name, default 'first_sequence'
+                    field_value1 (str): a sentence
+                        Example:
+                            'you are so handsome.'
+
+                    field2 (str): field name, default 'second_sequence'
+                    field_value2 (str): a sentence
+                        Example:
+                            'you are so beautiful.'

        Returns:
            Dict[str, Any]: the preprocessed data
        """
-        if not isinstance(data, tuple):
-            data = (
-                data,
-                None,
-            )
-
-        sentence1, sentence2 = data
-        new_data = {
-            self.first_sequence: sentence1,
-            self.second_sequence: sentence2
-        }
+        if isinstance(data, str):
+            new_data = {self.first_sequence: data}
+        elif isinstance(data, tuple):
+            sentence1, sentence2 = data
+            new_data = {
+                self.first_sequence: sentence1,
+                self.second_sequence: sentence2
+            }
+        else:
+            new_data = data

        # preprocess the data for the model input

@@ -115,17 +125,15 @@ class SequenceClassificationPreprocessor(Preprocessor):
        return rst


-@PREPROCESSORS.register_module(Fields.nlp, module_name=r'palm')
+@PREPROCESSORS.register_module(Fields.nlp, module_name=r'palm2.0')
class TextGenerationPreprocessor(Preprocessor):

-    def __init__(self, model_dir: str, *args, **kwargs):
+    def __init__(self, model_dir: str, tokenizer, *args, **kwargs):
        """preprocess the data using the vocab.txt from the `model_dir` path

        Args:
            model_dir (str): model path
        """
-        from sofa import PalmTokenizer
-
        super().__init__(*args, **kwargs)

        self.model_dir: str = model_dir
@@ -134,7 +142,7 @@ class TextGenerationPreprocessor(Preprocessor):
        self.second_sequence: str = kwargs.pop('second_sequence',
                                               'second_sequence')
        self.sequence_length: int = kwargs.pop('sequence_length', 128)
-        self.tokenizer = PalmTokenizer.from_pretrained(model_dir)
+        self.tokenizer = tokenizer

    @type_assert(object, str)
    def __call__(self, data: str) -> Dict[str, Any]:
@@ -153,7 +161,7 @@ class TextGenerationPreprocessor(Preprocessor):
        new_data = {self.first_sequence: data}
        # preprocess the data for the model input

-        rst = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []}
+        rst = {'input_ids': [], 'attention_mask': []}

        max_seq_length = self.sequence_length

@@ -168,7 +176,6 @@ class TextGenerationPreprocessor(Preprocessor):

        rst['input_ids'].append(feature['input_ids'])
        rst['attention_mask'].append(feature['attention_mask'])
-        rst['token_type_ids'].append(feature['token_type_ids'])

        return {k: torch.tensor(v) for k, v in rst.items()}

@@ -191,7 +198,6 @@ class ZeroShotClassificationPreprocessor(Preprocessor):
        self.sequence_length = kwargs.pop('sequence_length', 512)
        self.candidate_labels = kwargs.pop('candidate_labels')
        self.hypothesis_template = kwargs.pop('hypothesis_template', '{}')
-        self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir)

    @type_assert(object, str)
    def __call__(self, data: str) -> Dict[str, Any]:
@@ -216,3 +222,52 @@ class ZeroShotClassificationPreprocessor(Preprocessor):
            return_tensors='pt',
            truncation_strategy='only_first')
        return features
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=r'bert-token-classification')
+class TokenClassifcationPreprocessor(Preprocessor):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """preprocess the data via the vocab.txt from the `model_dir` path
+
+        Args:
+            model_dir (str): model path
+        """
+        super().__init__(*args, **kwargs)
+
+        from sofa import SbertTokenizer
+        self.model_dir: str = model_dir
+        self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir)
+
+    @type_assert(object, str)
+    def __call__(self, data: str) -> Dict[str, Any]:
+        """process the raw input data
+
+        Args:
+            data (str): a sentence
+                Example:
+                    'you are so handsome.'
+
+        Returns:
+            Dict[str, Any]: the preprocessed data
+        """
+        # preprocess the data for the model input
+
+        text = data.replace(' ', '').strip()
+        tokens = []
+        for token in text:
+            token = self.tokenizer.tokenize(token)
+            tokens.extend(token)
+        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
+        input_ids = self.tokenizer.build_inputs_with_special_tokens(input_ids)
+        attention_mask = [1] * len(input_ids)
+        token_type_ids = [0] * len(input_ids)
+        return {
+            'text': text,
+            'input_ids': input_ids,
+            'attention_mask': attention_mask,
+            'token_type_ids': token_type_ids
+        }
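The per-character tokenization above is what keeps len(inputs['text']) == len(labels) in the word segmentation pipeline. A stub run without sofa (ids and special tokens here are hypothetical stand-ins):

text = '今天 天气 不错'.replace(' ', '').strip()
tokens = list(text)                                # one token per character
input_ids = list(range(1000, 1000 + len(tokens)))  # stand-in vocab ids
input_ids = [101] + input_ids + [102]              # BERT-style [CLS] ... [SEP]
attention_mask = [1] * len(input_ids)
token_type_ids = [0] * len(input_ids)
assert len(input_ids) == len(tokens) + 2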
modelscope/preprocessors/text_to_speech.py (new file, 51 lines)
@@ -0,0 +1,51 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import io
from typing import Any, Dict, Union

from modelscope.fileio import File
from modelscope.models.audio.tts.frontend import GenericTtsFrontend
from modelscope.models.base import Model
from modelscope.utils.audio.tts_exceptions import *  # noqa F403
from modelscope.utils.constant import Fields
from .base import Preprocessor
from .builder import PREPROCESSORS

__all__ = ['TextToTacotronSymbols', 'text_to_tacotron_symbols']


@PREPROCESSORS.register_module(
    Fields.audio, module_name=r'text_to_tacotron_symbols')
class TextToTacotronSymbols(Preprocessor):
    """Extract tacotron symbols from text.

    Args:
        model_name (str): TTS frontend model name or resource url
        lang_type (str): language type, valid values are "pinyin" and "chenmix"
    """

    def __init__(self, model_name, lang_type='pinyin'):
        self._frontend_model = Model.from_pretrained(
            model_name, lang_type=lang_type)
        assert self._frontend_model is not None, 'load model from pretrained failed'

    def __call__(self, data: str) -> Dict[str, Any]:
        """Call functions to load text and get tacotron symbols.

        Args:
            data (str): utf-8 encoded text
        Returns:
            symbols (list[str]): texts in tacotron symbols format.
        """
        return self._frontend_model.forward(data)


def text_to_tacotron_symbols(text='', path='./', lang='pinyin'):
    """Simple interface to transform text to tacotron symbols.

    Args:
        text (str): input text
        path (str): resource path
        lang (str): language type from one of "pinyin" and "chenmix"
    """
    transform = TextToTacotronSymbols(path, lang)
    return transform(text)
modelscope/pydatasets/config.py (new file, 22 lines)
@@ -0,0 +1,22 @@
import os
from pathlib import Path

# Cache location
DEFAULT_CACHE_HOME = '~/.cache'
CACHE_HOME = os.getenv('CACHE_HOME', DEFAULT_CACHE_HOME)
DEFAULT_MS_CACHE_HOME = os.path.join(CACHE_HOME, 'modelscope/hub')
MS_CACHE_HOME = os.path.expanduser(
    os.getenv('MS_CACHE_HOME', DEFAULT_MS_CACHE_HOME))

DEFAULT_MS_DATASETS_CACHE = os.path.join(MS_CACHE_HOME, 'datasets')
MS_DATASETS_CACHE = Path(
    os.getenv('MS_DATASETS_CACHE', DEFAULT_MS_DATASETS_CACHE))

DOWNLOADED_DATASETS_DIR = 'downloads'
DEFAULT_DOWNLOADED_DATASETS_PATH = os.path.join(MS_DATASETS_CACHE,
                                                DOWNLOADED_DATASETS_DIR)
DOWNLOADED_DATASETS_PATH = Path(
    os.getenv('DOWNLOADED_DATASETS_PATH', DEFAULT_DOWNLOADED_DATASETS_PATH))

MS_HUB_ENDPOINT = os.environ.get('MS_HUB_ENDPOINT',
                                 'http://101.201.119.157:31752')
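Since every path here is read from the environment at import time, overrides must be set before modelscope.pydatasets.config is first imported. A sketch:

import os

os.environ['MS_CACHE_HOME'] = '/tmp/ms_cache'  # must happen before the import
from modelscope.pydatasets.config import MS_DATASETS_CACHE

print(MS_DATASETS_CACHE)  # /tmp/ms_cache/datasets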
@@ -1,64 +1,81 @@
|
||||
from typing import (Any, Callable, Dict, List, Mapping, Optional, Sequence,
|
||||
Union)
|
||||
import os
|
||||
from typing import (Any, Callable, Dict, Iterable, List, Mapping, Optional,
|
||||
Sequence, Union)
|
||||
|
||||
from datasets import Dataset, load_dataset
|
||||
import numpy as np
|
||||
from datasets import Dataset
|
||||
from datasets import load_dataset as hf_load_dataset
|
||||
from datasets.config import TF_AVAILABLE, TORCH_AVAILABLE
|
||||
from datasets.packaged_modules import _PACKAGED_DATASETS_MODULES
|
||||
from datasets.utils.file_utils import (is_relative_path,
|
||||
relative_to_absolute_path)
|
||||
|
||||
from modelscope.pydatasets.config import MS_DATASETS_CACHE
|
||||
from modelscope.pydatasets.utils.ms_api import MsApi
|
||||
from modelscope.utils.constant import Hubs
|
||||
from modelscope.utils.logger import get_logger
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
|
||||
def format_list(para) -> List:
|
||||
if para is None:
|
||||
para = []
|
||||
elif isinstance(para, str):
|
||||
para = [para]
|
||||
elif len(set(para)) < len(para):
|
||||
raise ValueError(f'List columns contains duplicates: {para}')
|
||||
return para
|
||||
|
||||
|
||||
class PyDataset:
|
||||
_hf_ds = None # holds the underlying HuggingFace Dataset
|
||||
"""A PyDataset backed by hugging face Dataset."""
|
||||
|
||||
def __init__(self, hf_ds: Dataset):
|
||||
def __init__(self, hf_ds: Dataset, target: Optional[str] = None):
|
||||
self._hf_ds = hf_ds
|
||||
self.target = None
|
||||
self.target = target
|
||||
|
||||
def __iter__(self):
|
||||
if isinstance(self._hf_ds, Dataset):
|
||||
for item in self._hf_ds:
|
||||
if self.target is not None:
|
||||
yield item[self.target]
|
||||
else:
|
||||
yield item
|
||||
else:
|
||||
for ds in self._hf_ds.values():
|
||||
for item in ds:
|
||||
if self.target is not None:
|
||||
yield item[self.target]
|
||||
else:
|
||||
yield item
|
||||
for item in self._hf_ds:
|
||||
if self.target is not None:
|
||||
yield item[self.target]
|
||||
else:
|
||||
yield item
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self._hf_ds[key]
|
||||
|
||||
@classmethod
|
||||
def from_hf_dataset(cls,
|
||||
hf_ds: Dataset,
|
||||
target: str = None) -> 'PyDataset':
|
||||
dataset = cls(hf_ds)
|
||||
dataset.target = target
|
||||
return dataset
|
||||
target: str = None) -> Union[dict, 'PyDataset']:
|
||||
if isinstance(hf_ds, Dataset):
|
||||
return cls(hf_ds, target)
|
||||
if len(hf_ds.keys()) == 1:
|
||||
return cls(next(iter(hf_ds.values())), target)
|
||||
return {k: cls(v, target) for k, v in hf_ds.items()}
|
||||
|
||||
@staticmethod
|
||||
def load(path: Union[str, list],
|
||||
target: Optional[str] = None,
|
||||
version: Optional[str] = None,
|
||||
name: Optional[str] = None,
|
||||
split: Optional[str] = None,
|
||||
data_dir: Optional[str] = None,
|
||||
data_files: Optional[Union[str, Sequence[str],
|
||||
Mapping[str,
|
||||
Union[str,
|
||||
Sequence[str]]]]] = None,
|
||||
hub: Optional[Hubs] = None) -> 'PyDataset':
|
||||
def load(
|
||||
dataset_name: Union[str, list],
|
||||
target: Optional[str] = None,
|
||||
version: Optional[str] = None,
|
||||
hub: Optional[Hubs] = Hubs.modelscope,
|
||||
subset_name: Optional[str] = None,
|
||||
split: Optional[str] = None,
|
||||
data_dir: Optional[str] = None,
|
||||
data_files: Optional[Union[str, Sequence[str],
|
||||
Mapping[str, Union[str,
|
||||
Sequence[str]]]]] = None
|
||||
) -> Union[dict, 'PyDataset']:
|
||||
"""Load a PyDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
|
||||
Args:
|
||||
|
||||
path (str): Path or name of the dataset.
|
||||
dataset_name (str): Path or name of the dataset.
|
||||
target (str, optional): Name of the column to output.
|
||||
version (str, optional): Version of the dataset script to load:
|
||||
name (str, optional): Defining the subset_name of the dataset.
|
||||
subset_name (str, optional): Defining the subset_name of the dataset.
|
||||
data_dir (str, optional): Defining the data_dir of the dataset configuration. I
|
||||
data_files (str or Sequence or Mapping, optional): Path(s) to source data file(s).
|
||||
split (str, optional): Which split of the data to load.
|
||||
@@ -67,53 +84,302 @@ class PyDataset:
|
||||
Returns:
|
||||
PyDataset (obj:`PyDataset`): PyDataset object for a certain dataset.
|
||||
"""
|
||||
if Hubs.modelscope == hub:
|
||||
# TODO: parse data meta information from modelscope hub
|
||||
# and possibly download data files to local (and update path)
|
||||
print('getting data from modelscope hub')
|
||||
if isinstance(path, str):
|
||||
dataset = load_dataset(
|
||||
path,
|
||||
name=name,
|
||||
if hub == Hubs.huggingface:
|
||||
dataset = hf_load_dataset(
|
||||
dataset_name,
|
||||
name=subset_name,
|
||||
revision=version,
|
||||
split=split,
|
||||
data_dir=data_dir,
|
||||
data_files=data_files)
|
||||
elif isinstance(path, list):
|
||||
return PyDataset.from_hf_dataset(dataset, target=target)
|
||||
else:
|
||||
return PyDataset._load_ms_dataset(
|
||||
dataset_name,
|
||||
target=target,
|
||||
subset_name=subset_name,
|
||||
version=version,
|
||||
split=split,
|
||||
data_dir=data_dir,
|
||||
data_files=data_files)
|
||||
|
    @staticmethod
    def _load_ms_dataset(
            dataset_name: Union[str, list],
            target: Optional[str] = None,
            version: Optional[str] = None,
            subset_name: Optional[str] = None,
            split: Optional[str] = None,
            data_dir: Optional[str] = None,
            data_files: Optional[Union[str, Sequence[str],
                                       Mapping[str, Union[str,
                                                          Sequence[str]]]]] = None
    ) -> Union[dict, 'PyDataset']:
        if isinstance(dataset_name, str):
            use_hf = False
            if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \
                    (os.path.isfile(dataset_name) and dataset_name.endswith('.py')):
                use_hf = True
            elif is_relative_path(dataset_name):
                ms_api = MsApi()
                dataset_scripts = ms_api.fetch_dataset_scripts(
                    dataset_name, version)
                if 'py' in dataset_scripts:  # dataset copied from hf datasets
                    dataset_name = dataset_scripts['py'][0]
                    use_hf = True
            else:
                raise FileNotFoundError(
                    f"Couldn't find a dataset script at {relative_to_absolute_path(dataset_name)} "
                    f'or any data file in the same directory.')

            if use_hf:
                dataset = hf_load_dataset(
                    dataset_name,
                    name=subset_name,
                    revision=version,
                    split=split,
                    data_dir=data_dir,
                    data_files=data_files,
                    cache_dir=MS_DATASETS_CACHE)
            else:
                # TODO: load from ms datahub
                raise NotImplementedError(
                    f'Dataset {dataset_name} load from modelscope datahub to be implemented in '
                    f'the future')
        elif isinstance(dataset_name, list):
            if target is None:
                target = 'target'
            dataset = Dataset.from_dict({target: [p] for p in path})
            dataset = Dataset.from_dict({target: dataset_name})
        else:
            raise TypeError('path must be a str or a list, but got'
                            f' {type(path)}')
                            f' {type(dataset_name)}')
        return PyDataset.from_hf_dataset(dataset, target=target)
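For readers tracing the loading path above, here is a minimal usage sketch. It is illustrative only: the import path and the dataset name `'squad'` are assumptions, not names confirmed by this commit.
```python
# Illustrative sketch only; the import path and 'squad' are assumptions.
from modelscope.pydatasets import PyDataset

# A relative dataset name is resolved through MsApi.fetch_dataset_scripts and
# then handed to hf_load_dataset with cache_dir=MS_DATASETS_CACHE.
ds = PyDataset.load('squad', split='train')

# A list input is wrapped into a single-column Dataset named by `target`.
file_ds = PyDataset.load(['a.jpg', 'b.jpg'], target='image')
```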

    def to_torch_dataset_with_processors(
            self,
            preprocessors: Union[Callable, List[Callable]],
            columns: Union[str, List[str]] = None,
    ):
        preprocessor_list = preprocessors if isinstance(
            preprocessors, list) else [preprocessors]

        columns = format_list(columns)

        columns = [
            key for key in self._hf_ds.features.keys() if key in columns
        ]
        sample = next(iter(self._hf_ds))

        sample_res = {k: np.array(sample[k]) for k in columns}
        for processor in preprocessor_list:
            sample_res.update(
                {k: np.array(v)
                 for k, v in processor(sample).items()})

        def is_numpy_number(value):
            return np.issubdtype(value.dtype, np.integer) or np.issubdtype(
                value.dtype, np.floating)

        retained_columns = []
        for k in sample_res.keys():
            if not is_numpy_number(sample_res[k]):
                logger.warning(
                    f'Data of column {k} is non-numeric, will be removed')
                continue
            retained_columns.append(k)

        import torch

        class MsIterableDataset(torch.utils.data.IterableDataset):

            def __init__(self, dataset: Iterable):
                super().__init__()
                self.dataset = dataset

            def __iter__(self):
                for item_dict in self.dataset:
                    res = {
                        k: np.array(item_dict[k])
                        for k in columns if k in retained_columns
                    }
                    for preprocessor in preprocessor_list:
                        res.update({
                            k: np.array(v)
                            for k, v in preprocessor(item_dict).items()
                            if k in retained_columns
                        })
                    yield res

        return MsIterableDataset(self._hf_ds)

    def to_torch_dataset(
            self,
            columns: Union[str, List[str]] = None,
            output_all_columns: bool = False,
            preprocessors: Union[Callable, List[Callable]] = None,
            **format_kwargs,
    ):
        self._hf_ds.reset_format()
        self._hf_ds.set_format(
            type='torch',
            columns=columns,
            output_all_columns=output_all_columns,
            format_kwargs=format_kwargs)
        return self._hf_ds
        """Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to
        torch.utils.data.DataLoader.

        Args:
            preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process
                every sample of the dataset. The output type of processors is dict, and each numeric field of the
                dict will be used as a field of torch.utils.data.Dataset.
            columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only). If the
                preprocessor is None, the arg columns must have at least one column. If the `preprocessors` is not
                None, the output fields of processors will also be added.
            format_kwargs: A `dict` of arguments to be passed to `torch.tensor`.

        Returns:
            :class:`torch.utils.data.Dataset`

        """
        if not TORCH_AVAILABLE:
            raise ImportError(
                'The function to_torch_dataset requires pytorch to be installed'
            )
        if preprocessors is not None:
            return self.to_torch_dataset_with_processors(preprocessors)
        else:
            self._hf_ds.reset_format()
            self._hf_ds.set_format(
                type='torch', columns=columns, format_kwargs=format_kwargs)
            return self._hf_ds
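A short sketch of the two branches above. It is a sketch only: `ds` is assumed to be a loaded PyDataset, and the lambda stands in for a real Preprocessor whose output dict has numeric fields.
```python
# Sketch only: `ds` is an assumed PyDataset; the lambda is a stand-in
# Preprocessor returning a dict of numeric fields.
import torch

torch_ds = ds.to_torch_dataset(
    preprocessors=lambda sample: {'input_ids': sample['input_ids']})
loader = torch.utils.data.DataLoader(torch_ds, batch_size=8)

# Without preprocessors, set_format(type='torch') is used instead:
plain_ds = ds.to_torch_dataset(columns=['input_ids'])
```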

    def to_tf_dataset_with_processors(
            self,
            batch_size: int,
            shuffle: bool,
            preprocessors: Union[Callable, List[Callable]],
            drop_remainder: bool = None,
            prefetch: bool = True,
            label_cols: Union[str, List[str]] = None,
            columns: Union[str, List[str]] = None,
    ):
        preprocessor_list = preprocessors if isinstance(
            preprocessors, list) else [preprocessors]

        label_cols = format_list(label_cols)
        columns = format_list(columns)
        cols_to_retain = list(set(label_cols + columns))
        retained_columns = [
            key for key in self._hf_ds.features.keys() if key in cols_to_retain
        ]
        import tensorflow as tf
        tf_dataset = tf.data.Dataset.from_tensor_slices(
            np.arange(len(self._hf_ds), dtype=np.int64))
        if shuffle:
            tf_dataset = tf_dataset.shuffle(buffer_size=len(self._hf_ds))

        def func(i, return_dict=False):
            i = int(i)
            res = {k: np.array(self._hf_ds[i][k]) for k in retained_columns}
            for preprocessor in preprocessor_list:
                # TODO: preprocessor output may have the same key
                res.update({
                    k: np.array(v)
                    for k, v in preprocessor(self._hf_ds[i]).items()
                })
            if return_dict:
                return res
            return tuple(list(res.values()))

        sample_res = func(0, True)

        @tf.function(input_signature=[tf.TensorSpec(None, tf.int64)])
        def fetch_function(i):
            output = tf.numpy_function(
                func,
                inp=[i],
                Tout=[
                    tf.dtypes.as_dtype(val.dtype)
                    for val in sample_res.values()
                ],
            )
            return {key: output[i] for i, key in enumerate(sample_res)}

        tf_dataset = tf_dataset.map(
            fetch_function, num_parallel_calls=tf.data.AUTOTUNE)
        if label_cols:

            def split_features_and_labels(input_batch):
                labels = {
                    key: tensor
                    for key, tensor in input_batch.items() if key in label_cols
                }
                if len(input_batch) == 1:
                    input_batch = next(iter(input_batch.values()))
                if len(labels) == 1:
                    labels = next(iter(labels.values()))
                return input_batch, labels

            tf_dataset = tf_dataset.map(split_features_and_labels)

        elif len(columns) == 1:
            tf_dataset = tf_dataset.map(lambda x: next(iter(x.values())))
        if batch_size > 1:
            tf_dataset = tf_dataset.batch(
                batch_size, drop_remainder=drop_remainder)

        if prefetch:
            tf_dataset = tf_dataset.prefetch(tf.data.experimental.AUTOTUNE)
        return tf_dataset

    def to_tf_dataset(
            self,
            columns: Union[str, List[str]],
            batch_size: int,
            shuffle: bool,
            collate_fn: Callable,
            preprocessors: Union[Callable, List[Callable]] = None,
            columns: Union[str, List[str]] = None,
            collate_fn: Callable = None,
            drop_remainder: bool = None,
            collate_fn_args: Dict[str, Any] = None,
            label_cols: Union[str, List[str]] = None,
            dummy_labels: bool = False,
            prefetch: bool = True,
    ):
        """Create a tf.data.Dataset from the MS Dataset. This tf.data.Dataset can be passed to tf methods like
        model.fit() or model.predict().

        Args:
            batch_size (int): Number of samples in a single batch.
            shuffle (bool): Shuffle the dataset order.
            preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process
                every sample of the dataset. The output type of processors is dict, and each field of the dict will
                be used as a field of the tf.data.Dataset. If the `preprocessors` is None, the `collate_fn`
                shouldn't be None.
            columns (str or List[str], default None): Dataset column(s) to be loaded. If the preprocessor is None,
                the arg columns must have at least one column. If the `preprocessors` is not None, the output
                fields of processors will also be added.
            collate_fn (Callable, default None): A callable object used to collect lists of samples into a batch.
                If the `preprocessors` is None, the `collate_fn` shouldn't be None.
            drop_remainder (bool, default None): Drop the last incomplete batch when loading.
            collate_fn_args (Dict, optional): A `dict` of arguments to be passed to the `collate_fn`.
            label_cols (str or List[str], default None): Dataset column(s) to load as labels.
            prefetch (bool, default True): Prefetch data.

        Returns:
            :class:`tf.data.Dataset`

        """
        if not TF_AVAILABLE:
            raise ImportError(
                'The function to_tf_dataset requires Tensorflow to be installed.'
            )
        if preprocessors is not None:
            return self.to_tf_dataset_with_processors(
                batch_size,
                shuffle,
                preprocessors,
                drop_remainder=drop_remainder,
                prefetch=prefetch,
                label_cols=label_cols,
                columns=columns)

        if collate_fn is None:
            logger.error(
                "The `preprocessors` and the `collate_fn` shouldn't both be None."
            )
            return None
        self._hf_ds.reset_format()
        return self._hf_ds.to_tf_dataset(
            columns,
@@ -123,7 +389,6 @@ class PyDataset:
            drop_remainder=drop_remainder,
            collate_fn_args=collate_fn_args,
            label_cols=label_cols,
            dummy_labels=dummy_labels,
            prefetch=prefetch)

    def to_hf_dataset(self) -> Dataset:
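A hedged sketch of the preprocessor branch above. The column names `'feature'`/`'label'` and the callable are illustrative assumptions, not values taken from this commit.
```python
# Sketch only: 'feature'/'label' columns and the preprocessor are assumed.
tf_ds = ds.to_tf_dataset(
    batch_size=32,
    shuffle=True,
    preprocessors=lambda sample: {'feature': sample['feature']},
    label_cols='label')
# tf_ds then yields (features, labels) batches suitable for model.fit(tf_ds).
```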
0
modelscope/pydatasets/utils/__init__.py
Normal file
66
modelscope/pydatasets/utils/ms_api.py
Normal file
@@ -0,0 +1,66 @@
import os
from collections import defaultdict
from typing import Optional

import requests

from modelscope.pydatasets.config import (DOWNLOADED_DATASETS_PATH,
                                          MS_HUB_ENDPOINT)
from modelscope.utils.logger import get_logger

logger = get_logger()


class MsApi:

    def __init__(self, endpoint=MS_HUB_ENDPOINT):
        self.endpoint = endpoint

    def list_datasets(self):
        path = f'{self.endpoint}/api/v1/datasets'
        headers = None
        params = {}
        r = requests.get(path, params=params, headers=headers)
        r.raise_for_status()
        dataset_list = r.json()['Data']
        return [x['Name'] for x in dataset_list]

    def fetch_dataset_scripts(self,
                              dataset_name: str,
                              version: Optional[str] = 'master',
                              force_download=False):
        datahub_url = f'{self.endpoint}/api/v1/datasets?Query={dataset_name}'
        r = requests.get(datahub_url)
        r.raise_for_status()
        dataset_list = r.json()['Data']
        if len(dataset_list) == 0:
            return None
        dataset_id = dataset_list[0]['Id']
        version = version or 'master'
        datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={version}'
        r = requests.get(datahub_url)
        r.raise_for_status()
        file_list = r.json()['Data']['Files']
        cache_dir = os.path.join(DOWNLOADED_DATASETS_PATH, dataset_name,
                                 version)
        os.makedirs(cache_dir, exist_ok=True)
        local_paths = defaultdict(list)
        for file_info in file_list:
            file_path = file_info['Path']
            if file_path.endswith('.py'):
                datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/files?' \
                              f'Revision={version}&Path={file_path}'
                r = requests.get(datahub_url)
                r.raise_for_status()
                content = r.json()['Data']['Content']
                local_path = os.path.join(cache_dir, file_path)
                if os.path.exists(local_path) and not force_download:
                    logger.warning(
                        f"Reusing dataset {dataset_name}'s python file ({local_path})"
                    )
                    local_paths['py'].append(local_path)
                    continue
                with open(local_path, 'w') as f:
                    f.writelines(content)
                local_paths['py'].append(local_path)
        return local_paths
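A minimal sketch of the API above; it assumes the default MS_HUB_ENDPOINT is reachable, and `'my_dataset'` is a placeholder name.
```python
# Minimal sketch; assumes MS_HUB_ENDPOINT is reachable and 'my_dataset'
# is a placeholder dataset name.
from modelscope.pydatasets.utils.ms_api import MsApi

api = MsApi()
names = api.list_datasets()  # names of datasets hosted on the hub

# Downloads the dataset's .py loading scripts into the local cache and
# returns {'py': [local_paths]}; returns None if the query matches nothing.
scripts = api.fetch_dataset_scripts('my_dataset', version='master')
```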
0
modelscope/utils/audio/__init__.py
Normal file
42
modelscope/utils/audio/tts_exceptions.py
Normal file
@@ -0,0 +1,42 @@
"""
Define TTS exceptions
"""


class TtsException(Exception):
    """
    TTS exception class.
    """
    pass


class TtsFrontendException(TtsException):
    """
    TTS frontend module level exceptions.
    """
    pass


class TtsFrontendInitializeFailedException(TtsFrontendException):
    """
    If the tts frontend resource is invalid or does not exist, this exception will be raised.
    """
    pass


class TtsFrontendLanguageTypeInvalidException(TtsFrontendException):
    """
    If the language type is invalid, this exception will be raised.
    """


class TtsVocoderException(TtsException):
    """
    Vocoder exception.
    """


class TtsVocoderMelspecShapeMismatchException(TtsVocoderException):
    """
    If the vocoder's input melspec shape mismatches, this exception will be raised.
    """
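One way a caller might use this hierarchy; `build_frontend` is a hypothetical function used purely for illustration.
```python
# Illustration only: build_frontend is hypothetical, not part of this commit.
from modelscope.utils.audio.tts_exceptions import (
    TtsFrontendException, TtsFrontendInitializeFailedException)

try:
    frontend = build_frontend('/path/to/resources')
except TtsFrontendInitializeFailedException:
    raise  # missing or invalid frontend resources
except TtsFrontendException:
    raise  # any other frontend-level failure, e.g. invalid language type
```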
@@ -28,9 +28,11 @@ class Tasks(object):
    image_editing = 'image-editing'
    image_generation = 'image-generation'
    image_matting = 'image-matting'
    ocr_detection = 'ocr-detection'

    # nlp tasks
    zero_shot_classification = 'zero-shot-classification'
    word_segmentation = 'word-segmentation'
    sentiment_analysis = 'sentiment-analysis'
    sentence_similarity = 'sentence-similarity'
    text_classification = 'text-classification'
@@ -67,7 +67,6 @@ class Registry(object):
        if module_name in self._modules[group_key]:
            raise KeyError(f'{module_name} is already registered in '
                           f'{self._name}[{group_key}]')

        self._modules[group_key][module_name] = module_cls
        module_cls.group_key = group_key
@@ -2,6 +2,9 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
import unittest

from datasets.config import TF_AVAILABLE, TORCH_AVAILABLE

TEST_LEVEL = 2
TEST_LEVEL_STR = 'TEST_LEVEL'
@@ -15,6 +18,18 @@ def test_level():
    return TEST_LEVEL


def require_tf(test_case):
    if not TF_AVAILABLE:
        test_case = unittest.skip('test requires TensorFlow')(test_case)
    return test_case


def require_torch(test_case):
    if not TORCH_AVAILABLE:
        test_case = unittest.skip('test requires PyTorch')(test_case)
    return test_case


def set_test_level(level: int):
    global TEST_LEVEL
    TEST_LEVEL = level
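As the hunk above shows, the decorators wrap `unittest.skip`; a typical use, assuming they are imported from this test utilities module:
```python
# Assumes require_tf/require_torch are imported from the module above.
import unittest


class PyDatasetTest(unittest.TestCase):

    @require_torch
    def test_to_torch_dataset(self):
        ...  # runs only when TORCH_AVAILABLE is True

    @require_tf
    def test_to_tf_dataset(self):
        ...  # otherwise skipped with 'test requires TensorFlow'
```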
@@ -2,4 +2,5 @@
-r requirements/pipeline.txt
-r requirements/multi-modal.txt
-r requirements/nlp.txt
-r requirements/audio.txt
-r requirements/cv.txt
26
requirements/audio.txt
Normal file
@@ -0,0 +1,26 @@
#tts
h5py==2.10.0
#https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp36-cp36m-linux_x86_64.whl
https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp37-cp37m-linux_x86_64.whl
https://swap.oss-cn-hangzhou.aliyuncs.com/Jiaqi%2Fmaas%2Ftts%2Frequirements%2Fpytorch_wavelets-1.3.0-py3-none-any.whl?Expires=1685688388&OSSAccessKeyId=LTAI4Ffebq4d9jTVDwiSbY4L&Signature=jcQbg5EZ%2Bdys3%2F4BRn3srrKLdIg%3D
#https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp38-cp38-linux_x86_64.whl
#https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp39-cp39-linux_x86_64.whl
inflect
keras==2.2.4
librosa
lxml
matplotlib
nara_wpe
numpy==1.18.*
protobuf==3.20.*
ptflops
PyWavelets>=1.0.0
scikit-learn==0.23.2
sox
tensorboard
tensorflow==1.15.*
torch==1.10.*
torchaudio
torchvision
tqdm
unidecode
@@ -1 +1,2 @@
easydict
tf_slim
@@ -1 +1 @@
https://alinlp.alibaba-inc.com/pypi/sofa-1.0.1.3-py3-none-any.whl
https://alinlp.alibaba-inc.com/pypi/sofa-1.0.2-py3-none-any.whl
@@ -1,12 +1,13 @@
addict
datasets
easydict
https://mindscope.oss-cn-hangzhou.aliyuncs.com/sdklib/maas_hub-0.2.2.dev0-py3-none-any.whl
https://mindscope.oss-cn-hangzhou.aliyuncs.com/sdklib/maas_hub-0.2.4.dev0-py3-none-any.whl
numpy
opencv-python-headless
Pillow>=6.2.0
pyyaml
requests
scipy
tokenizers<=0.10.3
transformers<=4.16.2
yapf
@@ -11,6 +11,7 @@ default_section = THIRDPARTY
BASED_ON_STYLE = pep8
BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true
SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true
SPLIT_BEFORE_ARITHMETIC_OPERATOR = true

[codespell]
skip = *.ipynb
@@ -20,5 +21,5 @@ ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids
[flake8]
select = B,C,E,F,P,T4,W,B9
max-line-length = 120
ignore = F401,F821
ignore = F401,F821,W503
exclude = docs/src,*.pyi,.git
Some files were not shown because too many files have changed in this diff