Space intent and modeling (generation) are ready
3	.gitattributes vendored Normal file
@@ -0,0 +1,3 @@
*.png filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
*.mp4 filter=lfs diff=lfs merge=lfs -text
1	.gitignore vendored
@@ -104,7 +104,6 @@ venv.bak/
# mypy
.mypy_cache/

-data
.vscode
.idea
67	Makefile.docker Normal file
@@ -0,0 +1,67 @@
DOCKER_REGISTRY = registry.cn-shanghai.aliyuncs.com
DOCKER_ORG = modelscope
DOCKER_IMAGE = modelscope
DOCKER_FULL_NAME = $(DOCKER_REGISTRY)/$(DOCKER_ORG)/$(DOCKER_IMAGE)

# CUDA_VERSION = 11.3
# CUDNN_VERSION = 8
BASE_RUNTIME = reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04
# BASE_DEVEL = reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04
BASE_DEVEL = pytorch/pytorch:1.10.0-cuda11.3-cudnn8-devel


MODELSCOPE_VERSION = $(shell git describe --tags --always)

# Can be either official / dev
BUILD_TYPE = dev
BUILD_PROGRESS = auto
BUILD_ARGS = --build-arg BASE_IMAGE=$(BASE_IMAGE)

EXTRA_DOCKER_BUILD_FLAGS ?= --network=host
# DOCKER_BUILD = DOCKER_BUILDKIT=1 \
#	docker build \
#		--progress=$(BUILD_PROGRESS) \
#		$(EXTRA_DOCKER_BUILD_FLAGS) \
#		--target $(BUILD_TYPE) \
#		-t $(DOCKER_FULL_NAME):$(DOCKER_TAG) \
#		$(BUILD_ARGS) \
#		-f docker/pytorch.dockerfile .
DOCKER_BUILD = DOCKER_BUILDKIT=1 \
	docker build \
	$(EXTRA_DOCKER_BUILD_FLAGS) \
	-t $(DOCKER_FULL_NAME):$(DOCKER_TAG) \
	$(BUILD_ARGS) \
	-f docker/pytorch.dockerfile .
DOCKER_PUSH = docker push $(DOCKER_FULL_NAME):$(DOCKER_TAG)

.PHONY: all
all: devel-image

.PHONY: devel-image
devel-image: BASE_IMAGE := $(BASE_DEVEL)
devel-image: DOCKER_TAG := $(MODELSCOPE_VERSION)-devel
devel-image:
	$(DOCKER_BUILD)

.PHONY: devel-push
devel-push: BASE_IMAGE := $(BASE_DEVEL)
devel-push: DOCKER_TAG := $(MODELSCOPE_VERSION)-devel
devel-push:
	$(DOCKER_PUSH)

.PHONY: runtime-image
runtime-image: BASE_IMAGE := $(BASE_RUNTIME)
runtime-image: DOCKER_TAG := $(MODELSCOPE_VERSION)-runtime
runtime-image:
	$(DOCKER_BUILD)
	docker tag $(DOCKER_FULL_NAME):$(DOCKER_TAG) $(DOCKER_FULL_NAME):latest

.PHONY: runtime-push
runtime-push: BASE_IMAGE := $(BASE_RUNTIME)
runtime-push: DOCKER_TAG := $(MODELSCOPE_VERSION)-runtime
runtime-push:
	$(DOCKER_PUSH)

.PHONY: clean
clean:
	-docker rmi -f $(shell docker images -q $(DOCKER_FULL_NAME))
3	data/test/images/image1.jpg Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:78094cc48fbcfd9b6d321fe13619ecc72b65e006fc1b4c4458409ade9979486d
size 129862
3	data/test/images/image_matting.png Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:af83a94899a6d23339c3ecc5c4c58c57c835af57b531a2f4c50461184f820141
size 603621
4	docker/.dockerignore Normal file
@@ -0,0 +1,4 @@
*.sh
*.md
*.dockerfile
*.zip
53	docker/pytorch.dockerfile Normal file
@@ -0,0 +1,53 @@
# syntax = docker/dockerfile:experimental
#
# NOTE: To build this you will need a docker version > 18.06 with
#       experimental enabled and DOCKER_BUILDKIT=1
#
# If you do not use buildkit you are not going to have a good time
#
# For reference:
#     https://docs.docker.com/develop/develop-images/build_enhancements/

# ARG BASE_IMAGE=reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04
# FROM ${BASE_IMAGE} as dev-base

# FROM reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04 as dev-base
FROM pytorch/pytorch:1.10.0-cuda11.3-cudnn8-devel
# FROM pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime
# configure the pip source
RUN mkdir /root/.pip
COPY docker/rcfiles/pip.conf.tsinghua /root/.pip/pip.conf
COPY docker/rcfiles/sources.list.aliyun /etc/apt/sources.list

# Install essential Ubuntu packages
RUN apt-get update &&\
    apt-get install -y software-properties-common \
                       build-essential \
                       git \
                       wget \
                       vim \
                       curl \
                       zip \
                       zlib1g-dev \
                       unzip \
                       pkg-config

# install modelscope and its python env
WORKDIR /opt/modelscope
COPY . .
RUN pip install -r requirements.txt
# RUN --mount=type=cache,target=/opt/ccache \
#     python setup.py install

# opencv-python-headless conflicts with the opencv-python already installed
RUN python setup.py install \
    && pip uninstall -y opencv-python-headless

# prepare modelscope libs
COPY docker/scripts/install_libs.sh /tmp/
RUN bash /tmp/install_libs.sh && \
    rm -rf /tmp/install_libs.sh

ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/modelscope/lib64

WORKDIR /workspace
2	docker/rcfiles/pip.conf.tsinghua Normal file
@@ -0,0 +1,2 @@
[global]
index-url=https://pypi.tuna.tsinghua.edu.cn/simple
25	docker/rcfiles/sources.list.aliyun Normal file
@@ -0,0 +1,25 @@
deb http://mirrors.aliyun.com/ubuntu/ bionic main restricted
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic main restricted

deb http://mirrors.aliyun.com/ubuntu/ bionic-updates main restricted
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic-updates main restricted

deb http://mirrors.aliyun.com/ubuntu/ bionic universe
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic universe
deb http://mirrors.aliyun.com/ubuntu/ bionic-updates universe
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic-updates universe

deb http://mirrors.aliyun.com/ubuntu/ bionic multiverse
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic multiverse
deb http://mirrors.aliyun.com/ubuntu/ bionic-updates multiverse
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic-updates multiverse

deb http://mirrors.aliyun.com/ubuntu/ bionic-backports main restricted universe multiverse
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic-backports main restricted universe multiverse

deb http://mirrors.aliyun.com/ubuntu bionic-security main restricted
# deb-src http://mirrors.aliyun.com/ubuntu bionic-security main restricted
deb http://mirrors.aliyun.com/ubuntu bionic-security universe
# deb-src http://mirrors.aliyun.com/ubuntu bionic-security universe
deb http://mirrors.aliyun.com/ubuntu bionic-security multiverse
# deb-src http://mirrors.aliyun.com/ubuntu bionic-security multiverse
10	docker/rcfiles/user.vimrc Normal file
@@ -0,0 +1,10 @@
set nocompatible
set encoding=utf-8
set hlsearch
set smartindent
set ruler
set number
set ts=2
set sw=2
set expandtab
autocmd FileType make setlocal noexpandtab
12	docker/scripts/install_libs.sh Normal file
@@ -0,0 +1,12 @@
#!/bin/bash

set -eo pipefail

ModelScopeLib=/usr/local/modelscope/lib64

if [ ! -d /usr/local/modelscope ]; then
    mkdir -p $ModelScopeLib
fi

# audio libs
wget "http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/release/maas/libs/audio/libmitaec_pyio.so" -O ${ModelScopeLib}/libmitaec_pyio.so
@@ -76,7 +76,7 @@ exclude_patterns = ['build', 'Thumbs.db', '.DS_Store']
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
-html_theme = 'sphinx_rtd_theme'
+html_theme = 'sphinx_book_theme'
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
html_theme_options = {}
@@ -34,13 +34,111 @@ make linter
```

## 2. Test

-### 2.1 Unit test
+### 2.1 Test level

There are mainly three test levels:

* level 0: tests for the basic interfaces and functions of the framework, such as `tests/trainers/test_trainer_base.py`
* level 1: important functional tests that cover end-to-end workflows, such as `tests/pipelines/test_image_matting.py`
* level 2: scenario tests for all the implemented modules, such as models and pipelines in the different algorithm fields.

The default test level is 0, which runs only the level-0 cases. You can set the test level
via the environment variable `TEST_LEVEL`. For more details, refer to [test-doc](https://alidocs.dingtalk.com/i/nodes/mdvQnONayjBJKLXy1Bp38PY2MeXzp5o0?dontjump=true&nav=spaces&navQuery=spaceId%3Dnb9XJNlZxbgrOXyA).

```bash
# run all tests
TEST_LEVEL=2 make test

# run important functional tests
TEST_LEVEL=1 make test

# run core UT and basic functional tests
make test
```

-### 2.2 Test data
-TODO
When writing test cases, you should assign a test level to your test case using the
following code. If left at the default, the test level will be 0 and the case will run in each
test stage.

File `test_module.py`:
```python
import unittest

from modelscope.utils.test_utils import test_level


class ImageCartoonTest(unittest.TestCase):

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_by_direct_model_download(self):
        pass
```

### 2.2 Run tests

1. Run your own single test case to test your self-implemented function. You can run your
test file directly; if it fails to run, please check whether the variable `TEST_LEVEL`
exists in the environment and unset it.
```bash
python tests/path/to/your_test.py
```

2. Remember to run the core tests in your local environment before starting a code review; by default this
only runs test cases with level 0.
```bash
make test
```

3. After you start a code review, CI tests will be triggered, which run the test cases with level 1.

4. Daily regression tests run all cases at 0 am each day on the master branch.

### 2.3 Test data storage

As we need a lot of data for testing, including images, videos and models, we use git-lfs
to store those large files.

1. Install git-lfs.
For Mac:
```bash
brew install git-lfs
git lfs install
```

For CentOS, please download the rpm from the git-lfs GitHub release [website](https://github.com/git-lfs/git-lfs/releases/tag/v3.2.0):
```bash
wget http://101374-public.oss-cn-hangzhou-zmf.aliyuncs.com/git-lfs-3.2.0-1.el7.x86_64.rpm
sudo rpm -ivh git-lfs-3.2.0-1.el7.x86_64.rpm
git lfs install
```

For Ubuntu:
```bash
curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
sudo apt-get install git-lfs
git lfs install
```

2. Track your data type using git-lfs; for example, to track png files:
```bash
git lfs track "*.png"
```

3. Add your test files to the `data/test/` folder; you can create directories if you need them.
```bash
git add data/test/test.png
```

4. Commit your test data to the remote branch:
```bash
git commit -m "xxx"
```

To pull data from the remote repo, do it just the same way you pull git files:
```bash
git pull origin branch_name
```

## Code Review

@@ -93,3 +191,22 @@ TODO
```bash
make whl
```

## Build docker

Build the development docker image:
```bash
sudo make -f Makefile.docker devel-image
```

Push the development docker image (for the password, please ask wenmeng.zwm):
```bash
sudo docker login --username=mass_test@test.aliyunid.com registry.cn-shanghai.aliyuncs.com
Password:
sudo make -f Makefile.docker devel-push
```

To build the runtime image, just replace `devel` with `runtime` in the commands above:
```bash
sudo make -f Makefile.docker runtime-image runtime-push
```
@@ -2,4 +2,4 @@

from .base import Model
from .builder import MODELS, build_model
-from .nlp import BertForSequenceClassification
+from .nlp import BertForSequenceClassification, SbertForSentenceSimilarity
0	modelscope/models/audio/__init__.py Normal file
0	modelscope/models/audio/layers/__init__.py Normal file
60	modelscope/models/audio/layers/activations.py Normal file
@@ -0,0 +1,60 @@
import torch.nn as nn

from .layer_base import LayerBase


class RectifiedLinear(LayerBase):

    def __init__(self, input_dim, output_dim):
        super(RectifiedLinear, self).__init__()
        self.dim = input_dim
        self.relu = nn.ReLU()

    def forward(self, input):
        return self.relu(input)

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<RectifiedLinear> %d %d\n' % (self.dim, self.dim)
        return re_str

    def load_kaldi_nnet(self, instr):
        return instr


class LogSoftmax(LayerBase):

    def __init__(self, input_dim, output_dim):
        super(LogSoftmax, self).__init__()
        self.dim = input_dim
        self.ls = nn.LogSoftmax()

    def forward(self, input):
        return self.ls(input)

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<Softmax> %d %d\n' % (self.dim, self.dim)
        return re_str

    def load_kaldi_nnet(self, instr):
        return instr


class Sigmoid(LayerBase):

    def __init__(self, input_dim, output_dim):
        super(Sigmoid, self).__init__()
        self.dim = input_dim
        self.sig = nn.Sigmoid()

    def forward(self, input):
        return self.sig(input)

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<Sigmoid> %d %d\n' % (self.dim, self.dim)
        return re_str

    def load_kaldi_nnet(self, instr):
        return instr
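
For orientation, a minimal usage sketch of the activation layers above (not part of the commit); the import path is assumed from the file's location in this diff:

```python
# Hypothetical usage of the Kaldi-exportable activation layers; the import
# path below is an assumption based on where the file lives in this commit.
import torch
from modelscope.models.audio.layers.activations import RectifiedLinear

relu = RectifiedLinear(input_dim=4, output_dim=4)
y = relu(torch.randn(2, 4))    # forward is a plain element-wise ReLU
print(relu.to_kaldi_nnet())    # -> "<RectifiedLinear> 4 4"
```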
78	modelscope/models/audio/layers/affine_transform.py Normal file
@@ -0,0 +1,78 @@
import numpy as np
import torch as th
import torch.nn as nn

from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number,
                         to_kaldi_matrix)


class AffineTransform(LayerBase):

    def __init__(self, input_dim, output_dim):
        super(AffineTransform, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, input):
        return self.linear(input)

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<AffineTransform> %d %d\n' % (self.output_dim,
                                                 self.input_dim)
        re_str += '<LearnRateCoef> 1 <BiasLearnRateCoef> 1 <MaxNorm> 0\n'
        linear_weights = self.state_dict()['linear.weight']
        x = linear_weights.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        linear_bias = self.state_dict()['linear.bias']
        x = linear_bias.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        return re_str

    def to_raw_nnet(self, fid):
        linear_weights = self.state_dict()['linear.weight']
        x = linear_weights.squeeze().numpy()
        x.tofile(fid)

        linear_bias = self.state_dict()['linear.bias']
        x = linear_bias.squeeze().numpy()
        x.tofile(fid)

    def load_kaldi_nnet(self, instr):
        output = expect_token_number(
            instr,
            '<LearnRateCoef>',
        )
        if output is None:
            raise Exception('AffineTransform format error for <LearnRateCoef>')
        instr, lr = output

        output = expect_token_number(instr, '<BiasLearnRateCoef>')
        if output is None:
            raise Exception(
                'AffineTransform format error for <BiasLearnRateCoef>')
        instr, lr = output

        output = expect_token_number(instr, '<MaxNorm>')
        if output is None:
            raise Exception('AffineTransform format error for <MaxNorm>')
        instr, lr = output

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('AffineTransform format error for parsing matrix')
        instr, mat = output

        print(mat.shape)
        self.linear.weight = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('AffineTransform format error for parsing matrix')
        instr, mat = output
        mat = np.squeeze(mat)
        self.linear.bias = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))
        return instr
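
A short, hedged sketch of how this layer's Kaldi text export might be exercised (import path assumed from the file location above):

```python
# Hypothetical sketch: exporting an AffineTransform to Kaldi nnet1 text.
# The import path mirrors this file's location in the diff; treat it as an assumption.
import torch as th
from modelscope.models.audio.layers.affine_transform import AffineTransform

layer = AffineTransform(input_dim=3, output_dim=2)
out = layer(th.randn(5, 3))          # a plain nn.Linear forward: shape (5, 2)
text = layer.to_kaldi_nnet()         # header line plus weight and bias matrices
print(text.splitlines()[0])          # -> <AffineTransform> 2 3
```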
178	modelscope/models/audio/layers/deep_fsmn.py Normal file
@@ -0,0 +1,178 @@
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F

from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number,
                         to_kaldi_matrix)


class DeepFsmn(LayerBase):

    def __init__(self,
                 input_dim,
                 output_dim,
                 lorder=None,
                 rorder=None,
                 hidden_size=None,
                 layer_norm=False,
                 dropout=0):
        super(DeepFsmn, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim

        if lorder is None:
            return

        self.lorder = lorder
        self.rorder = rorder
        self.hidden_size = hidden_size
        self.layer_norm = layer_norm

        self.linear = nn.Linear(input_dim, hidden_size)
        self.norm = nn.LayerNorm(hidden_size)
        self.drop1 = nn.Dropout(p=dropout)
        self.drop2 = nn.Dropout(p=dropout)
        self.project = nn.Linear(hidden_size, output_dim, bias=False)

        self.conv1 = nn.Conv2d(
            output_dim,
            output_dim, [lorder, 1], [1, 1],
            groups=output_dim,
            bias=False)
        self.conv2 = nn.Conv2d(
            output_dim,
            output_dim, [rorder, 1], [1, 1],
            groups=output_dim,
            bias=False)

    def forward(self, input):

        f1 = F.relu(self.linear(input))

        f1 = self.drop1(f1)
        if self.layer_norm:
            f1 = self.norm(f1)

        p1 = self.project(f1)

        x = th.unsqueeze(p1, 1)

        x_per = x.permute(0, 3, 2, 1)

        y = F.pad(x_per, [0, 0, self.lorder - 1, 0])
        yr = F.pad(x_per, [0, 0, 0, self.rorder])
        yr = yr[:, :, 1:, :]

        out = x_per + self.conv1(y) + self.conv2(yr)
        out = self.drop2(out)

        out1 = out.permute(0, 3, 2, 1)

        return input + out1.squeeze()

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<UniDeepFsmn> %d %d\n'\
                  % (self.output_dim, self.input_dim)
        re_str += '<LearnRateCoef> %d <HidSize> %d <LOrder> %d <LStride> %d <MaxNorm> 0\n'\
                  % (1, self.hidden_size, self.lorder, 1)
        lfiters = self.state_dict()['conv1.weight']
        x = np.flipud(lfiters.squeeze().numpy().T)
        re_str += to_kaldi_matrix(x)
        proj_weights = self.state_dict()['project.weight']
        x = proj_weights.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        linear_weights = self.state_dict()['linear.weight']
        x = linear_weights.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        linear_bias = self.state_dict()['linear.bias']
        x = linear_bias.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        return re_str

    def load_kaldi_nnet(self, instr):
        output = expect_token_number(
            instr,
            '<LearnRateCoef>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <LearnRateCoef>')
        instr, lr = output

        output = expect_token_number(
            instr,
            '<HidSize>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <HidSize>')
        instr, hiddensize = output
        self.hidden_size = int(hiddensize)

        output = expect_token_number(
            instr,
            '<LOrder>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <LOrder>')
        instr, lorder = output
        self.lorder = int(lorder)

        output = expect_token_number(
            instr,
            '<LStride>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <LStride>')
        instr, lstride = output
        self.lstride = lstride

        output = expect_token_number(
            instr,
            '<MaxNorm>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <MaxNorm>')

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        mat1 = np.fliplr(mat.T).copy()
        self.conv1 = nn.Conv2d(
            self.output_dim,
            self.output_dim, [self.lorder, 1], [1, 1],
            groups=self.output_dim,
            bias=False)
        mat_th = th.from_numpy(mat1).type(th.FloatTensor)
        mat_th = mat_th.unsqueeze(1)
        mat_th = mat_th.unsqueeze(3)
        self.conv1.weight = th.nn.Parameter(mat_th)

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output

        self.project = nn.Linear(self.hidden_size, self.output_dim, bias=False)
        self.linear = nn.Linear(self.input_dim, self.hidden_size)

        self.project.weight = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        self.linear.weight = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        self.linear.bias = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        return instr
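
As a quick shape sanity check, a hedged sketch (not part of the commit) of one DeepFsmn forward pass; the import path is assumed from the file location above:

```python
# Hedged sketch: one forward pass through DeepFsmn on dummy features.
# lorder/rorder are the left/right memory orders of the two depthwise convolutions.
import torch as th
from modelscope.models.audio.layers.deep_fsmn import DeepFsmn  # assumed path

layer = DeepFsmn(input_dim=16, output_dim=16, lorder=3, rorder=2, hidden_size=32)
x = th.randn(4, 50, 16)    # [batch, frames, features]
y = layer(x)               # note: the residual add requires output_dim == input_dim
print(y.shape)             # torch.Size([4, 50, 16])
```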
50	modelscope/models/audio/layers/layer_base.py Normal file
@@ -0,0 +1,50 @@
import abc
import re

import numpy as np
import torch.nn as nn


def expect_token_number(instr, token):
    first_token = re.match(r'^\s*' + token, instr)
    if first_token is None:
        return None
    instr = instr[first_token.end():]
    lr = re.match(r'^\s*(-?\d+\.?\d*e?-?\d*?)', instr)
    if lr is None:
        return None
    return instr[lr.end():], lr.groups()[0]


def expect_kaldi_matrix(instr):
    pos2 = instr.find('[', 0)
    pos3 = instr.find(']', pos2)
    mat = []
    for stt in instr[pos2 + 1:pos3].split('\n'):
        tmp_mat = np.fromstring(stt, dtype=np.float32, sep=' ')
        if tmp_mat.size > 0:
            mat.append(tmp_mat)
    return instr[pos3 + 1:], np.array(mat)


def to_kaldi_matrix(np_mat):
    """Transform a numpy matrix into a standard Kaldi text-format matrix string.

    :param np_mat: numpy matrix
    :return: str
    """
    np.set_printoptions(threshold=np.inf, linewidth=np.nan, suppress=True)
    out_str = str(np_mat)
    out_str = out_str.replace('[', '')
    out_str = out_str.replace(']', '')
    return '[ %s ]\n' % out_str


class LayerBase(nn.Module, metaclass=abc.ABCMeta):

    def __init__(self):
        super(LayerBase, self).__init__()

    @abc.abstractmethod
    def to_kaldi_nnet(self):
        pass
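
A minimal round-trip sketch of the parsing helpers above (import path assumed): it serializes a matrix with `to_kaldi_matrix` and reads it back with `expect_token_number` / `expect_kaldi_matrix`:

```python
# Hedged sketch of the Kaldi text helpers; the import path is an assumption
# based on this file's location in the commit.
import numpy as np
from modelscope.models.audio.layers.layer_base import (
    expect_kaldi_matrix, expect_token_number, to_kaldi_matrix)

mat = np.arange(6, dtype=np.float32).reshape(2, 3)
text = '<LearnRateCoef> 1 ' + to_kaldi_matrix(mat)

rest, lr = expect_token_number(text, '<LearnRateCoef>')  # -> lr == '1'
rest, parsed = expect_kaldi_matrix(rest)                 # parses the "[ ... ]" block
assert np.allclose(parsed, mat)
```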
482	modelscope/models/audio/layers/uni_deep_fsmn.py Normal file
@@ -0,0 +1,482 @@
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F

from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number,
                         to_kaldi_matrix)


class SepConv(nn.Module):

    def __init__(self,
                 in_channels,
                 filters,
                 out_channels,
                 kernel_size=(5, 2),
                 dilation=(1, 1)):
        """ :param kernel_size: (time, frequency)
        """
        super(SepConv, self).__init__()
        # depthwise + pointwise
        self.dconv = nn.Conv2d(
            in_channels,
            in_channels * filters,
            kernel_size,
            dilation=dilation,
            groups=in_channels)
        self.pconv = nn.Conv2d(
            in_channels * filters, out_channels, kernel_size=1)
        self.padding = dilation[0] * (kernel_size[0] - 1)

    def forward(self, input):
        ''' input: [B, C, T, F]
        '''
        x = F.pad(input, [0, 0, self.padding, 0])
        x = self.dconv(x)
        x = self.pconv(x)
        return x


class Conv2d(nn.Module):

    def __init__(self,
                 input_dim,
                 output_dim,
                 lorder=20,
                 rorder=0,
                 groups=1,
                 bias=False,
                 skip_connect=True):
        super(Conv2d, self).__init__()
        self.lorder = lorder
        self.conv = nn.Conv2d(
            input_dim, output_dim, [lorder, 1], groups=groups, bias=bias)
        self.rorder = rorder
        if self.rorder:
            self.conv2 = nn.Conv2d(
                input_dim, output_dim, [rorder, 1], groups=groups, bias=bias)
        self.skip_connect = skip_connect

    def forward(self, input):
        # [B, 1, T, F]
        x = th.unsqueeze(input, 1)
        # [B, F, T, 1]
        x_per = x.permute(0, 3, 2, 1)
        y = F.pad(x_per, [0, 0, self.lorder - 1, 0])
        out = self.conv(y)
        if self.rorder:
            yr = F.pad(x_per, [0, 0, 0, self.rorder])
            yr = yr[:, :, 1:, :]
            out += self.conv2(yr)
        out = out.permute(0, 3, 2, 1).squeeze(1)
        if self.skip_connect:
            out = out + input
        return out


class SelfAttLayer(nn.Module):

    def __init__(self, input_dim, output_dim, lorder=None, hidden_size=None):
        super(SelfAttLayer, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim

        if lorder is None:
            return

        self.lorder = lorder
        self.hidden_size = hidden_size

        self.linear = nn.Linear(input_dim, hidden_size)

        self.project = nn.Linear(hidden_size, output_dim, bias=False)

        self.att = nn.Linear(input_dim, lorder, bias=False)

    def forward(self, input):

        f1 = F.relu(self.linear(input))

        p1 = self.project(f1)

        x = th.unsqueeze(p1, 1)

        x_per = x.permute(0, 3, 2, 1)

        y = F.pad(x_per, [0, 0, self.lorder - 1, 0])

        # z [B, F, T, lorder]
        z = x_per
        for i in range(1, self.lorder):
            z = th.cat([z, y[:, :, self.lorder - 1 - i:-i, :]], axis=-1)

        # [B, T, lorder]
        att = F.softmax(self.att(input), dim=-1)
        att = th.unsqueeze(att, 1)
        z = th.sum(z * att, axis=-1)

        out1 = z.permute(0, 2, 1)

        return input + out1


class TFFsmn(nn.Module):

    def __init__(self,
                 input_dim,
                 output_dim,
                 lorder=None,
                 hidden_size=None,
                 dilation=1,
                 layer_norm=False,
                 dropout=0,
                 skip_connect=True):
        super(TFFsmn, self).__init__()

        self.skip_connect = skip_connect

        self.linear = nn.Linear(input_dim, hidden_size)
        self.norm = nn.Identity()
        if layer_norm:
            self.norm = nn.LayerNorm(input_dim)
        self.act = nn.ReLU()
        self.project = nn.Linear(hidden_size, output_dim, bias=False)

        self.conv1 = nn.Conv2d(
            output_dim,
            output_dim, [lorder, 1],
            dilation=[dilation, 1],
            groups=output_dim,
            bias=False)
        self.padding_left = dilation * (lorder - 1)
        dorder = 5
        self.conv2 = nn.Conv2d(1, 1, [dorder, 1], bias=False)
        self.padding_freq = dorder - 1

    def forward(self, input):
        return self.compute1(input)

    def compute1(self, input):
        ''' linear-dconv-relu(norm)-linear-dconv
        '''
        x = self.linear(input)
        # [B, 1, F, T]
        x = th.unsqueeze(x, 1).permute(0, 1, 3, 2)
        z = F.pad(x, [0, 0, self.padding_freq, 0])
        z = self.conv2(z) + x
        x = z.permute(0, 3, 2, 1).squeeze(-1)
        x = self.act(x)
        x = self.norm(x)
        x = self.project(x)
        x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
        # [B, F, T+lorder-1, 1]
        y = F.pad(x, [0, 0, self.padding_left, 0])
        out = self.conv1(y)
        if self.skip_connect:
            out = out + x
        out = out.permute(0, 3, 2, 1).squeeze()

        return input + out


class CNNFsmn(nn.Module):
    ''' use cnn to reduce parameters
    '''

    def __init__(self,
                 input_dim,
                 output_dim,
                 lorder=None,
                 hidden_size=None,
                 dilation=1,
                 layer_norm=False,
                 dropout=0,
                 skip_connect=True):
        super(CNNFsmn, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.skip_connect = skip_connect

        if lorder is None:
            return

        self.lorder = lorder
        self.hidden_size = hidden_size

        self.linear = nn.Linear(input_dim, hidden_size)
        self.act = nn.ReLU()
        kernel_size = (3, 8)
        stride = (1, 4)
        self.conv = nn.Sequential(
            nn.ConstantPad2d((stride[1], 0, kernel_size[0] - 1, 0), 0),
            nn.Conv2d(1, stride[1], kernel_size=kernel_size, stride=stride))

        self.dconv = nn.Conv2d(
            output_dim,
            output_dim, [lorder, 1],
            dilation=[dilation, 1],
            groups=output_dim,
            bias=False)
        self.padding_left = dilation * (lorder - 1)

    def forward(self, input):
        return self.compute2(input)

    def compute1(self, input):
        ''' linear-relu(norm)-conv2d-relu?-dconv
        '''
        # [B, T, F]
        x = self.linear(input)
        x = self.act(x)
        x = th.unsqueeze(x, 1)
        x = self.conv(x)
        # [B, C, T, F] -> [B, 1, T, F]
        b, c, t, f = x.shape
        x = x.view([b, 1, t, -1])
        x = x.permute(0, 3, 2, 1)
        # [B, F, T+lorder-1, 1]
        y = F.pad(x, [0, 0, self.padding_left, 0])
        out = self.dconv(y)
        if self.skip_connect:
            out = out + x
        out = out.permute(0, 3, 2, 1).squeeze()
        return input + out

    def compute2(self, input):
        ''' conv2d-relu-linear-relu?-dconv
        '''
        x = th.unsqueeze(input, 1)
        x = self.conv(x)
        x = self.act(x)
        # [B, C, T, F] -> [B, T, F]
        b, c, t, f = x.shape
        x = x.view([b, t, -1])
        x = self.linear(x)
        x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
        y = F.pad(x, [0, 0, self.padding_left, 0])
        out = self.dconv(y)
        if self.skip_connect:
            out = out + x
        out = out.permute(0, 3, 2, 1).squeeze()
        return input + out


class UniDeepFsmn(LayerBase):

    def __init__(self,
                 input_dim,
                 output_dim,
                 lorder=None,
                 hidden_size=None,
                 dilation=1,
                 layer_norm=False,
                 dropout=0,
                 skip_connect=True):
        super(UniDeepFsmn, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.skip_connect = skip_connect

        if lorder is None:
            return

        self.lorder = lorder
        self.hidden_size = hidden_size

        self.linear = nn.Linear(input_dim, hidden_size)
        self.norm = nn.Identity()
        if layer_norm:
            self.norm = nn.LayerNorm(input_dim)
        self.act = nn.ReLU()
        self.project = nn.Linear(hidden_size, output_dim, bias=False)

        self.conv1 = nn.Conv2d(
            output_dim,
            output_dim, [lorder, 1],
            dilation=[dilation, 1],
            groups=output_dim,
            bias=False)
        self.padding_left = dilation * (lorder - 1)

    def forward(self, input):
        return self.compute1(input)

    def compute1(self, input):
        ''' linear-relu(norm)-linear-dconv
        '''
        # [B, T, F]
        x = self.linear(input)
        x = self.act(x)
        x = self.norm(x)
        x = self.project(x)
        x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
        # [B, F, T+lorder-1, 1]
        y = F.pad(x, [0, 0, self.padding_left, 0])
        out = self.conv1(y)
        if self.skip_connect:
            out = out + x
        out = out.permute(0, 3, 2, 1).squeeze()

        return input + out

    def compute2(self, input):
        ''' linear-dconv-linear-relu(norm)
        '''
        x = self.project(input)
        x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
        y = F.pad(x, [0, 0, self.padding_left, 0])
        out = self.conv1(y)
        if self.skip_connect:
            out = out + x
        out = out.permute(0, 3, 2, 1).squeeze()
        x = self.linear(out)
        x = self.act(x)
        x = self.norm(x)

        return input + x

    def compute3(self, input):
        ''' dconv-linear-relu(norm)-linear
        '''
        x = th.unsqueeze(input, 1).permute(0, 3, 2, 1)
        y = F.pad(x, [0, 0, self.padding_left, 0])
        out = self.conv1(y)
        if self.skip_connect:
            out = out + x
        out = out.permute(0, 3, 2, 1).squeeze()
        x = self.linear(out)
        x = self.act(x)
        x = self.norm(x)
        x = self.project(x)

        return input + x

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<UniDeepFsmn> %d %d\n' \
                  % (self.output_dim, self.input_dim)
        re_str += '<LearnRateCoef> %d <HidSize> %d <LOrder> %d <LStride> %d <MaxNorm> 0\n' \
                  % (1, self.hidden_size, self.lorder, 1)
        lfiters = self.state_dict()['conv1.weight']
        x = np.flipud(lfiters.squeeze().numpy().T)
        re_str += to_kaldi_matrix(x)
        proj_weights = self.state_dict()['project.weight']
        x = proj_weights.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        linear_weights = self.state_dict()['linear.weight']
        x = linear_weights.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        linear_bias = self.state_dict()['linear.bias']
        x = linear_bias.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        return re_str

    def to_raw_nnet(self, fid):
        lfiters = self.state_dict()['conv1.weight']
        x = np.flipud(lfiters.squeeze().numpy().T)
        x.tofile(fid)

        proj_weights = self.state_dict()['project.weight']
        x = proj_weights.squeeze().numpy()
        x.tofile(fid)

        linear_weights = self.state_dict()['linear.weight']
        x = linear_weights.squeeze().numpy()
        x.tofile(fid)

        linear_bias = self.state_dict()['linear.bias']
        x = linear_bias.squeeze().numpy()
        x.tofile(fid)

    def load_kaldi_nnet(self, instr):
        output = expect_token_number(
            instr,
            '<LearnRateCoef>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <LearnRateCoef>')
        instr, lr = output

        output = expect_token_number(
            instr,
            '<HidSize>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <HidSize>')
        instr, hiddensize = output
        self.hidden_size = int(hiddensize)

        output = expect_token_number(
            instr,
            '<LOrder>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <LOrder>')
        instr, lorder = output
        self.lorder = int(lorder)

        output = expect_token_number(
            instr,
            '<LStride>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <LStride>')
        instr, lstride = output
        self.lstride = lstride

        output = expect_token_number(
            instr,
            '<MaxNorm>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <MaxNorm>')

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        mat1 = np.fliplr(mat.T).copy()

        self.conv1 = nn.Conv2d(
            self.output_dim,
            self.output_dim, [self.lorder, 1], [1, 1],
            groups=self.output_dim,
            bias=False)

        mat_th = th.from_numpy(mat1).type(th.FloatTensor)
        mat_th = mat_th.unsqueeze(1)
        mat_th = mat_th.unsqueeze(3)
        self.conv1.weight = th.nn.Parameter(mat_th)

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output

        self.project = nn.Linear(self.hidden_size, self.output_dim, bias=False)
        self.linear = nn.Linear(self.input_dim, self.hidden_size)

        self.project.weight = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        self.linear.weight = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        mat = np.squeeze(mat)
        self.linear.bias = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        return instr
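
For orientation, a hedged sketch of the default `compute1` path through `UniDeepFsmn` (import path assumed from the file location above):

```python
# Hedged sketch: the causal FSMN block on a random batch; not part of the commit.
import torch as th
from modelscope.models.audio.layers.uni_deep_fsmn import UniDeepFsmn  # assumed path

block = UniDeepFsmn(input_dim=16, output_dim=16, lorder=20, hidden_size=32)
x = th.randn(4, 100, 16)   # [batch, frames, features]
y = block(x)               # compute1: linear -> relu -> project -> causal dconv, residual
print(y.shape)             # torch.Size([4, 100, 16])
```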
0	modelscope/models/audio/network/__init__.py Normal file
394	modelscope/models/audio/network/loss.py Normal file
@@ -0,0 +1,394 @@
import torch
import torch.nn.functional as F

from .modulation_loss import (GaborSTRFConv, MelScale,
                              ModulationDomainLossModule)

EPS = 1e-8


def compute_mask(mixed_spec, clean_spec, mask_type='psmiam', clip=1):
    '''
    stft: (batch, ..., 2) or complex (batch, ...)
    y = x + n
    '''
    if torch.is_complex(mixed_spec):
        yr, yi = mixed_spec.real, mixed_spec.imag
    else:
        yr, yi = mixed_spec[..., 0], mixed_spec[..., 1]
    if torch.is_complex(clean_spec):
        xr, xi = clean_spec.real, clean_spec.imag
    else:
        xr, xi = clean_spec[..., 0], clean_spec[..., 1]

    if mask_type == 'iam':
        ymag = torch.sqrt(yr**2 + yi**2)
        xmag = torch.sqrt(xr**2 + xi**2)
        iam = xmag / (ymag + EPS)
        return torch.clamp(iam, 0, 1)

    elif mask_type == 'psm':
        ypow = yr**2 + yi**2
        psm = (xr * yr + xi * yi) / (ypow + EPS)
        return torch.clamp(psm, 0, 1)

    elif mask_type == 'psmiam':
        ypow = yr**2 + yi**2
        psm = (xr * yr + xi * yi) / (ypow + EPS)
        ymag = torch.sqrt(yr**2 + yi**2)
        xmag = torch.sqrt(xr**2 + xi**2)
        iam = xmag / (ymag + EPS)
        psmiam = psm * iam
        return torch.clamp(psmiam, 0, 1)

    elif mask_type == 'crm':
        ypow = yr**2 + yi**2
        mr = (xr * yr + xi * yi) / (ypow + EPS)
        mi = (xi * yr - xr * yi) / (ypow + EPS)
        mr = torch.clamp(mr, -clip, clip)
        mi = torch.clamp(mi, -clip, clip)
        return mr, mi


def energy_vad(spec,
               thdhigh=320 * 600 * 600 * 2,
               thdlow=320 * 300 * 300 * 2,
               int16=True):
    '''
    energy-based vad should be accurate enough
    spec: (batch, bins, frames, 2)
    returns (batch, frames)
    '''
    energy = torch.sum(spec[..., 0]**2 + spec[..., 1]**2, dim=1)
    vad = energy > thdhigh
    idx = torch.logical_and(vad == 0, energy > thdlow)
    vad[idx] = 0.5
    return vad


def modulation_loss_init(n_fft):
    gabor_strf_parameters = torch.load(
        './network/gabor_strf_parameters.pt')['state_dict']
    gabor_modulation_kernels = GaborSTRFConv(supn=30, supk=30, nkern=60)
    gabor_modulation_kernels.load_state_dict(gabor_strf_parameters)

    modulation_loss_module = ModulationDomainLossModule(
        gabor_modulation_kernels.eval())
    for param in modulation_loss_module.parameters():
        param.requires_grad = False

    stft2mel = MelScale(
        n_mels=80, sample_rate=16000, n_stft=n_fft // 2 + 1).cuda()

    return modulation_loss_module, stft2mel


def mask_loss_function(
        loss_func='psm_loss',
        loss_type='mse',  # ['mse', 'mae', 'comb']
        mask_type='psmiam',
        use_mod_loss=False,
        use_wav2vec_loss=False,
        n_fft=640,
        hop_length=320,
        EPS=1e-8,
        weight=None):
    if weight is not None:
        print(f'Use loss weight: {weight}')
    winlen = n_fft
    window = torch.hamming_window(winlen, periodic=False)

    def stft(x, return_complex=False):
        # returns [batch, bins, frames, 2]
        return torch.stft(
            x,
            n_fft,
            hop_length,
            winlen,
            window=window.to(x.device),
            center=False,
            return_complex=return_complex)

    def istft(x, slen):
        return torch.istft(
            x,
            n_fft,
            hop_length,
            winlen,
            window=window.to(x.device),
            center=False,
            length=slen)

    def mask_loss(targets, masks, nframes):
        ''' [Batch, Time, Frequency]
        '''
        with torch.no_grad():
            mask_for_loss = torch.ones_like(targets)
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        masks = masks * mask_for_loss
        targets = targets * mask_for_loss

        if weight is None:
            alpha = 1
        else:  # for aec ST
            alpha = weight - targets

        if loss_type == 'mse':
            loss = 0.5 * torch.sum(alpha * torch.pow(targets - masks, 2))
        elif loss_type == 'mae':
            loss = torch.sum(alpha * torch.abs(targets - masks))
        else:  # mse(mask) : mae(mask) is approx 1:2
            loss = 0.5 * torch.sum(alpha * torch.pow(targets - masks, 2)
                                   + 0.1 * alpha * torch.abs(targets - masks))
        loss /= torch.sum(nframes)
        return loss

    def spectrum_loss(targets, spec, nframes):
        ''' [Batch, Time, Frequency, 2]
        '''
        with torch.no_grad():
            mask_for_loss = torch.ones_like(targets[..., 0])
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        xr = spec[..., 0] * mask_for_loss
        xi = spec[..., 1] * mask_for_loss
        yr = targets[..., 0] * mask_for_loss
        yi = targets[..., 1] * mask_for_loss
        xmag = torch.sqrt(spec[..., 0]**2 + spec[..., 1]**2) * mask_for_loss
        ymag = torch.sqrt(targets[..., 0]**2
                          + targets[..., 1]**2) * mask_for_loss

        loss1 = torch.sum(torch.pow(xr - yr, 2) + torch.pow(xi - yi, 2))
        loss2 = torch.sum(torch.pow(xmag - ymag, 2))

        loss = (loss1 + loss2) / torch.sum(nframes)
        return loss

    def sa_loss_dlen(mixed, clean, masks, nframes):
        yspec = stft(mixed).permute([0, 2, 1, 3]) / 32768
        xspec = stft(clean).permute([0, 2, 1, 3]) / 32768
        with torch.no_grad():
            mask_for_loss = torch.ones_like(xspec[..., 0])
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        emag = ((yspec[..., 0]**2 + yspec[..., 1]**2)**0.15) * (masks**0.3)
        xmag = (xspec[..., 0]**2 + xspec[..., 1]**2)**0.15
        emag = emag * mask_for_loss
        xmag = xmag * mask_for_loss

        loss = torch.sum(torch.pow(emag - xmag, 2)) / torch.sum(nframes)
        return loss

    def psm_vad_loss_dlen(mixed, clean, masks, nframes, subtask=None):
        mixed_spec = stft(mixed)
        clean_spec = stft(clean)
        targets = compute_mask(mixed_spec, clean_spec, mask_type)
        # [B, T, F]
        targets = targets.permute(0, 2, 1)

        loss = mask_loss(targets, masks, nframes)

        if subtask is not None:
            vadtargets = energy_vad(clean_spec)
            with torch.no_grad():
                mask_for_loss = torch.ones_like(targets[:, :, 0])
                for idx, num in enumerate(nframes):
                    mask_for_loss[idx, num:] = 0
            subtask = subtask[:, :, 0] * mask_for_loss
            vadtargets = vadtargets * mask_for_loss

            loss_vad = F.binary_cross_entropy(subtask, vadtargets)
            return loss + loss_vad
        return loss

    def modulation_loss(mixed, clean, masks, nframes, subtask=None):
        # NOTE: relies on modulation_loss_module and stft2mel (e.g. as returned
        # by modulation_loss_init) being available in the enclosing scope.
        mixed_spec = stft(mixed, True)
        clean_spec = stft(clean, True)
        enhanced_mag = torch.abs(mixed_spec)
        clean_mag = torch.abs(clean_spec)
        with torch.no_grad():
            mask_for_loss = torch.ones_like(clean_mag)
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, :, num:] = 0
        clean_mag = clean_mag * mask_for_loss
        enhanced_mag = enhanced_mag * mask_for_loss * masks.permute([0, 2, 1])

        # Convert to the log-mel representation
        # (B, T, #mel_channels)
        clean_log_mel = torch.log(
            torch.transpose(stft2mel(clean_mag**2), 2, 1) + 1e-8)
        enhanced_log_mel = torch.log(
            torch.transpose(stft2mel(enhanced_mag**2), 2, 1) + 1e-8)

        alpha = compute_mask(mixed_spec, clean_spec, mask_type)
        alpha = alpha.permute(0, 2, 1)
        loss = 0.05 * modulation_loss_module(enhanced_log_mel, clean_log_mel,
                                             alpha)
        loss2 = psm_vad_loss_dlen(mixed, clean, masks, nframes, subtask)
        # print(loss.item(), loss2.item())  # approx 1:4
        loss = loss + loss2
        return loss

    def wav2vec_loss(mixed, clean, masks, nframes, subtask=None):
        # NOTE: relies on a wav2vec_loss_module being defined in the enclosing scope.
        mixed /= 32768
        clean /= 32768
        mixed_spec = stft(mixed)
        with torch.no_grad():
            mask_for_loss = torch.ones_like(masks)
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        masks_est = masks * mask_for_loss

        estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3)
        est_clean = istft(estimate, clean.shape[1])
        loss = wav2vec_loss_module(est_clean, clean)
        return loss

    def sisdr_loss_dlen(mixed,
                        clean,
                        masks,
                        nframes,
                        subtask=None,
                        zero_mean=True):
        mixed_spec = stft(mixed)
        with torch.no_grad():
            mask_for_loss = torch.ones_like(masks)
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        masks_est = masks * mask_for_loss

        estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3)
        est_clean = istft(estimate, clean.shape[1])
        flen = min(clean.shape[1], est_clean.shape[1])
        clean = clean[:, :flen]
        est_clean = est_clean[:, :flen]

        # follow asteroid/losses/sdr.py
        if zero_mean:
            clean = clean - torch.mean(clean, dim=1, keepdim=True)
            est_clean = est_clean - torch.mean(est_clean, dim=1, keepdim=True)

        dot = torch.sum(est_clean * clean, dim=1, keepdim=True)
        s_clean_energy = torch.sum(clean**2, dim=1, keepdim=True) + EPS
        scaled_clean = dot * clean / s_clean_energy
        e_noise = est_clean - scaled_clean

        # [batch]
        sisdr = torch.sum(
            scaled_clean**2, dim=1) / (
                torch.sum(e_noise**2, dim=1) + EPS)
        sisdr = -10 * torch.log10(sisdr + EPS)
        loss = sisdr.mean()
        return loss

    def sisdr_freq_loss_dlen(mixed, clean, masks, nframes, subtask=None):
        mixed_spec = stft(mixed)
        clean_spec = stft(clean)
        with torch.no_grad():
            mask_for_loss = torch.ones_like(masks)
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        masks_est = masks * mask_for_loss

        estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3)

        dot_real = estimate[..., 0] * clean_spec[..., 0] + \
            estimate[..., 1] * clean_spec[..., 1]
        dot_imag = estimate[..., 0] * clean_spec[..., 1] - \
            estimate[..., 1] * clean_spec[..., 0]
        dot = torch.cat([dot_real.unsqueeze(3), dot_imag.unsqueeze(3)], dim=-1)
        s_clean_energy = clean_spec[..., 0] ** 2 + \
            clean_spec[..., 1] ** 2 + EPS
        scaled_clean = dot * clean_spec / s_clean_energy.unsqueeze(3)
        e_noise = estimate - scaled_clean

        # [batch]
        scaled_clean_energy = torch.sum(
            scaled_clean[..., 0]**2 + scaled_clean[..., 1]**2, dim=1)
        e_noise_energy = torch.sum(
            e_noise[..., 0]**2 + e_noise[..., 1]**2, dim=1)
        sisdr = torch.sum(
            scaled_clean_energy, dim=1) / (
                torch.sum(e_noise_energy, dim=1) + EPS)
        sisdr = -10 * torch.log10(sisdr + EPS)
        loss = sisdr.mean()
        return loss

    def crm_loss_dlen(mixed, clean, masks, nframes, subtask=None):
        mixed_spec = stft(mixed).permute([0, 2, 1, 3])
        clean_spec = stft(clean).permute([0, 2, 1, 3])
        mixed_spec = mixed_spec / 32768
        clean_spec = clean_spec / 32768
        tgt_mr, tgt_mi = compute_mask(mixed_spec, clean_spec, mask_type='crm')

        D = int(masks.shape[2] / 2)
        with torch.no_grad():
            mask_for_loss = torch.ones_like(clean_spec[..., 0])
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        mr = masks[..., :D] * mask_for_loss
        mi = masks[..., D:] * mask_for_loss
        tgt_mr = tgt_mr * mask_for_loss
        tgt_mi = tgt_mi * mask_for_loss

        if weight is None:
            alpha = 1
        else:
            alpha = weight - tgt_mr
        # signal approximation
        yr = mixed_spec[..., 0]
        yi = mixed_spec[..., 1]
        loss1 = torch.sum(alpha * torch.pow((mr * yr - mi * yi) - clean_spec[..., 0], 2)) \
            + torch.sum(alpha * torch.pow((mr * yi + mi * yr) - clean_spec[..., 1], 2))
        # mask approximation
        loss2 = torch.sum(alpha * torch.pow(mr - tgt_mr, 2)) \
            + torch.sum(alpha * torch.pow(mi - tgt_mi, 2))
        loss = 0.5 * (loss1 + loss2) / torch.sum(nframes)
        return loss

    def crm_miso_loss_dlen(mixed, clean, masks, nframes):
        return crm_loss_dlen(mixed[..., 0], clean[..., 0], masks, nframes)

    def mimo_loss_dlen(mixed, clean, masks, nframes):
        chs = mixed.shape[-1]
        D = masks.shape[2] // chs
        loss = psm_vad_loss_dlen(mixed[..., 0], clean[..., 0], masks[..., :D],
                                 nframes)
        for ch in range(1, chs):
            loss1 = psm_vad_loss_dlen(mixed[..., ch], clean[..., ch],
                                      masks[..., ch * D:ch * D + D], nframes)
            loss = loss + loss1
        return loss / chs

    def spec_loss_dlen(mixed, clean, spec, nframes):
        clean_spec = stft(clean).permute([0, 2, 1, 3])
        clean_spec = clean_spec / 32768

        D = spec.shape[2] // 2
        spec_est = torch.cat([spec[..., :D, None], spec[..., D:, None]],
                             dim=-1)
        loss = spectrum_loss(clean_spec, spec_est, nframes)
        return loss

    if loss_func == 'psm_vad_loss_dlen':
        return psm_vad_loss_dlen
    elif loss_func == 'sisdr_loss_dlen':
        return sisdr_loss_dlen
    elif loss_func == 'sisdr_freq_loss_dlen':
        return sisdr_freq_loss_dlen
    elif loss_func == 'crm_loss_dlen':
        return crm_loss_dlen
    elif loss_func == 'modulation_loss':
        return modulation_loss
    elif loss_func == 'wav2vec_loss':
        return wav2vec_loss
    elif loss_func == 'mimo_loss_dlen':
        return mimo_loss_dlen
    elif loss_func == 'spec_loss_dlen':
        return spec_loss_dlen
    elif loss_func == 'sa_loss_dlen':
        return sa_loss_dlen
    else:
        print('error loss func')
        return None
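
A hedged sketch of how the factory above might be exercised on dummy tensors, under the commit's PyTorch 1.10 environment; the import path and shapes are assumptions taken from the code's own comments:

```python
# Hypothetical smoke test of the PSM mask loss; not part of the commit.
import torch
from modelscope.models.audio.network.loss import mask_loss_function  # assumed path

loss_fn = mask_loss_function(loss_func='psm_vad_loss_dlen', n_fft=640, hop_length=320)

mixed = torch.randn(2, 16000)            # [batch, samples]
clean = 0.5 * mixed                      # stand-in for the clean reference signal
n_frames = (16000 - 640) // 320 + 1      # 49 frames with center=False framing
masks = torch.rand(2, n_frames, 321)     # [batch, frames, bins], bins = n_fft // 2 + 1
loss = loss_fn(mixed, clean, masks, torch.tensor([n_frames, n_frames]))
print(loss.item())
```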
248
modelscope/models/audio/network/modulation_loss.py
Normal file
248
modelscope/models/audio/network/modulation_loss.py
Normal file
@@ -0,0 +1,248 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchaudio.transforms import MelScale


class ModulationDomainLossModule(torch.nn.Module):
    """Modulation-domain loss function developed in [1] for supervised speech enhancement.

    In our paper, we used the Gabor-based STRF kernels as the modulation kernels and the
    log-mel spectrogram as the input spectrogram representation.
    Specific parameter details are in the paper and in the example below.

    Parameters
    ----------
    modulation_kernels: nn.Module
        Differentiable module that transforms a spectrogram representation to the modulation domain

        modulation_domain = modulation_kernels(input_tf_representation)
        Input spectrogram representation (B, T, F) ---> |(M) modulation_kernels| ---> modulation domain (B, M, T', F')

    norm: boolean
        Normalizes the modulation-domain representation to be zero-mean across time

    [1] T. Vuong, Y. Xia, and R. M. Stern, "A modulation-domain loss for neural-network-based
        real-time speech enhancement", accepted at ICASSP 2021, https://arxiv.org/abs/2102.07330
    """

    def __init__(self, modulation_kernels, norm=True):
        super(ModulationDomainLossModule, self).__init__()

        self.modulation_kernels = modulation_kernels
        self.mse = nn.MSELoss(reduce=False)
        self.norm = norm

    def forward(self, enhanced_spect, clean_spect, weight=None):
        """Calculate the modulation-domain loss.

        Args:
            enhanced_spect (Tensor): spectrogram representation of the enhanced signal (B, #frames, #freq_channels).
            clean_spect (Tensor): spectrogram representation of the clean ground-truth signal (B, #frames, #freq_channels).

        Returns:
            Tensor: modulation-domain loss value.
        """
        clean_mod = self.modulation_kernels(clean_spect)
        enhanced_mod = self.modulation_kernels(enhanced_spect)

        if self.norm:
            mean_clean_mod = torch.mean(clean_mod, dim=2)
            mean_enhanced_mod = torch.mean(enhanced_mod, dim=2)

            clean_mod = clean_mod - mean_clean_mod.unsqueeze(2)
            enhanced_mod = enhanced_mod - mean_enhanced_mod.unsqueeze(2)

        if weight is None:
            alpha = 1
        else:  # TF-mask weight
            alpha = 1 + torch.sum(weight, dim=-1, keepdim=True).unsqueeze(1)
        mod_mse_loss = self.mse(enhanced_mod, clean_mod) * alpha
        mod_mse_loss = torch.mean(
            torch.sum(mod_mse_loss, dim=(1, 2, 3))
            / torch.sum(clean_mod**2, dim=(1, 2, 3)))

        return mod_mse_loss


class ModulationDomainNCCLossModule(torch.nn.Module):
    """Modulation-domain NCC loss function developed in [1] for supervised speech enhancement.

    # Based on "Speech Intelligibility Prediction Using Spectro-Temporal Modulation Analysis"

    In our paper, we used the Gabor-based STRF kernels as the modulation kernels and the
    log-mel spectrogram as the input spectrogram representation.
    Specific parameter details are in the paper and in the example below.

    Parameters
    ----------
    modulation_kernels: nn.Module
        Differentiable module that transforms a spectrogram representation to the modulation domain

        modulation_domain = modulation_kernels(input_tf_representation)
        Input spectrogram representation (B, T, F) --- (M) modulation_kernels ---> modulation domain (B, M, T', F')

    [1]
    """

    def __init__(self, modulation_kernels):
        super(ModulationDomainNCCLossModule, self).__init__()

        self.modulation_kernels = modulation_kernels
        self.mse = nn.MSELoss(reduce=False)

    def forward(self, enhanced_spect, clean_spect):
        """Calculate the modulation-domain loss.

        Args:
            enhanced_spect (Tensor): spectrogram representation of the enhanced signal (B, #frames, #freq_channels).
            clean_spect (Tensor): spectrogram representation of the clean ground-truth signal (B, #frames, #freq_channels).

        Returns:
            Tensor: modulation-domain loss value.
        """
        clean_mod = self.modulation_kernels(clean_spect)
        enhanced_mod = self.modulation_kernels(enhanced_spect)
        mean_clean_mod = torch.mean(clean_mod, dim=2)
        mean_enhanced_mod = torch.mean(enhanced_mod, dim=2)

        normalized_clean = clean_mod - mean_clean_mod.unsqueeze(2)
        normalized_enhanced = enhanced_mod - mean_enhanced_mod.unsqueeze(2)

        inner_product = torch.sum(
            normalized_clean * normalized_enhanced, dim=2)
        normalized_denom = (torch.sum(
            normalized_clean * normalized_clean, dim=2))**.5 * (torch.sum(
                normalized_enhanced * normalized_enhanced, dim=2))**.5

        ncc = inner_product / normalized_denom
        mod_mse_loss = torch.mean((ncc - 1.0)**2)

        return mod_mse_loss


class GaborSTRFConv(nn.Module):
    """Gabor-STRF-based cross-correlation kernel."""

    def __init__(self,
                 supn,
                 supk,
                 nkern,
                 rates=None,
                 scales=None,
                 norm_strf=True,
                 real_only=False):
        """Instantiate a Gabor-based STRF convolution layer.

        Parameters
        ----------
        supn: int
            Time support in number of frames. Also the window length.
        supk: int
            Frequency support in number of channels. Also the window length.
        nkern: int
            Number of kernels, each with a learnable rate and scale.
        rates: list of float, None
            Initial values for temporal modulation.
        scales: list of float, None
            Initial values for spectral modulation.
        norm_strf: boolean
            Normalize STRF kernels to unit length.
        real_only: boolean
            If True, nkern REAL Gabor-STRF kernels.
            If False, nkern//2 REAL and nkern//2 IMAGINARY Gabor-STRF kernels.
        """
        super(GaborSTRFConv, self).__init__()
        self.numN = supn
        self.numK = supk
        self.numKern = nkern
        self.real_only = real_only
        self.norm_strf = norm_strf

        if not real_only:
            nkern = nkern // 2

        if supk % 2 == 0:  # force odd number
            supk += 1
        self.supk = torch.arange(supk, dtype=torch.float32)
        if supn % 2 == 0:  # force odd number
            supn += 1
        self.supn = torch.arange(supn, dtype=self.supk.dtype)
        self.padding = (supn // 2, supk // 2)
        # Set up learnable parameters
        # for param in (rates, scales):
        #     assert (not param) or len(param) == nkern
        if not rates:
            rates = torch.rand(nkern) * math.pi / 2.0
        if not scales:
            scales = (torch.rand(nkern) * 2.0 - 1.0) * math.pi / 2.0

        self.rates_ = nn.Parameter(torch.Tensor(rates))
        self.scales_ = nn.Parameter(torch.Tensor(scales))

    def strfs(self):
        """Make STRFs using the current parameters."""
        if self.supn.device != self.rates_.device:  # for first run
            self.supn = self.supn.to(self.rates_.device)
            self.supk = self.supk.to(self.rates_.device)
        n0, k0 = self.padding

        nwind = .5 - .5 * \
            torch.cos(2 * math.pi * (self.supn + 1) / (len(self.supn) + 1))
        kwind = .5 - .5 * \
            torch.cos(2 * math.pi * (self.supk + 1) / (len(self.supk) + 1))

        new_wind = torch.matmul(nwind.unsqueeze(-1), kwind.unsqueeze(0))

        n_n_0 = self.supn - n0
        k_k_0 = self.supk - k0
        n_mult = torch.matmul(
            n_n_0.unsqueeze(1),
            torch.ones((1, len(self.supk))).type(torch.FloatTensor).to(
                self.rates_.device))
        k_mult = torch.matmul(
            torch.ones((len(self.supn),
                        1)).type(torch.FloatTensor).to(self.rates_.device),
            k_k_0.unsqueeze(0))

        inside = self.rates_.unsqueeze(1).unsqueeze(
            1) * n_mult + self.scales_.unsqueeze(1).unsqueeze(1) * k_mult
        real_strf = torch.cos(inside) * new_wind.unsqueeze(0)

        if self.real_only:
            final_strf = real_strf
        else:
            imag_strf = torch.sin(inside) * new_wind.unsqueeze(0)
            final_strf = torch.cat([real_strf, imag_strf], dim=0)

        if self.norm_strf:
            final_strf = final_strf / (torch.sum(
                final_strf**2, dim=(1, 2)).unsqueeze(1).unsqueeze(2))**.5

        return final_strf

    def forward(self, sigspec):
        """Forward pass a batch of (real) spectra [Batch x Time x Frequency]."""
        if len(sigspec.shape) == 2:  # expand batch dimension if single example
            sigspec = sigspec.unsqueeze(0)
        strfs = self.strfs().unsqueeze(1).type_as(sigspec)
        out = F.conv2d(sigspec.unsqueeze(1), strfs, padding=self.padding)
        return out

    def __repr__(self):
        """Gabor filter."""
        report = """
+++++ Gabor Filter Kernels [{}], supn [{}], supk [{}], real only [{}], norm strf [{}] +++++

""".format(self.numKern, self.numN, self.numK, self.real_only,
           self.norm_strf)
        return report
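A minimal usage sketch of the modules above (the "example below" that the docstring refers to is not in the diff, so this is an illustration; the kernel count, supports, and 80-channel log-mel shape are assumed values, not ones taken from the paper):

import torch

kernels = GaborSTRFConv(supn=15, supk=5, nkern=32)   # 32 learnable Gabor-STRF kernels
criterion = ModulationDomainLossModule(kernels, norm=True)
clean = torch.randn(4, 100, 80)       # (B, T, F) log-mel spectrogram of clean speech
enhanced = torch.randn(4, 100, 80)    # (B, T, F) log-mel spectrogram of enhanced speech
loss = criterion(enhanced, clean)     # scalar tensor, differentiable w.r.t. the kernels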
483  modelscope/models/audio/network/se_net.py  Normal file
@@ -0,0 +1,483 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from ..layers.activations import RectifiedLinear, Sigmoid
from ..layers.affine_transform import AffineTransform
from ..layers.deep_fsmn import DeepFsmn
from ..layers.uni_deep_fsmn import Conv2d, UniDeepFsmn


class MaskNet(nn.Module):

    def __init__(self,
                 indim,
                 outdim,
                 layers=9,
                 hidden_dim=128,
                 hidden_dim2=None,
                 lorder=20,
                 rorder=0,
                 dilation=1,
                 layer_norm=False,
                 dropout=0,
                 crm=False,
                 vad=False,
                 linearout=False):
        super(MaskNet, self).__init__()

        self.linear1 = AffineTransform(indim, hidden_dim)
        self.relu = RectifiedLinear(hidden_dim, hidden_dim)
        if hidden_dim2 is None:
            hidden_dim2 = hidden_dim

        if rorder == 0:
            repeats = [
                UniDeepFsmn(
                    hidden_dim,
                    hidden_dim,
                    lorder,
                    hidden_dim2,
                    dilation=dilation,
                    layer_norm=layer_norm,
                    dropout=dropout) for i in range(layers)
            ]
        else:
            repeats = [
                DeepFsmn(
                    hidden_dim,
                    hidden_dim,
                    lorder,
                    rorder,
                    hidden_dim2,
                    layer_norm=layer_norm,
                    dropout=dropout) for i in range(layers)
            ]
        self.deepfsmn = nn.Sequential(*repeats)

        self.linear2 = AffineTransform(hidden_dim, outdim)

        self.crm = crm
        if self.crm:
            self.sig = nn.Tanh()
        else:
            self.sig = Sigmoid(outdim, outdim)

        self.vad = vad
        if self.vad:
            self.linear3 = AffineTransform(hidden_dim, 1)

        self.layers = layers
        self.linearout = linearout
        if self.linearout and self.vad:
            print('Warning: linearout and vad are not supported together')

    def forward(self, feat, ctl=None):
        x1 = self.linear1(feat)
        x2 = self.relu(x1)
        if ctl is not None:
            ctl = min(ctl, self.layers - 1)
            for i in range(ctl):
                x2 = self.deepfsmn[i](x2)
            mask = self.sig(self.linear2(x2))
            if self.vad:
                vad = torch.sigmoid(self.linear3(x2))
                return mask, vad
            else:
                return mask
        x3 = self.deepfsmn(x2)
        if self.linearout:
            return self.linear2(x3)
        mask = self.sig(self.linear2(x3))
        if self.vad:
            vad = torch.sigmoid(self.linear3(x3))
            return mask, vad
        else:
            return mask

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<Nnet>\n'
        re_str += self.linear1.to_kaldi_nnet()
        re_str += self.relu.to_kaldi_nnet()
        for dfsmn in self.deepfsmn:
            re_str += dfsmn.to_kaldi_nnet()
        re_str += self.linear2.to_kaldi_nnet()
        re_str += self.sig.to_kaldi_nnet()
        re_str += '</Nnet>\n'

        return re_str

    def to_raw_nnet(self, fid):
        self.linear1.to_raw_nnet(fid)
        for dfsmn in self.deepfsmn:
            dfsmn.to_raw_nnet(fid)
        self.linear2.to_raw_nnet(fid)


class StageNet(nn.Module):

    def __init__(self,
                 indim,
                 outdim,
                 layers=9,
                 layers2=6,
                 hidden_dim=128,
                 lorder=20,
                 rorder=0,
                 layer_norm=False,
                 dropout=0,
                 crm=False,
                 vad=False,
                 linearout=False):
        super(StageNet, self).__init__()

        self.stage1 = nn.ModuleList()
        self.stage2 = nn.ModuleList()
        layer = nn.Sequential(nn.Linear(indim, hidden_dim), nn.ReLU())
        self.stage1.append(layer)
        for i in range(layers):
            layer = UniDeepFsmn(
                hidden_dim,
                hidden_dim,
                lorder,
                hidden_dim,
                layer_norm=layer_norm,
                dropout=dropout)
            self.stage1.append(layer)
        layer = nn.Sequential(nn.Linear(hidden_dim, 321), nn.Sigmoid())
        self.stage1.append(layer)
        # stage2
        layer = nn.Sequential(nn.Linear(321 + indim, hidden_dim), nn.ReLU())
        self.stage2.append(layer)
        for i in range(layers2):
            layer = UniDeepFsmn(
                hidden_dim,
                hidden_dim,
                lorder,
                hidden_dim,
                layer_norm=layer_norm,
                dropout=dropout)
            self.stage2.append(layer)
        layer = nn.Sequential(
            nn.Linear(hidden_dim, outdim),
            nn.Sigmoid() if not crm else nn.Tanh())
        self.stage2.append(layer)
        self.crm = crm
        self.vad = vad
        self.linearout = linearout
        self.window = torch.hamming_window(640, periodic=False).cuda()
        self.freezed = False

    def freeze(self):
        if not self.freezed:
            for param in self.stage1.parameters():
                param.requires_grad = False
            self.freezed = True
            print('froze stage1')

    def forward(self, feat, mixture, ctl=None):
        if ctl == 'off':
            x = feat
            for i in range(len(self.stage1)):
                x = self.stage1[i](x)
            return x
        else:
            self.freeze()
            x = feat
            for i in range(len(self.stage1)):
                x = self.stage1[i](x)

            spec = torch.stft(
                mixture / 32768,
                640,
                320,
                640,
                self.window,
                center=False,
                return_complex=True)
            spec = torch.view_as_real(spec).permute([0, 2, 1, 3])
            specmag = torch.sqrt(spec[..., 0]**2 + spec[..., 1]**2)
            est = x * specmag
            y = torch.cat([est, feat], dim=-1)
            for i in range(len(self.stage2)):
                y = self.stage2[i](y)
            return y


class Unet(nn.Module):

    def __init__(self,
                 indim,
                 outdim,
                 layers=9,
                 dims=[256] * 4,
                 lorder=20,
                 rorder=0,
                 dilation=1,
                 layer_norm=False,
                 dropout=0,
                 crm=False,
                 vad=False,
                 linearout=False):
        super(Unet, self).__init__()

        self.linear1 = AffineTransform(indim, dims[0])
        self.relu = RectifiedLinear(dims[0], dims[0])

        self.encoder = nn.ModuleList()
        self.decoder = nn.ModuleList()
        for i in range(len(dims) - 1):
            layer = nn.Sequential(
                nn.Linear(dims[i], dims[i + 1]), nn.ReLU(),
                nn.Linear(dims[i + 1], dims[i + 1], bias=False),
                Conv2d(
                    dims[i + 1],
                    dims[i + 1],
                    lorder,
                    groups=dims[i + 1],
                    skip_connect=True))
            self.encoder.append(layer)
        for i in range(len(dims) - 1, 0, -1):
            layer = nn.Sequential(
                nn.Linear(dims[i] * 2, dims[i - 1]), nn.ReLU(),
                nn.Linear(dims[i - 1], dims[i - 1], bias=False),
                Conv2d(
                    dims[i - 1],
                    dims[i - 1],
                    lorder,
                    groups=dims[i - 1],
                    skip_connect=True))
            self.decoder.append(layer)
        self.tf = nn.ModuleList()
        for i in range(layers - 2 * (len(dims) - 1)):
            layer = nn.Sequential(
                nn.Linear(dims[-1], dims[-1]), nn.ReLU(),
                nn.Linear(dims[-1], dims[-1], bias=False),
                Conv2d(
                    dims[-1],
                    dims[-1],
                    lorder,
                    groups=dims[-1],
                    skip_connect=True))
            self.tf.append(layer)

        self.linear2 = AffineTransform(dims[0], outdim)
        self.crm = crm
        self.act = nn.Tanh() if self.crm else nn.Sigmoid()
        self.vad = False
        self.layers = layers
        self.linearout = linearout

    def forward(self, x, ctl=None):
        x = self.linear1(x)
        x = self.relu(x)

        encoder_out = []
        for i in range(len(self.encoder)):
            x = self.encoder[i](x)
            encoder_out.append(x)
        for i in range(len(self.tf)):
            x = self.tf[i](x)
        for i in range(len(self.decoder)):
            x = torch.cat([x, encoder_out[-1 - i]], dim=-1)
            x = self.decoder[i](x)

        x = self.linear2(x)
        if self.linearout:
            return x
        return self.act(x)


class BranchNet(nn.Module):

    def __init__(self,
                 indim,
                 outdim,
                 layers=9,
                 hidden_dim=256,
                 lorder=20,
                 rorder=0,
                 dilation=1,
                 layer_norm=False,
                 dropout=0,
                 crm=False,
                 vad=False,
                 linearout=False):
        super(BranchNet, self).__init__()

        self.linear1 = AffineTransform(indim, hidden_dim)
        self.relu = RectifiedLinear(hidden_dim, hidden_dim)

        self.convs = nn.ModuleList()
        self.deepfsmn = nn.ModuleList()
        self.FREQ = nn.ModuleList()
        self.TIME = nn.ModuleList()
        self.br1 = nn.ModuleList()
        self.br2 = nn.ModuleList()
        for i in range(layers):
            '''
            layer = nn.Sequential(
                nn.Linear(hidden_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, hidden_dim, bias=False),
                Conv2d(hidden_dim, hidden_dim, lorder,
                       groups=hidden_dim, skip_connect=True)
            )
            self.deepfsmn.append(layer)
            '''
            layer = nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.ReLU())
            self.FREQ.append(layer)
            '''
            layer = nn.GRU(hidden_dim, hidden_dim,
                           batch_first=True,
                           bidirectional=False)
            self.TIME.append(layer)

            layer = nn.Sequential(
                nn.Linear(hidden_dim, hidden_dim//2, bias=False),
                Conv2d(hidden_dim//2, hidden_dim//2, lorder,
                       groups=hidden_dim//2, skip_connect=True)
            )
            self.br1.append(layer)
            layer = nn.GRU(hidden_dim, hidden_dim//2,
                           batch_first=True,
                           bidirectional=False)
            self.br2.append(layer)
            '''

        self.linear2 = AffineTransform(hidden_dim, outdim)
        self.crm = crm
        self.act = nn.Tanh() if self.crm else nn.Sigmoid()
        self.vad = False
        self.layers = layers
        self.linearout = linearout

    def forward(self, x, ctl=None):
        return self.forward_branch(x)

    def forward_sepconv(self, x):
        x = torch.unsqueeze(x, 1)
        for i in range(len(self.convs)):
            x = self.convs[i](x)
            x = F.relu(x)
        B, C, H, W = x.shape
        x = x.permute(0, 2, 1, 3)
        x = torch.reshape(x, [B, H, C * W])
        x = self.linear1(x)
        x = self.relu(x)
        for i in range(self.layers):
            x = self.deepfsmn[i](x) + x
        x = self.linear2(x)
        return self.act(x)

    def forward_branch(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        for i in range(self.layers):
            z = self.FREQ[i](x)
            x = z + x
        x = self.linear2(x)
        if self.linearout:
            return x
        return self.act(x)


class TACNet(nn.Module):
    '''Transform-average-concatenate for ad hoc distributed arrays.
    '''

    def __init__(self,
                 indim,
                 outdim,
                 layers=9,
                 hidden_dim=128,
                 lorder=20,
                 rorder=0,
                 crm=False,
                 vad=False,
                 linearout=False):
        super(TACNet, self).__init__()

        self.linear1 = AffineTransform(indim, hidden_dim)
        self.relu = RectifiedLinear(hidden_dim, hidden_dim)

        if rorder == 0:
            repeats = [
                UniDeepFsmn(hidden_dim, hidden_dim, lorder, hidden_dim)
                for i in range(layers)
            ]
        else:
            repeats = [
                DeepFsmn(hidden_dim, hidden_dim, lorder, rorder, hidden_dim)
                for i in range(layers)
            ]
        self.deepfsmn = nn.Sequential(*repeats)

        self.ch_transform = nn.ModuleList([])
        self.ch_average = nn.ModuleList([])
        self.ch_concat = nn.ModuleList([])
        for i in range(layers):
            self.ch_transform.append(
                nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.PReLU()))
            self.ch_average.append(
                nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.PReLU()))
            self.ch_concat.append(
                nn.Sequential(
                    nn.Linear(hidden_dim * 2, hidden_dim), nn.PReLU()))

        self.linear2 = AffineTransform(hidden_dim, outdim)

        self.crm = crm
        if self.crm:
            self.sig = nn.Tanh()
        else:
            self.sig = Sigmoid(outdim, outdim)

        self.vad = vad
        if self.vad:
            self.linear3 = AffineTransform(hidden_dim, 1)

        self.layers = layers
        self.linearout = linearout
        if self.linearout and self.vad:
            print('Warning: linearout and vad are not supported together')

    def forward(self, feat, ctl=None):
        B, T, F = feat.shape
        # assume 4 channels
        ch = 4
        zlist = []
        for c in range(ch):
            z = self.linear1(feat[..., c * (F // 4):(c + 1) * (F // 4)])
            z = self.relu(z)
            zlist.append(z)
        for i in range(self.layers):
            # forward
            for c in range(ch):
                zlist[c] = self.deepfsmn[i](zlist[c])

            # transform
            olist = []
            for c in range(ch):
                z = self.ch_transform[i](zlist[c])
                olist.append(z)
            # average
            avg = 0
            for c in range(ch):
                avg = avg + olist[c]
            avg = avg / ch
            avg = self.ch_average[i](avg)
            # concatenate
            for c in range(ch):
                tac = torch.cat([olist[c], avg], dim=-1)
                tac = self.ch_concat[i](tac)
                zlist[c] = zlist[c] + tac

        for c in range(ch):
            zlist[c] = self.sig(self.linear2(zlist[c]))
        mask = torch.cat(zlist, dim=-1)
        return mask

    def to_kaldi_nnet(self):
        pass
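A shape-level sketch of MaskNet, the main mask estimator above (illustrative only; the 120-dim fbank input and 321-bin mask are assumed values, not taken from a shipped config):

net = MaskNet(indim=120, outdim=321, layers=9, hidden_dim=128, vad=True)
feat = torch.randn(2, 50, 120)   # (batch, frames, feature_dim)
mask, vad = net(feat)            # mask: (2, 50, 321) in (0, 1); vad: (2, 50, 1)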
@@ -2,14 +2,13 @@
 import os.path as osp
 from abc import ABC, abstractmethod
-from typing import Dict, List, Tuple, Union
+from typing import Dict, Union
 
 from maas_hub.file_download import model_file_download
 from maas_hub.snapshot_download import snapshot_download
 
 from modelscope.models.builder import build_model
 from modelscope.utils.config import Config
-from modelscope.utils.constant import CONFIGFILE
+from modelscope.utils.constant import ModelFile
 from modelscope.utils.hub import get_model_cache_dir
 
 Tensor = Union['torch.Tensor', 'tf.Tensor']
@@ -21,16 +20,24 @@ class Model(ABC):
         self.model_dir = model_dir
 
     def __call__(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
-        return self.post_process(self.forward(input))
+        return self.postprocess(self.forward(input))
 
     @abstractmethod
     def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
         pass
 
-    def post_process(self, input: Dict[str, Tensor],
-                     **kwargs) -> Dict[str, Tensor]:
-        # model specific postprocess, implementation is optional
-        # will be called in Pipeline and evaluation loop (in the future)
+    def postprocess(self, input: Dict[str, Tensor],
+                    **kwargs) -> Dict[str, Tensor]:
+        """Model-specific postprocess that converts model output to
+        standard model outputs.
+
+        Args:
+            input: input data
+
+        Returns:
+            dict of results: a dict containing the outputs of the model; each
+            output should use the standard output name.
+        """
         return input
 
     @classmethod
@@ -47,7 +54,8 @@ class Model(ABC):
         # raise ValueError(
         #     'Remote model repo {model_name_or_path} does not exists')
 
-        cfg = Config.from_file(osp.join(local_model_dir, CONFIGFILE))
+        cfg = Config.from_file(
+            osp.join(local_model_dir, ModelFile.CONFIGURATION))
         task_name = cfg.task
         model_cfg = cfg.model
         # TODO @wenmeng.zwm: maybe the model should be manually initialized after building
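The contract this hunk establishes — `__call__` chains `forward` into `postprocess` — can be seen with a toy subclass (illustrative only, not a model that ships with the repo):

class EchoModel(Model):

    def forward(self, input):
        return {'logits': input['x']}

    def postprocess(self, input, **kwargs):
        return {'predictions': input['logits']}

m = EchoModel('/tmp/echo')               # model_dir is unused by this toy model
assert m({'x': 1}) == {'predictions': 1}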
@@ -1,4 +1,6 @@
-from .sequence_classification_model import *  # noqa F403
+from .bert_for_sequence_classification import *  # noqa F403
+from .palm_for_text_generation import *  # noqa F403
+from .sbert_for_sentence_similarity import *  # noqa F403
+from .sbert_for_token_classification import *  # noqa F403
 from .space.dialog_intent_prediction_model import *  # noqa F403
 from .space.dialog_modeling_model import *  # noqa F403
-from .text_generation_model import *  # noqa F403
@@ -1,5 +1,7 @@
 import os
 from typing import Any, Dict
 
 import json
 import numpy as np
 
 from modelscope.utils.constant import Tasks
@@ -34,6 +36,11 @@ class BertForSequenceClassification(Model):
                      ('token_type_ids', torch.LongTensor)],
         output_keys=['predictions', 'probabilities', 'logits'])
 
+        self.label_path = os.path.join(self.model_dir, 'label_mapping.json')
+        with open(self.label_path) as f:
+            self.label_mapping = json.load(f)
+        self.id2label = {idx: name for name, idx in self.label_mapping.items()}
 
     def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
         """return the result by the model
@@ -50,3 +57,13 @@ class BertForSequenceClassification(Model):
         }
         """
         return self.model.predict(input)
+
+    def postprocess(self, inputs: Dict[str, np.ndarray],
+                    **kwargs) -> Dict[str, np.ndarray]:
+        # N x num_classes
+        probs = inputs['probabilities']
+        result = {
+            'probs': probs,
+        }
+
+        return result
43  modelscope/models/nlp/palm_for_text_generation.py  Normal file
@@ -0,0 +1,43 @@
from typing import Dict

from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['PalmForTextGeneration']


@MODELS.register_module(Tasks.text_generation, module_name=r'palm2.0')
class PalmForTextGeneration(Model):

    def __init__(self, model_dir: str, *args, **kwargs):
        """initialize the text generation model from the `model_dir` path.

        Args:
            model_dir (str): the model path.
            model_cls (Optional[Any], optional): model loader; if None, use the
                default loader to load model weights, by default None.
        """
        super().__init__(model_dir, *args, **kwargs)
        self.model_dir = model_dir

        from sofa.models.palm_v2 import PalmForConditionalGeneration, Translator
        model = PalmForConditionalGeneration.from_pretrained(model_dir)
        self.tokenizer = model.tokenizer
        self.generator = Translator(model)

    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
        """return the result by the model

        Args:
            input (Dict[str, Tensor]): the preprocessed data

        Returns:
            Dict[str, Tensor]: results
            Example:
                {
                    'predictions': Tensor([[1377, 4959, 2785, 6392...]]),  # tokens to be decoded by the tokenizer
                }
        """
        return self.generator(**input)
88  modelscope/models/nlp/sbert_for_sentence_similarity.py  Normal file
@@ -0,0 +1,88 @@
import os
from typing import Any, Dict

import json
import numpy as np
import torch
from sofa import SbertModel
from sofa.models.sbert.modeling_sbert import SbertPreTrainedModel
from torch import nn

from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['SbertForSentenceSimilarity']


class SbertTextClassifier(SbertPreTrainedModel):

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config
        self.encoder = SbertModel(config, add_pooling_layer=True)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, input_ids=None, token_type_ids=None):
        outputs = self.encoder(
            input_ids,
            token_type_ids=token_type_ids,
            return_dict=None,
        )
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits


@MODELS.register_module(
    Tasks.sentence_similarity,
    module_name=r'sbert-base-chinese-sentence-similarity')
class SbertForSentenceSimilarity(Model):

    def __init__(self, model_dir: str, *args, **kwargs):
        """initialize the sentence similarity model from the `model_dir` path.

        Args:
            model_dir (str): the model path.
            model_cls (Optional[Any], optional): model loader; if None, use the
                default loader to load model weights, by default None.
        """
        super().__init__(model_dir, *args, **kwargs)
        self.model_dir = model_dir

        self.model = SbertTextClassifier.from_pretrained(
            model_dir, num_labels=2)
        self.model.eval()
        self.label_path = os.path.join(self.model_dir, 'label_mapping.json')
        with open(self.label_path) as f:
            self.label_mapping = json.load(f)
        self.id2label = {idx: name for name, idx in self.label_mapping.items()}

    def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
        """return the result by the model

        Args:
            input (Dict[str, Any]): the preprocessed data

        Returns:
            Dict[str, np.ndarray]: results
            Example:
                {
                    'predictions': array([1]),  # label: 0-negative, 1-positive
                    'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32),
                    'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32)  # raw logits
                }
        """
        input_ids = torch.tensor(input['input_ids'], dtype=torch.long)
        token_type_ids = torch.tensor(
            input['token_type_ids'], dtype=torch.long)
        with torch.no_grad():
            logits = self.model(input_ids, token_type_ids)
        probs = logits.softmax(-1).numpy()
        pred = logits.argmax(-1).numpy()
        logits = logits.numpy()
        res = {'predictions': pred, 'probabilities': probs, 'logits': logits}
        return res
56  modelscope/models/nlp/sbert_for_token_classification.py  Normal file
@@ -0,0 +1,56 @@
from typing import Any, Dict, Union

import numpy as np
import torch
from sofa import SbertConfig, SbertForTokenClassification

from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['StructBertForTokenClassification']


@MODELS.register_module(
    Tasks.word_segmentation,
    module_name=r'structbert-chinese-word-segmentation')
class StructBertForTokenClassification(Model):

    def __init__(self, model_dir: str, *args, **kwargs):
        """initialize the word segmentation model from the `model_dir` path.

        Args:
            model_dir (str): the model path.
            model_cls (Optional[Any], optional): model loader; if None, use the
                default loader to load model weights, by default None.
        """
        super().__init__(model_dir, *args, **kwargs)
        self.model_dir = model_dir
        self.model = SbertForTokenClassification.from_pretrained(
            self.model_dir)
        self.config = SbertConfig.from_pretrained(self.model_dir)

    def forward(self, input: Dict[str,
                                  Any]) -> Dict[str, Union[str, np.ndarray]]:
        """return the result by the model

        Args:
            input (Dict[str, Any]): the preprocessed data

        Returns:
            Dict[str, Union[str, np.ndarray]]: results
            Example:
                {
                    'predictions': array([1, 4]),  # label ids per token
                    'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32),  # raw logits
                    'text': '今天',
                }
        """
        input_ids = torch.tensor(input['input_ids']).unsqueeze(0)
        output = self.model(input_ids)
        logits = output.logits
        pred = torch.argmax(logits[0], dim=-1)
        pred = pred.numpy()

        rst = {'predictions': pred, 'logits': logits, 'text': input['text']}
        return rst
@@ -1,52 +0,0 @@
|
||||
from typing import Any, Dict
|
||||
|
||||
from modelscope.utils.constant import Tasks
|
||||
from ..base import Model, Tensor
|
||||
from ..builder import MODELS
|
||||
|
||||
__all__ = ['PalmForTextGenerationModel']
|
||||
|
||||
|
||||
@MODELS.register_module(Tasks.text_generation, module_name=r'palm')
|
||||
class PalmForTextGenerationModel(Model):
|
||||
|
||||
def __init__(self, model_dir: str, *args, **kwargs):
|
||||
"""initialize the text generation model from the `model_dir` path.
|
||||
|
||||
Args:
|
||||
model_dir (str): the model path.
|
||||
model_cls (Optional[Any], optional): model loader, if None, use the
|
||||
default loader to load model weights, by default None.
|
||||
"""
|
||||
from sofa import PalmTokenizer
|
||||
|
||||
super().__init__(model_dir, *args, **kwargs)
|
||||
self.model_dir = model_dir
|
||||
|
||||
from sofa.models.palm import PalmForConditionalGeneration, TextGenerator
|
||||
tokenizer = kwargs.pop('tokenizer',
|
||||
PalmTokenizer.from_pretrained(model_dir))
|
||||
model = PalmForConditionalGeneration.from_pretrained(model_dir)
|
||||
self.generator = TextGenerator(model, tokenizer)
|
||||
|
||||
def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
|
||||
"""return the result by the model
|
||||
|
||||
Args:
|
||||
input (Dict[str, Any]): the preprocessed data
|
||||
|
||||
Returns:
|
||||
Dict[str, np.ndarray]: results
|
||||
Example:
|
||||
{
|
||||
'predictions': array([1]), # lable 0-negative 1-positive
|
||||
'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32),
|
||||
'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value
|
||||
}
|
||||
"""
|
||||
|
||||
encoder_inputs = [
|
||||
input['input_ids'], input['token_type_ids'],
|
||||
input['attention_mask']
|
||||
]
|
||||
return self.generator(encoder_inputs)
|
||||
@@ -1,4 +1,4 @@
-from .audio import *  # noqa F403
+from .audio import LinearAECPipeline
 from .base import Pipeline
 from .builder import pipeline
 from .cv import *  # noqa F403

@@ -0,0 +1 @@
+from .linear_aec_pipeline import LinearAECPipeline
160  modelscope/pipelines/audio/linear_aec_pipeline.py  Normal file
@@ -0,0 +1,160 @@
import importlib
import os
from typing import Any, Dict

import numpy as np
import scipy.io.wavfile as wav
import torch
import yaml

from modelscope.preprocessors.audio import LinearAECAndFbank
from modelscope.utils.constant import ModelFile, Tasks
from ..base import Pipeline
from ..builder import PIPELINES

FEATURE_MVN = 'feature.DEY.mvn.txt'

CONFIG_YAML = 'dey_mini.yaml'


def initialize_config(module_cfg):
    r"""According to the config items, dynamically load a specific module with params.

    1. Load the module corresponding to the "module" param.
    2. Call the function (or instantiate the class) corresponding to the "main" param.
    3. Send the params (in "args") into the function (or class) when calling (or instantiating).

    Args:
        module_cfg (dict): config items, e.g.:
            {
                "module": "models.model",
                "main": "Model",
                "args": {...}
            }

    Returns:
        the loaded module.
    """
    module = importlib.import_module(module_cfg['module'])
    return getattr(module, module_cfg['main'])(**module_cfg['args'])


@PIPELINES.register_module(
    Tasks.speech_signal_process, module_name=r'speech_dfsmn_aec_psm_16k')
class LinearAECPipeline(Pipeline):
    r"""AEC inference pipeline; only a 16000 Hz sample rate is supported.

    When invoking the class via pipeline.__call__(), provide two params:
        Dict[str, Any]
            the paths of the wav files, e.g.: {
                "nearend_mic": "/your/data/near_end_mic_audio.wav",
                "farend_speech": "/your/data/far_end_speech_audio.wav"}
        output_path (str, optional): "/your/output/audio_after_aec.wav"
            the file path to write the generated audio to.
    """

    def __init__(self, model):
        r"""
        Args:
            model: model id on the ModelScope hub.
        """
        super().__init__(model=model)
        self.use_cuda = torch.cuda.is_available()
        with open(
                os.path.join(self.model, CONFIG_YAML), encoding='utf-8') as f:
            self.config = yaml.full_load(f.read())
        self.config['io']['mvn'] = os.path.join(self.model, FEATURE_MVN)
        self._init_model()
        self.preprocessor = LinearAECAndFbank(self.config['io'])

        n_fft = self.config['loss']['args']['n_fft']
        hop_length = self.config['loss']['args']['hop_length']
        winlen = n_fft
        window = torch.hamming_window(winlen, periodic=False)

        def stft(x):
            return torch.stft(
                x,
                n_fft,
                hop_length,
                winlen,
                center=False,
                window=window.to(x.device),
                return_complex=False)

        def istft(x, slen):
            return torch.istft(
                x,
                n_fft,
                hop_length,
                winlen,
                window=window.to(x.device),
                center=False,
                length=slen)

        self.stft = stft
        self.istft = istft

    def _init_model(self):
        checkpoint = torch.load(
            os.path.join(self.model, ModelFile.TORCH_MODEL_BIN_FILE),
            map_location='cpu')
        self.model = initialize_config(self.config['nnet'])
        if self.use_cuda:
            self.model = self.model.cuda()
        self.model.load_state_dict(checkpoint)

    def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        r"""The AEC process.

        Args:
            inputs: dict={'feature': Tensor, 'base': Tensor}
                'feature': the features of the input audio.
                'base': the base audio that the mask is applied to.

        Returns:
            dict:
                {
                    'output_pcm': generated audio array
                }
        """
        output_data = self._process(inputs['feature'], inputs['base'])
        return {'output_pcm': output_data}

    def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
        r"""The post process. Saves the audio to file if output_path is given.

        Args:
            inputs: dict:
                {
                    'output_pcm': generated audio array
                }
            kwargs: accepts 'output_path', the path to write the generated audio to

        Returns:
            dict:
                {
                    'output_pcm': generated audio array
                }
        """
        if 'output_path' in kwargs.keys():
            wav.write(kwargs['output_path'], self.preprocessor.SAMPLE_RATE,
                      inputs['output_pcm'].astype(np.int16))
        inputs['output_pcm'] = inputs['output_pcm'] / 32768.0
        return inputs

    def _process(self, fbanks, mixture):
        if self.use_cuda:
            fbanks = fbanks.cuda()
            mixture = mixture.cuda()
        if self.model.vad:
            with torch.no_grad():
                masks, vad = self.model(fbanks.unsqueeze(0))
                masks = masks.permute([2, 1, 0])
        else:
            with torch.no_grad():
                masks = self.model(fbanks.unsqueeze(0))
                masks = masks.permute([2, 1, 0])
        spectrum = self.stft(mixture)
        masked_spec = spectrum * masks
        masked_sig = self.istft(masked_spec, len(mixture)).cpu().numpy()
        return masked_sig
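A hypothetical end-to-end invocation of the pipeline above; the model id is a placeholder, and `output_path` is honored because `postprocess` writes the wav when it is given:

aec = LinearAECPipeline(model='damo/speech_dfsmn_aec_psm_16k')   # placeholder model id
result = aec(
    {'nearend_mic': '/your/data/near_end_mic_audio.wav',
     'farend_speech': '/your/data/far_end_speech_audio.wav'},
    output_path='/your/output/audio_after_aec.wav')
print(result['output_pcm'])   # float32 samples scaled to [-1, 1]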
@@ -12,10 +12,11 @@ from modelscope.pydatasets import PyDataset
 from modelscope.utils.config import Config
 from modelscope.utils.hub import get_model_cache_dir
 from modelscope.utils.logger import get_logger
+from .outputs import TASK_OUTPUTS
 from .util import is_model_name
 
 Tensor = Union['torch.Tensor', 'tf.Tensor']
-Input = Union[str, PyDataset, Dict, 'PIL.Image.Image', 'numpy.ndarray']
+Input = Union[str, tuple, dict, PyDataset, 'PIL.Image.Image', 'numpy.ndarray']
 InputModel = Union[str, Model]
 
 output_keys = [
@@ -106,8 +107,25 @@ class Pipeline(ABC):
         out = self.preprocess(input, **post_kwargs)
         out = self.forward(out)
         out = self.postprocess(out, **post_kwargs)
+        self._check_output(out)
         return out
 
+    def _check_output(self, input):
+        # this attribute is dynamically attached by the registry
+        # when cls is registered in the registry using the task name
+        task_name = self.group_key
+        if task_name not in TASK_OUTPUTS:
+            logger.warning(f'task {task_name} output keys are missing')
+            return
+        output_keys = TASK_OUTPUTS[task_name]
+        missing_keys = []
+        for k in output_keys:
+            if k not in input:
+                missing_keys.append(k)
+        if len(missing_keys) > 0:
+            raise ValueError(f'expected output keys are {output_keys}, '
+                             f'those {missing_keys} are missing')
+
     def preprocess(self, inputs: Input) -> Dict[str, Any]:
         """ Provide a default implementation based on preprocess_cfg; users can reimplement it
         """
@@ -125,4 +143,14 @@
 
     @abstractmethod
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        """ If the current pipeline supports model reuse, common postprocess
+        code should be written here.
+
+        Args:
+            inputs: input data
+
+        Return:
+            dict of results: a dict containing the outputs of the model; each
+            output should use the standard output name.
+        """
        raise NotImplementedError('postprocess')
@@ -3,24 +3,27 @@
 import os.path as osp
 from typing import List, Union
 
 import json
 from maas_hub.file_download import model_file_download
 
 from modelscope.models.base import Model
 from modelscope.utils.config import Config, ConfigDict
-from modelscope.utils.constant import CONFIGFILE, Tasks
+from modelscope.utils.constant import Tasks
 from modelscope.utils.registry import Registry, build_from_cfg
 from .base import Pipeline
 from .util import is_model_name
 
 PIPELINES = Registry('pipelines')
 
 DEFAULT_MODEL_FOR_PIPELINE = {
     # TaskName: (pipeline_module_name, model_repo)
-    Tasks.image_matting: ('image-matting', 'damo/image-matting-person'),
     Tasks.word_segmentation:
     ('structbert-chinese-word-segmentation',
      'damo/nlp_structbert_word-segmentation_chinese-base'),
+    Tasks.sentence_similarity:
+    ('sbert-base-chinese-sentence-similarity',
+     'damo/nlp_structbert_sentence-similarity_chinese-base'),
+    Tasks.image_matting: ('image-matting', 'damo/cv_unet_image-matting'),
     Tasks.text_classification:
     ('bert-sentiment-analysis', 'damo/bert-base-sst2'),
-    Tasks.text_generation: ('palm', 'damo/nlp_palm_text-generation_chinese'),
+    Tasks.text_generation: ('palm2.0',
+                            'damo/nlp_palm2.0_text-generation_chinese-base'),
     Tasks.image_captioning: ('ofa', None),
     Tasks.image_generation:
     ('person-image-cartoon',
@@ -1,5 +1,5 @@
 import os.path as osp
-from typing import Any, Dict, List, Tuple, Union
+from typing import Any, Dict
 
 import cv2
 import numpy as np
@@ -7,7 +7,7 @@ import PIL
 
 from modelscope.pipelines.base import Input
 from modelscope.preprocessors import load_image
-from modelscope.utils.constant import TF_GRAPH_FILE, Tasks
+from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.logger import get_logger
 from ..base import Pipeline
 from ..builder import PIPELINES
@@ -24,7 +24,7 @@ class ImageMattingPipeline(Pipeline):
         import tensorflow as tf
         if tf.__version__ >= '2.0':
             tf = tf.compat.v1
-        model_path = osp.join(self.model, TF_GRAPH_FILE)
+        model_path = osp.join(self.model, ModelFile.TF_GRAPH_FILE)
 
         config = tf.ConfigProto(allow_soft_placement=True)
         config.gpu_options.allow_growth = True
@@ -1 +1 @@
-from .image_captioning import ImageCaptionPipeline
+from .image_caption_pipeline import ImageCaptionPipeline

@@ -84,8 +84,11 @@ class ImageCaptionPipeline(Pipeline):
             s = torch.cat([s, self.eos_item])
             return s
 
-        patch_image = self.patch_resize_transform(
-            load_image(input)).unsqueeze(0)
+        if isinstance(input, Image.Image):
+            patch_image = self.patch_resize_transform(input).unsqueeze(0)
+        else:
+            patch_image = self.patch_resize_transform(
+                load_image(input)).unsqueeze(0)
         patch_mask = torch.tensor([True])
         text = 'what does the image describe?'
         src_text = encode_text(
@@ -1,4 +1,6 @@
+from .sentence_similarity_pipeline import *  # noqa F403
 from .sequence_classification_pipeline import *  # noqa F403
 from .space.dialog_intent_prediction_pipeline import *  # noqa F403
 from .space.dialog_modeling_pipeline import *  # noqa F403
 from .text_generation_pipeline import *  # noqa F403
+from .word_segmentation_pipeline import *  # noqa F403
62  modelscope/pipelines/nlp/sentence_similarity_pipeline.py  Normal file
@@ -0,0 +1,62 @@
from typing import Any, Dict, Union

import numpy as np

from modelscope.models.nlp import SbertForSentenceSimilarity
from modelscope.preprocessors import SequenceClassificationPreprocessor
from modelscope.utils.constant import Tasks
from ...models import Model
from ..base import Input, Pipeline
from ..builder import PIPELINES

__all__ = ['SentenceSimilarityPipeline']


@PIPELINES.register_module(
    Tasks.sentence_similarity,
    module_name=r'sbert-base-chinese-sentence-similarity')
class SentenceSimilarityPipeline(Pipeline):

    def __init__(self,
                 model: Union[SbertForSentenceSimilarity, str],
                 preprocessor: SequenceClassificationPreprocessor = None,
                 **kwargs):
        """use `model` and `preprocessor` to create an NLP sentence similarity pipeline for prediction

        Args:
            model (SbertForSentenceSimilarity): a model instance
            preprocessor (SequenceClassificationPreprocessor): a preprocessor instance
        """
        assert isinstance(model, str) or isinstance(model, SbertForSentenceSimilarity), \
            'model must be a single str or SbertForSentenceSimilarity'
        sc_model = model if isinstance(
            model,
            SbertForSentenceSimilarity) else Model.from_pretrained(model)
        if preprocessor is None:
            preprocessor = SequenceClassificationPreprocessor(
                sc_model.model_dir,
                first_sequence='first_sequence',
                second_sequence='second_sequence')
        super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)

        assert hasattr(self.model, 'id2label'), \
            'id2label map should be initialized in the init function.'

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
        """process the prediction results

        Args:
            inputs (Dict[str, Any]): the model prediction outputs

        Returns:
            Dict[str, str]: the prediction results
        """
        probs = inputs['probabilities'][0]
        num_classes = probs.shape[0]
        top_indices = np.argpartition(probs, -num_classes)[-num_classes:]
        cls_ids = top_indices[np.argsort(-probs[top_indices], axis=-1)]
        probs = probs[cls_ids].tolist()
        cls_names = [self.model.id2label[cid] for cid in cls_ids]
        b = 0
        return {'scores': probs[b], 'labels': cls_names[b]}
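A hypothetical invocation sketch; the model id mirrors the default registered in builder.py, and the two-sentence tuple input is an assumption based on the widened `Input` union:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

p = pipeline(
    Tasks.sentence_similarity,
    model='damo/nlp_structbert_sentence-similarity_chinese-base')
print(p(('今天天气不错', '今天天气很好')))   # -> {'scores': ..., 'labels': ...}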
@@ -1,8 +1,5 @@
 import os
-import uuid
 from typing import Any, Dict, Union
 
-import json
 import numpy as np
 
 from modelscope.models.nlp import BertForSequenceClassification
@@ -41,50 +38,29 @@ class SequenceClassificationPipeline(Pipeline):
                 second_sequence=None)
         super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)
 
-        from easynlp.utils import io
-        self.label_path = os.path.join(sc_model.model_dir,
-                                       'label_mapping.json')
-        with io.open(self.label_path) as f:
-            self.label_mapping = json.load(f)
-        self.label_id_to_name = {
-            idx: name
-            for name, idx in self.label_mapping.items()
-        }
+        assert hasattr(self.model, 'id2label'), \
+            'id2label map should be initialized in the init function.'
 
-    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
+    def postprocess(self,
+                    inputs: Dict[str, Any],
+                    topk: int = 5) -> Dict[str, str]:
         """process the prediction results
 
         Args:
-            inputs (Dict[str, Any]): _description_
+            inputs (Dict[str, Any]): input data dict
+            topk (int): return the topk classification results.
 
         Returns:
             Dict[str, str]: the prediction results
         """
+        # NxC np.ndarray
+        probs = inputs['probs'][0]
+        num_classes = probs.shape[0]
+        topk = min(topk, num_classes)
+        top_indices = np.argpartition(probs, -topk)[-topk:]
+        cls_ids = top_indices[np.argsort(probs[top_indices])]
+        probs = probs[cls_ids].tolist()
 
-        probs = inputs['probabilities']
-        logits = inputs['logits']
-        predictions = np.argsort(-probs, axis=-1)
-        preds = predictions[0]
-        b = 0
-        new_result = list()
-        for pred in preds:
-            new_result.append({
-                'pred': self.label_id_to_name[pred],
-                'prob': float(probs[b][pred]),
-                'logit': float(logits[b][pred])
-            })
-        new_results = list()
-        new_results.append({
-            'id':
-            inputs['id'][b] if 'id' in inputs else str(uuid.uuid4()),
-            'output':
-            new_result,
-            'predictions':
-            new_result[0]['pred'],
-            'probabilities':
-            ','.join([str(t) for t in inputs['probabilities'][b]]),
-            'logits':
-            ','.join([str(t) for t in inputs['logits'][b]])
-        })
+        cls_names = [self.model.id2label[cid] for cid in cls_ids]
 
-        return new_results[0]
+        return {'scores': probs, 'labels': cls_names}
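The topk selection introduced above, traced on a toy vector (illustrative only):

import numpy as np

probs = np.array([0.05, 0.6, 0.1, 0.25])
topk = 2
top_indices = np.argpartition(probs, -topk)[-topk:]    # indices of the 2 largest, unordered
cls_ids = top_indices[np.argsort(probs[top_indices])]  # ascending by probability: [3, 1]
print(probs[cls_ids].tolist())                         # [0.25, 0.6]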
@@ -1,7 +1,7 @@
 from typing import Dict, Optional, Union
 
 from modelscope.models import Model
-from modelscope.models.nlp import PalmForTextGenerationModel
+from modelscope.models.nlp import PalmForTextGeneration
 from modelscope.preprocessors import TextGenerationPreprocessor
 from modelscope.utils.constant import Tasks
 from ..base import Pipeline, Tensor
@@ -10,11 +10,11 @@ from ..builder import PIPELINES
 __all__ = ['TextGenerationPipeline']
 
 
-@PIPELINES.register_module(Tasks.text_generation, module_name=r'palm')
+@PIPELINES.register_module(Tasks.text_generation, module_name=r'palm2.0')
 class TextGenerationPipeline(Pipeline):
 
     def __init__(self,
-                 model: Union[PalmForTextGenerationModel, str],
+                 model: Union[PalmForTextGeneration, str],
                  preprocessor: Optional[TextGenerationPreprocessor] = None,
                  **kwargs):
         """use `model` and `preprocessor` to create an NLP text generation pipeline for prediction
@@ -23,16 +23,16 @@ class TextGenerationPipeline(Pipeline):
             model (PalmForTextGeneration): a model instance
             preprocessor (TextGenerationPreprocessor): a preprocessor instance
         """
-        sc_model = model if isinstance(
-            model,
-            PalmForTextGenerationModel) else Model.from_pretrained(model)
+        model = model if isinstance(
+            model, PalmForTextGeneration) else Model.from_pretrained(model)
         if preprocessor is None:
             preprocessor = TextGenerationPreprocessor(
-                sc_model.model_dir,
+                model.model_dir,
+                model.tokenizer,
                 first_sequence='sentence',
                 second_sequence=None)
-        super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)
-        self.tokenizer = preprocessor.tokenizer
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+        self.tokenizer = model.tokenizer
 
     def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, str]:
         """process the prediction results
@@ -43,17 +43,20 @@ class TextGenerationPipeline(Pipeline):
         Returns:
             Dict[str, str]: the prediction results
         """
+        replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''),
+                               ('[unused1]', ''), (r' +', ' '), ('[SEP]', ''),
+                               ('[unused2]', ''), ('[CLS]', ''), ('[UNK]', ''))
+        replace_tokens_roberta = ((r' +', ' '), ('<mask>', '<q>'),
+                                  ('<pad>', ''), ('<s>', ''), ('</s>', ''),
+                                  ('<unk>', ' '))
+
         vocab_size = len(self.tokenizer.vocab)
         pred_list = inputs['predictions']
         pred_ids = pred_list[0][0].cpu().numpy().tolist()
         for j in range(len(pred_ids)):
             if pred_ids[j] >= vocab_size:
                 pred_ids[j] = 100
-        pred = self.tokenizer.convert_ids_to_tokens(pred_ids)
-        pred_string = ''.join(pred).replace(
-            '##',
-            '').split('[SEP]')[0].replace('[CLS]',
-                                          '').replace('[SEP]',
-                                                      '').replace('[UNK]', '')
-        return {'pred_string': pred_string}
+        pred_string = self.tokenizer.decode(pred_ids)
+        for _old, _new in replace_tokens_bert:
+            pred_string = pred_string.replace(_old, _new)
+        pred_string = pred_string.strip()
+        for _old, _new in replace_tokens_roberta:
+            pred_string = pred_string.replace(_old, _new)
+        pred_string = pred_string.strip()
+        return {'text': pred_string}
69  modelscope/pipelines/nlp/word_segmentation_pipeline.py  Normal file
@@ -0,0 +1,69 @@
from typing import Any, Dict, Optional, Union

from modelscope.models import Model
from modelscope.models.nlp import StructBertForTokenClassification
from modelscope.preprocessors import TokenClassifcationPreprocessor
from modelscope.utils.constant import Tasks
from ..base import Pipeline, Tensor
from ..builder import PIPELINES

__all__ = ['WordSegmentationPipeline']


@PIPELINES.register_module(
    Tasks.word_segmentation,
    module_name=r'structbert-chinese-word-segmentation')
class WordSegmentationPipeline(Pipeline):

    def __init__(self,
                 model: Union[StructBertForTokenClassification, str],
                 preprocessor: Optional[TokenClassifcationPreprocessor] = None,
                 **kwargs):
        """use `model` and `preprocessor` to create an NLP word segmentation pipeline for prediction

        Args:
            model (StructBertForTokenClassification): a model instance
            preprocessor (TokenClassifcationPreprocessor): a preprocessor instance
        """
        model = model if isinstance(
            model,
            StructBertForTokenClassification) else Model.from_pretrained(model)
        if preprocessor is None:
            preprocessor = TokenClassifcationPreprocessor(model.model_dir)
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
        self.tokenizer = preprocessor.tokenizer
        self.config = model.config
        self.id2label = self.config.id2label

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
        """process the prediction results

        Args:
            inputs (Dict[str, Any]): the model prediction outputs

        Returns:
            Dict[str, str]: the prediction results
        """
        pred_list = inputs['predictions']
        labels = []
        for pre in pred_list:
            labels.append(self.id2label[pre])
        labels = labels[1:-1]
        chunks = []
        chunk = ''
        assert len(inputs['text']) == len(labels)
        for token, label in zip(inputs['text'], labels):
            if label[0] == 'B' or label[0] == 'I':
                chunk += token
            else:
                chunk += token
                chunks.append(chunk)
                chunk = ''
        if chunk:
            chunks.append(chunk)
        seg_result = ' '.join(chunks)
        rst = {
            'output': seg_result,
        }
        return rst
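The label-to-chunk loop above, traced on a toy sentence (the BIES-style tags here are assumed; only B and I extend the current word, any other tag closes it):

text = '今天天气不错'
labels = ['B', 'E', 'B', 'E', 'B', 'E']   # hypothetical per-character tags
chunks, chunk = [], ''
for token, label in zip(text, labels):
    chunk += token
    if label[0] not in ('B', 'I'):        # 'E'/'S' close the current word
        chunks.append(chunk)
        chunk = ''
if chunk:
    chunks.append(chunk)
print(' '.join(chunks))                   # -> 今天 天气 不错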
117  modelscope/pipelines/outputs.py  Normal file
@@ -0,0 +1,117 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
TASK_OUTPUTS = {
|
||||
|
||||
# ============ vision tasks ===================
|
||||
|
||||
# image classification result for single sample
|
||||
# {
|
||||
# "labels": ["dog", "horse", "cow", "cat"],
|
||||
# "scores": [0.9, 0.1, 0.05, 0.05]
|
||||
# }
|
||||
Tasks.image_classification: ['scores', 'labels'],
|
||||
Tasks.image_tagging: ['scores', 'labels'],
|
||||
|
||||
# object detection result for single sample
|
||||
# {
|
||||
# "boxes": [
|
||||
# [x1, y1, x2, y2],
|
||||
# [x1, y1, x2, y2],
|
||||
# [x1, y1, x2, y2],
|
||||
# ],
|
||||
# "labels": ["dog", "horse", "cow", "cat"],
|
||||
# "scores": [0.9, 0.1, 0.05, 0.05]
|
||||
# }
|
||||
Tasks.object_detection: ['scores', 'labels', 'boxes'],
|
||||
|
||||
# instance segmentation result for single sample
|
||||
# {
|
||||
# "masks": [
|
||||
# np.array in bgr channel order
|
||||
# ],
|
||||
# "labels": ["dog", "horse", "cow", "cat"],
|
||||
# "scores": [0.9, 0.1, 0.05, 0.05]
|
||||
# }
|
||||
Tasks.image_segmentation: ['scores', 'labels', 'boxes'],
|
||||
|
||||
    # image generation/editing/matting result for single sample
    # {
    #   "output_png": np.array with shape (h, w, 4)
    #                 for matting, or (h, w, 3) for general purpose
    # }
    Tasks.image_editing: ['output_png'],
    Tasks.image_matting: ['output_png'],
    Tasks.image_generation: ['output_png'],

    # pose estimation result for single sample
    # {
    #   "poses": np.array with shape [num_pose, num_keypoint, 3],
    #            each keypoint is an array [x, y, score]
    #   "boxes": np.array with shape [num_pose, 4], each box is
    #            [x1, y1, x2, y2]
    # }
    Tasks.pose_estimation: ['poses', 'boxes'],

    # ============ nlp tasks ===================

    # text classification result for single sample
    # {
    #   "labels": ["happy", "sad", "calm", "angry"],
    #   "scores": [0.9, 0.1, 0.05, 0.05]
    # }
    Tasks.text_classification: ['scores', 'labels'],

    # text generation result for single sample
    # {
    #   "text": "this is text generated by a model."
    # }
    Tasks.text_generation: ['text'],

    # word segmentation result for single sample
    # {
    #   "output": "今天 天气 不错 , 适合 出去 游玩"
    # }
    Tasks.word_segmentation: ['output'],

    # sentence similarity result for single sample
    # {
    #   "labels": "1",
    #   "scores": 0.9
    # }
    Tasks.sentence_similarity: ['scores', 'labels'],

    # ============ audio tasks ===================

    # audio processed for single file in PCM format
    # {
    #   "output_pcm": np.array with shape (samples,) and dtype float32
    # }
    Tasks.speech_signal_process: ['output_pcm'],

    # ============ multi-modal tasks ===================

    # image caption result for single sample
    # {
    #   "caption": "this is an image caption text."
    # }
    Tasks.image_captioning: ['caption'],

    # visual grounding result for single sample
    # {
    #   "boxes": [
    #     [x1, y1, x2, y2],
    #     [x1, y1, x2, y2],
    #     [x1, y1, x2, y2],
    #   ],
    #   "scores": [0.9, 0.1, 0.05, 0.05]
    # }
    Tasks.visual_grounding: ['boxes', 'scores'],

    # text_to_image result for a single sample
    # {
    #   "image": np.ndarray with shape [height, width, 3]
    # }
    Tasks.text_to_image_synthesis: ['image']
}
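TASK_OUTPUTS is a plain mapping from task name to the keys a pipeline result must contain, so output conformance can be checked in a few lines. A minimal sketch (the result dict is a hypothetical pipeline output, not real model output):

from modelscope.pipelines.outputs import TASK_OUTPUTS
from modelscope.utils.constant import Tasks

result = {'output': '今天 天气 不错'}  # hypothetical word-segmentation result
missing = [k for k in TASK_OUTPUTS[Tasks.word_segmentation] if k not in result]
assert not missing, f'pipeline output is missing keys: {missing}'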
@@ -1,12 +1,23 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import os.path as osp
from typing import List, Union

import json
from maas_hub.file_download import model_file_download

from modelscope.utils.constant import CONFIGFILE
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile
from modelscope.utils.logger import get_logger

logger = get_logger()


def is_config_has_model(cfg_file):
    try:
        cfg = Config.from_file(cfg_file)
        return hasattr(cfg, 'model')
    except Exception as e:
        logger.error(f'parse config file {cfg_file} failed: {e}')
        return False


def is_model_name(model: Union[str, List]):
@@ -15,24 +26,17 @@ def is_model_name(model: Union[str, List]):

    def is_model_name_impl(model):
        if osp.exists(model):
            if osp.exists(osp.join(model, CONFIGFILE)):
                return True
            cfg_file = osp.join(model, ModelFile.CONFIGURATION)
            if osp.exists(cfg_file):
                return is_config_has_model(cfg_file)
            else:
                return False
        else:
            # try:
            #     cfg_file = model_file_download(model, CONFIGFILE)
            # except Exception:
            #     cfg_file = None
            # TODO @wenmeng.zwm use exception instead of
            # following tricky logic
            cfg_file = model_file_download(model, CONFIGFILE)
            with open(cfg_file, 'r') as infile:
                cfg = json.load(infile)
            if 'Code' in cfg:
                try:
                    cfg_file = model_file_download(model, ModelFile.CONFIGURATION)
                    return is_config_has_model(cfg_file)
                except Exception:
                    return False
            else:
                return True

    if isinstance(model, str):
        return is_model_name_impl(model)
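A short usage sketch of the helper above (both candidates are hypothetical): a local path qualifies when it contains a configuration.json that parses and declares a model section; a hub id qualifies when that file can be downloaded and parsed.

for candidate in ('/path/to/local/model_dir', 'damo/some-model-id'):
    if is_model_name(candidate):
        print(f'{candidate}: usable as a model name or model directory')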
@@ -1,10 +1,10 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from .audio import LinearAECAndFbank
from .base import Preprocessor
from .builder import PREPROCESSORS, build_preprocessor
from .common import Compose
from .image import LoadImage, load_image
from .nlp import *  # noqa F403
from .nlp import TextGenerationPreprocessor
from .space.dialog_intent_prediction_preprocessor import *  # noqa F403
from .space.dialog_modeling_preprocessor import *  # noqa F403
230
modelscope/preprocessors/audio.py
Normal file
@@ -0,0 +1,230 @@
import ctypes
import os
from typing import Any, Dict

import numpy as np
import scipy.io.wavfile as wav
import torch
import torchaudio.compliance.kaldi as kaldi
from numpy.ctypeslib import ndpointer

from modelscope.utils.constant import Fields
from .builder import PREPROCESSORS


def load_wav(path):
    samp_rate, data = wav.read(path)
    return np.float32(data), samp_rate


def load_library(libaec):
    libaec_in_cwd = os.path.join('.', libaec)
    if os.path.exists(libaec_in_cwd):
        libaec = libaec_in_cwd
    mitaec = ctypes.cdll.LoadLibrary(libaec)
    fe_process = mitaec.fe_process_inst
    fe_process.argtypes = [
        ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
        ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'), ctypes.c_int,
        ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
        ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
        ndpointer(ctypes.c_float, flags='C_CONTIGUOUS')
    ]
    return fe_process


def do_linear_aec(fe_process, mic, ref, int16range=True):
    mic = np.float32(mic)
    ref = np.float32(ref)
    if len(mic) > len(ref):
        mic = mic[:len(ref)]
    out_mic = np.zeros_like(mic)
    out_linear = np.zeros_like(mic)
    out_echo = np.zeros_like(mic)
    out_ref = np.zeros_like(mic)
    if int16range:
        mic /= 32768
        ref /= 32768
    fe_process(mic, ref, len(mic), out_mic, out_linear, out_echo)
    # out_ref not in use here
    if int16range:
        out_mic *= 32768
        out_linear *= 32768
        out_echo *= 32768
    return out_mic, out_ref, out_linear, out_echo
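# A minimal, hypothetical usage sketch of the three helpers above; the library
# name and wav paths are assumptions rather than shipped defaults:
#
#     fe_process = load_library('libmitaec_pyio.so')
#     mic, fs = load_wav('nearend_mic.wav')
#     ref, _ = load_wav('farend_speech.wav')
#     out_mic, _, out_linear, out_echo = do_linear_aec(fe_process, mic, ref)
#     # out_linear holds the linearly filtered (echo-reduced) mic signal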
def load_kaldi_feature_transform(filename):
    fp = open(filename, 'r')
    all_str = fp.read()
    pos1 = all_str.find('AddShift')
    pos2 = all_str.find('[', pos1)
    pos3 = all_str.find(']', pos2)
    mean = np.fromstring(all_str[pos2 + 1:pos3], dtype=np.float32, sep=' ')
    pos1 = all_str.find('Rescale')
    pos2 = all_str.find('[', pos1)
    pos3 = all_str.find(']', pos2)
    scale = np.fromstring(all_str[pos2 + 1:pos3], dtype=np.float32, sep=' ')
    fp.close()
    return mean, scale


class Feature:
    r"""Extract features from one utterance.
    """

    def __init__(self,
                 fbank_config,
                 feat_type='spec',
                 mvn_file=None,
                 cuda=False):
        r"""

        Args:
            fbank_config (dict): kwargs for kaldi.fbank; frame_length and
                frame_shift (in ms) together with sample_frequency are also
                used to derive the STFT parameters
            feat_type (str):
                raw: do nothing
                fbank: use kaldi.fbank
                spec: Real/Imag
                logpow: log(1+|x|^2)
            mvn_file (str): the path of the data file for mean variance normalization
            cuda (bool): move the window and mvn tensors to GPU
        """
        self.fbank_config = fbank_config
        self.feat_type = feat_type
        self.n_fft = fbank_config['frame_length'] * fbank_config[
            'sample_frequency'] // 1000
        self.hop_length = fbank_config['frame_shift'] * fbank_config[
            'sample_frequency'] // 1000
        self.window = torch.hamming_window(self.n_fft, periodic=False)

        self.mvn = False
        if mvn_file is not None and os.path.exists(mvn_file):
            print(f'loading mvn file: {mvn_file}')
            shift, scale = load_kaldi_feature_transform(mvn_file)
            self.shift = torch.from_numpy(shift)
            self.scale = torch.from_numpy(scale)
            self.mvn = True
        if cuda:
            self.window = self.window.cuda()
            if self.mvn:
                self.shift = self.shift.cuda()
                self.scale = self.scale.cuda()

    def compute(self, utt):
        r"""

        Args:
            utt: in [-32768, 32767] range

        Returns:
            [..., T, F]
        """
        if self.feat_type == 'raw':
            return utt
        elif self.feat_type == 'fbank':
            if len(utt.shape) == 1:
                utt = utt.unsqueeze(0)
            feat = kaldi.fbank(utt, **self.fbank_config)
        elif self.feat_type == 'spec':
            spec = torch.stft(
                utt / 32768,
                self.n_fft,
                self.hop_length,
                self.n_fft,
                self.window,
                center=False,
                return_complex=True)
            feat = torch.cat([spec.real, spec.imag], dim=-2).permute(-1, -2)
        elif self.feat_type == 'logpow':
            spec = torch.stft(
                utt,
                self.n_fft,
                self.hop_length,
                self.n_fft,
                self.window,
                center=False,
                return_complex=True)
            abspow = torch.abs(spec)**2
            feat = torch.log(1 + abspow).permute(-1, -2)
        return feat

    def normalize(self, feat):
        if self.mvn:
            feat = feat + self.shift
            feat = feat * self.scale
        return feat
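# A minimal sketch of using Feature on its own, with a hypothetical fbank
# config; frame_length/frame_shift are in milliseconds and, together with
# sample_frequency, determine the n_fft and hop_length computed above:
#
#     fbank_config = dict(num_mel_bins=80, frame_length=25, frame_shift=10,
#                         sample_frequency=16000)
#     feature = Feature(fbank_config, feat_type='fbank')
#     utt = torch.randn(16000) * 32768                # int16-range audio
#     feat = feature.normalize(feature.compute(utt))  # -> [T, num_mel_bins]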
@PREPROCESSORS.register_module(Fields.audio)
class LinearAECAndFbank:
    SAMPLE_RATE = 16000

    def __init__(self, io_config):
        self.trunc_length = 7200 * self.SAMPLE_RATE
        self.linear_aec_delay = io_config['linear_aec_delay']
        self.feature = Feature(io_config['fbank_config'],
                               io_config['feat_type'], io_config['mvn'])
        self.mitaec = load_library(io_config['mitaec_library'])
        self.mask_on_mic = io_config['mask_on'] == 'nearend_mic'

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Linearly filter the near-end mic and far-end audio, then extract the feature.

        :param data: dict with two keys and the corresponding audio files: "nearend_mic" and "farend_speech"
        :return: dict with Tensor values: "base" (the linearly filtered audio), "target" and "feature"
        """
        # read files
        nearend_mic, fs = load_wav(data['nearend_mic'])
        assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
        farend_speech, fs = load_wav(data['farend_speech'])
        assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
        if 'nearend_speech' in data:
            nearend_speech, fs = load_wav(data['nearend_speech'])
            assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
        else:
            nearend_speech = np.zeros_like(nearend_mic)

        out_mic, out_ref, out_linear, out_echo = do_linear_aec(
            self.mitaec, nearend_mic, farend_speech)
        # fix 20ms linear aec delay by delaying the target speech
        extra_zeros = np.zeros([int(self.linear_aec_delay * fs)])
        nearend_speech = np.concatenate([extra_zeros, nearend_speech])
        # truncate files to the same length
        flen = min(
            len(out_mic), len(out_ref), len(out_linear), len(out_echo),
            len(nearend_speech))
        fstart = 0
        flen = min(flen, self.trunc_length)
        nearend_mic, out_ref, out_linear, out_echo, nearend_speech = (
            out_mic[fstart:flen], out_ref[fstart:flen],
            out_linear[fstart:flen], out_echo[fstart:flen],
            nearend_speech[fstart:flen])

        # extract features (frames, [mic, linear, ref, aes?])
        feat = torch.FloatTensor()

        nearend_mic = torch.from_numpy(np.float32(nearend_mic))
        fbank_nearend_mic = self.feature.compute(nearend_mic)
        feat = torch.cat([feat, fbank_nearend_mic], dim=1)

        out_linear = torch.from_numpy(np.float32(out_linear))
        fbank_out_linear = self.feature.compute(out_linear)
        feat = torch.cat([feat, fbank_out_linear], dim=1)

        out_echo = torch.from_numpy(np.float32(out_echo))
        fbank_out_echo = self.feature.compute(out_echo)
        feat = torch.cat([feat, fbank_out_echo], dim=1)

        # feature transform
        feat = self.feature.normalize(feat)

        # prepare target
        if nearend_speech is not None:
            nearend_speech = torch.from_numpy(np.float32(nearend_speech))

        if self.mask_on_mic:
            base = nearend_mic
        else:
            base = out_linear
        out_data = {'base': base, 'target': nearend_speech, 'feature': feat}
        return out_data
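A minimal usage sketch of the preprocessor above, with a hypothetical io_config (real configurations ship with the model; the wav files are the AEC samples used in the tests further below):

io_config = {
    'linear_aec_delay': 0.02,  # seconds, see the delay compensation above
    'fbank_config': dict(num_mel_bins=80, frame_length=25, frame_shift=10,
                         sample_frequency=16000),
    'feat_type': 'fbank',
    'mvn': None,
    'mitaec_library': 'libmitaec_pyio.so',
    'mask_on': 'nearend_mic',
}
preprocessor = LinearAECAndFbank(io_config)
out = preprocessor({'nearend_mic': 'nearend_mic.wav',
                    'farend_speech': 'farend_speech.wav'})
print(out['base'].shape, out['feature'].shape)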
@@ -9,7 +9,7 @@ from modelscope.utils.constant import Fields
from .builder import PREPROCESSORS


@PREPROCESSORS.register_module(Fields.image)
@PREPROCESSORS.register_module(Fields.cv)
class LoadImage:
    """Load an image from file or url.
    Added or updated keys are "filename", "img", "img_shape",
@@ -11,8 +11,8 @@ from .base import Preprocessor
from .builder import PREPROCESSORS

__all__ = [
    'Tokenize',
    'SequenceClassificationPreprocessor',
    'Tokenize', 'SequenceClassificationPreprocessor',
    'TextGenerationPreprocessor', 'TokenClassifcationPreprocessor'
]


@@ -31,7 +31,7 @@ class Tokenize(Preprocessor):


@PREPROCESSORS.register_module(
    Fields.nlp, module_name=r'bert-sentiment-analysis')
    Fields.nlp, module_name=r'bert-sequence-classification')
class SequenceClassificationPreprocessor(Preprocessor):

    def __init__(self, model_dir: str, *args, **kwargs):
@@ -51,21 +51,42 @@ class SequenceClassificationPreprocessor(Preprocessor):
        self.sequence_length = kwargs.pop('sequence_length', 128)

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
        print(f'this is the tokenizer {self.tokenizer}')

    @type_assert(object, str)
    def __call__(self, data: str) -> Dict[str, Any]:
    @type_assert(object, (str, tuple))
    def __call__(self, data: Union[str, tuple]) -> Dict[str, Any]:
        """Process the raw input data.

        Args:
            data (str): a sentence
                Example:
                    'you are so handsome.'
            data (str or tuple):
                sentence1 (str): a sentence
                    Example:
                        'you are so handsome.'
                or
                (sentence1, sentence2)
                    sentence1 (str): a sentence
                        Example:
                            'you are so handsome.'
                    sentence2 (str): a sentence
                        Example:
                            'you are so beautiful.'

        Returns:
            Dict[str, Any]: the preprocessed data
        """

        new_data = {self.first_sequence: data}
        if not isinstance(data, tuple):
            data = (
                data,
                None,
            )

        sentence1, sentence2 = data
        new_data = {
            self.first_sequence: sentence1,
            self.second_sequence: sentence2
        }

        # preprocess the data for the model input

        rst = {
@@ -94,17 +115,15 @@ class SequenceClassificationPreprocessor(Preprocessor):
        return rst
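# A short usage sketch of the single- and two-sentence inputs accepted above
# (the model_dir path is hypothetical; real ones come from snapshot_download):
#
#     preprocessor = SequenceClassificationPreprocessor('/path/to/model_dir')
#     single = preprocessor('今天天气不错')                    # one sentence
#     pair = preprocessor(('今天气温高么?', '今天湿度高么?'))  # sentence pair
#     # both return the preprocessed model inputs as a dict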
@PREPROCESSORS.register_module(Fields.nlp, module_name=r'palm')
@PREPROCESSORS.register_module(Fields.nlp, module_name=r'palm2.0')
class TextGenerationPreprocessor(Preprocessor):

    def __init__(self, model_dir: str, *args, **kwargs):
    def __init__(self, model_dir: str, tokenizer, *args, **kwargs):
        """Preprocess the data using the vocab.txt from the `model_dir` path.

        Args:
            model_dir (str): model path
        """
        from sofa import PalmTokenizer

        super().__init__(*args, **kwargs)

        self.model_dir: str = model_dir
@@ -113,7 +132,7 @@ class TextGenerationPreprocessor(Preprocessor):
        self.second_sequence: str = kwargs.pop('second_sequence',
                                               'second_sequence')
        self.sequence_length: int = kwargs.pop('sequence_length', 128)
        self.tokenizer = PalmTokenizer.from_pretrained(model_dir)
        self.tokenizer = tokenizer

    @type_assert(object, str)
    def __call__(self, data: str) -> Dict[str, Any]:
@@ -132,7 +151,7 @@ class TextGenerationPreprocessor(Preprocessor):
        new_data = {self.first_sequence: data}
        # preprocess the data for the model input

        rst = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []}
        rst = {'input_ids': [], 'attention_mask': []}

        max_seq_length = self.sequence_length

@@ -147,6 +166,53 @@ class TextGenerationPreprocessor(Preprocessor):

        rst['input_ids'].append(feature['input_ids'])
        rst['attention_mask'].append(feature['attention_mask'])
        rst['token_type_ids'].append(feature['token_type_ids'])

        return {k: torch.tensor(v) for k, v in rst.items()}


@PREPROCESSORS.register_module(
    Fields.nlp, module_name=r'bert-token-classification')
class TokenClassifcationPreprocessor(Preprocessor):

    def __init__(self, model_dir: str, *args, **kwargs):
        """Preprocess the data via the vocab.txt from the `model_dir` path.

        Args:
            model_dir (str): model path
        """

        super().__init__(*args, **kwargs)

        from sofa import SbertTokenizer
        self.model_dir: str = model_dir
        self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir)

    @type_assert(object, str)
    def __call__(self, data: str) -> Dict[str, Any]:
        """Process the raw input data.

        Args:
            data (str): a sentence
                Example:
                    'you are so handsome.'

        Returns:
            Dict[str, Any]: the preprocessed data
        """
        # preprocess the data for the model input

        text = data.replace(' ', '').strip()
        tokens = []
        for token in text:
            token = self.tokenizer.tokenize(token)
            tokens.extend(token)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_ids = self.tokenizer.build_inputs_with_special_tokens(input_ids)
        attention_mask = [1] * len(input_ids)
        token_type_ids = [0] * len(input_ids)
        return {
            'text': text,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids
        }
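The preprocessor above tokenizes character by character, matching the per-character labels the word-segmentation postprocessing expects. A small sketch of its output shape (the model_dir path is hypothetical):

preprocessor = TokenClassifcationPreprocessor('/path/to/model_dir')
out = preprocessor('今天 天气 不错')
# 'text' is the whitespace-stripped input; input_ids carries one id per
# character plus the special tokens added by build_inputs_with_special_tokens
assert out['text'] == '今天天气不错'
assert len(out['input_ids']) == len(out['attention_mask'])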
@@ -74,17 +74,17 @@ class Config:
        {'c': [1, 2, 3], 'd': 'dd'}
        >>> cfg.b.d
        'dd'
        >>> cfg = Config.from_file('configs/examples/config.json')
        >>> cfg = Config.from_file('configs/examples/configuration.json')
        >>> cfg.filename
        'configs/examples/config.json'
        'configs/examples/configuration.json'
        >>> cfg.b
        {'c': [1, 2, 3], 'd': 'dd'}
        >>> cfg = Config.from_file('configs/examples/config.py')
        >>> cfg = Config.from_file('configs/examples/configuration.py')
        >>> cfg.filename
        "configs/examples/config.py"
        >>> cfg = Config.from_file('configs/examples/config.yaml')
        "configs/examples/configuration.py"
        >>> cfg = Config.from_file('configs/examples/configuration.yaml')
        >>> cfg.filename
        "configs/examples/config.yaml"
        "configs/examples/configuration.yaml"
    """

    @staticmethod
@@ -4,8 +4,8 @@
class Fields(object):
    """ Names for different application fields
    """
    image = 'image'
    video = 'video'
    # image = 'image'
    # video = 'video'
    cv = 'cv'
    nlp = 'nlp'
    audio = 'audio'
@@ -30,7 +30,9 @@ class Tasks(object):
    image_matting = 'image-matting'

    # nlp tasks
    word_segmentation = 'word-segmentation'
    sentiment_analysis = 'sentiment-analysis'
    sentence_similarity = 'sentence-similarity'
    text_classification = 'text-classification'
    relation_extraction = 'relation-extraction'
    zero_shot = 'zero-shot'
@@ -52,7 +54,7 @@ class Tasks(object):
    text_to_speech = 'text-to-speech'
    speech_signal_process = 'speech-signal-process'

    # multi-media
    # multi-modal tasks
    image_captioning = 'image-captioning'
    visual_grounding = 'visual-grounding'
    text_to_image_synthesis = 'text-to-image-synthesis'
@@ -73,16 +75,16 @@ class Hubs(object):
    huggingface = 'huggingface'


# configuration filename
# in order to avoid conflict with huggingface
# config file we use maas_config instead
CONFIGFILE = 'maas_config.json'
class ModelFile(object):
    CONFIGURATION = 'configuration.json'
    README = 'README.md'
    TF_SAVED_MODEL_FILE = 'saved_model.pb'
    TF_GRAPH_FILE = 'tf_graph.pb'
    TF_CHECKPOINT_FOLDER = 'tf_ckpts'
    TF_CKPT_PREFIX = 'ckpt-'
    TORCH_MODEL_FILE = 'pytorch_model.pt'
    TORCH_MODEL_BIN_FILE = 'pytorch_model.bin'


README_FILE = 'README.md'
TF_SAVED_MODEL_FILE = 'saved_model.pb'
TF_GRAPH_FILE = 'tf_graph.pb'
TF_CHECKPOINT_FOLDER = 'tf_ckpts'
TF_CHECKPOINT_FILE = 'checkpoint'
TORCH_MODEL_FILE = 'pytorch_model.bin'

TENSORFLOW = 'tensorflow'
PYTORCH = 'pytorch'
@@ -1,7 +1,6 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import inspect
from email.policy import default

from modelscope.utils.logger import get_logger

@@ -70,6 +69,7 @@ class Registry(object):
                             f'{self._name}[{group_key}]')

        self._modules[group_key][module_name] = module_cls
        module_cls.group_key = group_key

        if module_name in self._modules[default_group]:
            if id(self._modules[default_group][module_name]) == id(module_cls):
20
modelscope/utils/test_utils.py
Normal file
@@ -0,0 +1,20 @@
#!/usr/bin/env python
# Copyright (c) Alibaba, Inc. and its affiliates.

import os

TEST_LEVEL = 2
TEST_LEVEL_STR = 'TEST_LEVEL'


def test_level():
    global TEST_LEVEL
    if TEST_LEVEL_STR in os.environ:
        TEST_LEVEL = int(os.environ[TEST_LEVEL_STR])

    return TEST_LEVEL


def set_test_level(level: int):
    global TEST_LEVEL
    TEST_LEVEL = level
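The two helpers above gate test cases by level; the pattern used throughout the test files below looks like this (the test class is illustrative):

import unittest

from modelscope.utils.test_utils import test_level


class ExampleTest(unittest.TestCase):

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_expensive_path(self):
        pass  # runs only when TEST_LEVEL (env var or the default) is >= 1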
@@ -1,6 +1,7 @@
docutils==0.16.0
recommonmark
sphinx==4.0.2
sphinx-book-theme
sphinx-copybutton
sphinx_markdown_tables
sphinx_rtd_theme==0.5.2

@@ -1 +1 @@
https://alinlp.alibaba-inc.com/pypi/sofa-1.0.1.3-py3-none-any.whl
https://alinlp.alibaba-inc.com/pypi/sofa-1.0.2-py3-none-any.whl
@@ -1,12 +1,13 @@
addict
datasets
easydict
https://maashub.oss-cn-hangzhou.aliyuncs.com/releases/maas_hub-0.1.0.dev0-py2.py3-none-any.whl
https://mindscope.oss-cn-hangzhou.aliyuncs.com/sdklib/maas_hub-0.2.4.dev0-py3-none-any.whl
numpy
opencv-python-headless
Pillow
Pillow>=6.2.0
pyyaml
requests
scipy
tokenizers<=0.10.3
transformers<=4.16.2
yapf
@@ -11,6 +11,7 @@ default_section = THIRDPARTY
BASED_ON_STYLE = pep8
BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true
SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true
SPLIT_BEFORE_ARITHMETIC_OPERATOR = true

[codespell]
skip = *.ipynb
@@ -20,5 +21,5 @@ ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids
[flake8]
select = B,C,E,F,P,T4,W,B9
max-line-length = 120
ignore = F401,F821
ignore = F401,F821,W503
exclude = docs/src,*.pyi,.git
@@ -35,9 +35,10 @@ class CustomPipelineTest(unittest.TestCase):
            CustomPipeline1()

    def test_custom(self):
        dummy_task = 'dummy-task'

        @PIPELINES.register_module(
            group_key=Tasks.image_tagging, module_name='custom-image')
            group_key=dummy_task, module_name='custom-image')
        class CustomImagePipeline(Pipeline):

            def __init__(self,
@@ -67,32 +68,28 @@ class CustomPipelineTest(unittest.TestCase):
                outputs['filename'] = inputs['url']
                img = inputs['img']
                new_image = img.resize((img.width // 2, img.height // 2))
                outputs['resize_image'] = np.array(new_image)
                outputs['dummy_result'] = 'dummy_result'
                outputs['output_png'] = np.array(new_image)
                return outputs

            def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
                return inputs

        self.assertTrue('custom-image' in PIPELINES.modules[default_group])
        add_default_pipeline_info(Tasks.image_tagging, 'custom-image')
        add_default_pipeline_info(dummy_task, 'custom-image', overwrite=True)
        pipe = pipeline(pipeline_name='custom-image')
        pipe2 = pipeline(Tasks.image_tagging)
        pipe2 = pipeline(dummy_task)
        self.assertTrue(type(pipe) is type(pipe2))

        img_url = 'http://pai-vision-data-hz.oss-cn-zhangjiakou.' \
            'aliyuncs.com/data/test/images/image1.jpg'
        img_url = 'data/test/images/image1.jpg'
        output = pipe(img_url)
        self.assertEqual(output['filename'], img_url)
        self.assertEqual(output['resize_image'].shape, (318, 512, 3))
        self.assertEqual(output['dummy_result'], 'dummy_result')
        self.assertEqual(output['output_png'].shape, (318, 512, 3))

        outputs = pipe([img_url for i in range(4)])
        self.assertEqual(len(outputs), 4)
        for out in outputs:
            self.assertEqual(out['filename'], img_url)
            self.assertEqual(out['resize_image'].shape, (318, 512, 3))
            self.assertEqual(out['dummy_result'], 'dummy_result')
            self.assertEqual(out['output_png'].shape, (318, 512, 3))


if __name__ == '__main__':
@@ -7,11 +7,12 @@ import unittest
from modelscope.fileio import File
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.test_utils import test_level


class ImageCaptionTest(unittest.TestCase):

    @unittest.skip('skip long test')
    @unittest.skip('skip before model is restored in model hub')
    def test_run(self):
        model = 'https://ofa-beijing.oss-cn-beijing.aliyuncs.com/checkpoints/caption_large_best_clean.pt'

@@ -26,9 +27,7 @@ class ImageCaptionTest(unittest.TestCase):
        img_captioning = pipeline(
            Tasks.image_captioning, model=ofile.name, bpe_dir=bpe_dir)

        result = img_captioning(
            'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
        )
        result = img_captioning('data/test/images/image_matting.png')
        print(result['caption'])
@@ -9,14 +9,15 @@ import cv2
from modelscope.fileio import File
from modelscope.pipelines import pipeline
from modelscope.pydatasets import PyDataset
from modelscope.utils.constant import Tasks
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.hub import get_model_cache_dir
from modelscope.utils.test_utils import test_level


class ImageMattingTest(unittest.TestCase):

    def setUp(self) -> None:
        self.model_id = 'damo/cv_unet_image-matting_damo'
        self.model_id = 'damo/cv_unet_image-matting'
        # switch to False if downloading every time is not desired
        purge_cache = True
        if purge_cache:
@@ -28,20 +29,17 @@ class ImageMattingTest(unittest.TestCase):
        model_path = 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs' \
            '.com/data/test/maas/image_matting/matting_person.pb'
        with tempfile.TemporaryDirectory() as tmp_dir:
            model_file = osp.join(tmp_dir, 'matting_person.pb')
            model_file = osp.join(tmp_dir, ModelFile.TF_GRAPH_FILE)
            with open(model_file, 'wb') as ofile:
                ofile.write(File.read(model_path))
            img_matting = pipeline(Tasks.image_matting, model=tmp_dir)

            result = img_matting(
                'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
            )
            result = img_matting('data/test/images/image_matting.png')
            cv2.imwrite('result.png', result['output_png'])

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_with_dataset(self):
        input_location = [
            'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
        ]
        input_location = ['data/test/images/image_matting.png']
        # alternatively:
        # input_location = '/dir/to/images'

@@ -52,21 +50,19 @@ class ImageMattingTest(unittest.TestCase):
        cv2.imwrite('result.png', next(result)['output_png'])
        print(f'Output written to {osp.abspath("result.png")}')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_modelhub(self):
        img_matting = pipeline(Tasks.image_matting, model=self.model_id)

        result = img_matting(
            'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
        )
        result = img_matting('data/test/images/image_matting.png')
        cv2.imwrite('result.png', result['output_png'])
        print(f'Output written to {osp.abspath("result.png")}')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_modelhub_default_model(self):
        img_matting = pipeline(Tasks.image_matting)

        result = img_matting(
            'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
        )
        result = img_matting('data/test/images/image_matting.png')
        cv2.imwrite('result.png', result['output_png'])
        print(f'Output written to {osp.abspath("result.png")}')
@@ -8,6 +8,7 @@ import cv2
from modelscope.pipelines import pipeline
from modelscope.pipelines.base import Pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.test_utils import test_level


class ImageCartoonTest(unittest.TestCase):
@@ -36,10 +37,12 @@ class ImageCartoonTest(unittest.TestCase):
        img_cartoon = pipeline(Tasks.image_generation, model=model_dir)
        self.pipeline_inference(img_cartoon, self.test_image)

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_modelhub(self):
        img_cartoon = pipeline(Tasks.image_generation, model=self.model_id)
        self.pipeline_inference(img_cartoon, self.test_image)

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_modelhub_default_model(self):
        img_cartoon = pipeline(Tasks.image_generation)
        self.pipeline_inference(img_cartoon, self.test_image)
67
tests/pipelines/test_sentence_similarity.py
Normal file
@@ -0,0 +1,67 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import shutil
import unittest

from maas_hub.snapshot_download import snapshot_download

from modelscope.models import Model
from modelscope.models.nlp import SbertForSentenceSimilarity
from modelscope.pipelines import SentenceSimilarityPipeline, pipeline
from modelscope.preprocessors import SequenceClassificationPreprocessor
from modelscope.utils.constant import Tasks
from modelscope.utils.hub import get_model_cache_dir
from modelscope.utils.test_utils import test_level


class SentenceSimilarityTest(unittest.TestCase):
    model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
    sentence1 = '今天气温比昨天高么?'
    sentence2 = '今天湿度比昨天高么?'

    def setUp(self) -> None:
        # switch to False if downloading every time is not desired
        purge_cache = True
        if purge_cache:
            shutil.rmtree(
                get_model_cache_dir(self.model_id), ignore_errors=True)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run(self):
        cache_path = snapshot_download(self.model_id)
        tokenizer = SequenceClassificationPreprocessor(cache_path)
        model = SbertForSentenceSimilarity(cache_path, tokenizer=tokenizer)
        pipeline1 = SentenceSimilarityPipeline(model, preprocessor=tokenizer)
        pipeline2 = pipeline(
            Tasks.sentence_similarity, model=model, preprocessor=tokenizer)
        print('test1')
        print(f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n'
              f'pipeline1:{pipeline1(input=(self.sentence1, self.sentence2))}')
        print()
        print(
            f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n'
            f'pipeline2: {pipeline2(input=(self.sentence1, self.sentence2))}')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_model_from_modelhub(self):
        model = Model.from_pretrained(self.model_id)
        tokenizer = SequenceClassificationPreprocessor(model.model_dir)
        pipeline_ins = pipeline(
            task=Tasks.sentence_similarity,
            model=model,
            preprocessor=tokenizer)
        print(pipeline_ins(input=(self.sentence1, self.sentence2)))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_model_name(self):
        pipeline_ins = pipeline(
            task=Tasks.sentence_similarity, model=self.model_id)
        print(pipeline_ins(input=(self.sentence1, self.sentence2)))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_default_model(self):
        pipeline_ins = pipeline(task=Tasks.sentence_similarity)
        print(pipeline_ins(input=(self.sentence1, self.sentence2)))


if __name__ == '__main__':
    unittest.main()
56
tests/pipelines/test_speech_signal_process.py
Normal file
@@ -0,0 +1,56 @@
import os.path
import shutil
import unittest

from modelscope.fileio import File
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.hub import get_model_cache_dir

NEAREND_MIC_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/AEC/sample_audio/nearend_mic.wav'
FAREND_SPEECH_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/AEC/sample_audio/farend_speech.wav'
NEAREND_MIC_FILE = 'nearend_mic.wav'
FAREND_SPEECH_FILE = 'farend_speech.wav'

AEC_LIB_URL = 'http://isv-data.oss-cn-hangzhou.aliyuncs.com/ics%2FMaaS%2FAEC%2Flib%2Flibmitaec_pyio.so' \
    '?Expires=1664085465&OSSAccessKeyId=LTAIxjQyZNde90zh&Signature=Y7gelmGEsQAJRK4yyHSYMrdWizk%3D'
AEC_LIB_FILE = 'libmitaec_pyio.so'


def download(remote_path, local_path):
    local_dir = os.path.dirname(local_path)
    if len(local_dir) > 0:
        if not os.path.exists(local_dir):
            os.makedirs(local_dir)
    with open(local_path, 'wb') as ofile:
        ofile.write(File.read(remote_path))


class SpeechSignalProcessTest(unittest.TestCase):

    def setUp(self) -> None:
        self.model_id = 'damo/speech_dfsmn_aec_psm_16k'
        # switch to False if downloading every time is not desired
        purge_cache = True
        if purge_cache:
            shutil.rmtree(
                get_model_cache_dir(self.model_id), ignore_errors=True)
        # A temporary hack to provide the C++ lib. Download it first.
        download(AEC_LIB_URL, AEC_LIB_FILE)

    def test_run(self):
        download(NEAREND_MIC_URL, NEAREND_MIC_FILE)
        download(FAREND_SPEECH_URL, FAREND_SPEECH_FILE)
        input = {
            'nearend_mic': NEAREND_MIC_FILE,
            'farend_speech': FAREND_SPEECH_FILE
        }
        aec = pipeline(
            Tasks.speech_signal_process,
            model=self.model_id,
            pipeline_name=r'speech_dfsmn_aec_psm_16k')
        aec(input, output_path='output.wav')


if __name__ == '__main__':
    unittest.main()
@@ -12,6 +12,7 @@ from modelscope.preprocessors import SequenceClassificationPreprocessor
from modelscope.pydatasets import PyDataset
from modelscope.utils.constant import Hubs, Tasks
from modelscope.utils.hub import get_model_cache_dir
from modelscope.utils.test_utils import test_level


class SequenceClassificationTest(unittest.TestCase):
@@ -43,6 +44,7 @@ class SequenceClassificationTest(unittest.TestCase):
                break
            print(r)

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run(self):
        model_url = 'https://atp-modelzoo-sh.oss-cn-shanghai.aliyuncs.com' \
            '/release/easynlp_modelzoo/alibaba-pai/bert-base-sst2.zip'
@@ -67,6 +69,7 @@ class SequenceClassificationTest(unittest.TestCase):
            Tasks.text_classification, model=model, preprocessor=preprocessor)
        print(pipeline2('Hello world!'))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_model_from_modelhub(self):
        model = Model.from_pretrained(self.model_id)
        preprocessor = SequenceClassificationPreprocessor(
@@ -77,6 +80,7 @@ class SequenceClassificationTest(unittest.TestCase):
            preprocessor=preprocessor)
        self.predict(pipeline_ins)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_model_name(self):
        text_classification = pipeline(
            task=Tasks.text_classification, model=self.model_id)
@@ -85,6 +89,7 @@ class SequenceClassificationTest(unittest.TestCase):
            'glue', name='sst2', target='sentence', hub=Hubs.huggingface))
        self.printDataset(result)

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_with_default_model(self):
        text_classification = pipeline(task=Tasks.text_classification)
        result = text_classification(
@@ -92,6 +97,7 @@ class SequenceClassificationTest(unittest.TestCase):
            'glue', name='sst2', target='sentence', hub=Hubs.huggingface))
        self.printDataset(result)

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_with_dataset(self):
        model = Model.from_pretrained(self.model_id)
        preprocessor = SequenceClassificationPreprocessor(
@@ -4,47 +4,75 @@ import unittest
from maas_hub.snapshot_download import snapshot_download

from modelscope.models import Model
from modelscope.models.nlp import PalmForTextGenerationModel
from modelscope.models.nlp import PalmForTextGeneration
from modelscope.pipelines import TextGenerationPipeline, pipeline
from modelscope.preprocessors import TextGenerationPreprocessor
from modelscope.utils.constant import Tasks
from modelscope.utils.test_utils import test_level


class TextGenerationTest(unittest.TestCase):
    model_id = 'damo/nlp_palm_text-generation_chinese'
    input1 = "今日天气类型='晴'&温度变化趋势='大幅上升'&最低气温='28℃'&最高气温='31℃'&体感='湿热'"
    input2 = "今日天气类型='多云'&体感='舒适'&最低气温='26℃'&最高气温='30℃'"
    model_id_zh = 'damo/nlp_palm2.0_text-generation_chinese-base'
    model_id_en = 'damo/nlp_palm2.0_text-generation_english-base'
    input_zh = """
    本文总结了十个可穿戴产品的设计原则,而这些原则,同样也是笔者认为是这个行业最吸引人的地方:
    1.为人们解决重复性问题;2.从人开始,而不是从机器开始;3.要引起注意,但不要刻意;4.提升用户能力,而不是取代
    """
    input_en = """
    The Director of Public Prosecutions who let off Lord Janner over alleged child sex abuse started
    her career at a legal chambers when the disgraced Labour peer was a top QC there . Alison Saunders ,
    54 , sparked outrage last week when she decided the 86-year-old should not face a string of charges
    of paedophilia against nine children because he has dementia . Today , newly-released documents
    revealed damning evidence that abuse was covered up by police and social workers for more than 20 years .
    And now it has emerged Mrs Saunders ' law career got off to a flying start when she secured her
    pupillage -- a barrister 's training contract at 1 Garden Court Chambers in London in 1983 .
    """

    @unittest.skip('skip temporarily to save test time')
    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run(self):
        cache_path = snapshot_download(self.model_id)
        preprocessor = TextGenerationPreprocessor(
            cache_path, first_sequence='sentence', second_sequence=None)
        model = PalmForTextGenerationModel(
            cache_path, tokenizer=preprocessor.tokenizer)
        pipeline1 = TextGenerationPipeline(model, preprocessor)
        pipeline2 = pipeline(
            Tasks.text_generation, model=model, preprocessor=preprocessor)
        print(f'input: {self.input1}\npipeline1: {pipeline1(self.input1)}')
        print()
        print(f'input: {self.input2}\npipeline2: {pipeline2(self.input2)}')
        for model_id, input in ((self.model_id_zh, self.input_zh),
                                (self.model_id_en, self.input_en)):
            cache_path = snapshot_download(model_id)
            model = PalmForTextGeneration(cache_path)
            preprocessor = TextGenerationPreprocessor(
                cache_path,
                model.tokenizer,
                first_sequence='sentence',
                second_sequence=None)
            pipeline1 = TextGenerationPipeline(model, preprocessor)
            pipeline2 = pipeline(
                Tasks.text_generation, model=model, preprocessor=preprocessor)
            print(
                f'pipeline1: {pipeline1(input)}\npipeline2: {pipeline2(input)}'
            )

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_with_model_from_modelhub(self):
        model = Model.from_pretrained(self.model_id)
        preprocessor = TextGenerationPreprocessor(
            model.model_dir, first_sequence='sentence', second_sequence=None)
        pipeline_ins = pipeline(
            task=Tasks.text_generation, model=model, preprocessor=preprocessor)
        print(pipeline_ins(self.input1))
        for model_id, input in ((self.model_id_zh, self.input_zh),
                                (self.model_id_en, self.input_en)):
            model = Model.from_pretrained(model_id)
            preprocessor = TextGenerationPreprocessor(
                model.model_dir,
                model.tokenizer,
                first_sequence='sentence',
                second_sequence=None)
            pipeline_ins = pipeline(
                task=Tasks.text_generation,
                model=model,
                preprocessor=preprocessor)
            print(pipeline_ins(input))

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_with_model_name(self):
        pipeline_ins = pipeline(
            task=Tasks.text_generation, model=self.model_id)
        print(pipeline_ins(self.input2))
        for model_id, input in ((self.model_id_zh, self.input_zh),
                                (self.model_id_en, self.input_en)):
            pipeline_ins = pipeline(task=Tasks.text_generation, model=model_id)
            print(pipeline_ins(input))

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_with_default_model(self):
        pipeline_ins = pipeline(task=Tasks.text_generation)
        print(pipeline_ins(self.input2))
        print(pipeline_ins(self.input_zh))


if __name__ == '__main__':
62
tests/pipelines/test_word_segmentation.py
Normal file
@@ -0,0 +1,62 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import shutil
import unittest

from maas_hub.snapshot_download import snapshot_download

from modelscope.models import Model
from modelscope.models.nlp import StructBertForTokenClassification
from modelscope.pipelines import WordSegmentationPipeline, pipeline
from modelscope.preprocessors import TokenClassifcationPreprocessor
from modelscope.utils.constant import Tasks
from modelscope.utils.hub import get_model_cache_dir
from modelscope.utils.test_utils import test_level


class WordSegmentationTest(unittest.TestCase):
    model_id = 'damo/nlp_structbert_word-segmentation_chinese-base'
    sentence = '今天天气不错,适合出去游玩'

    def setUp(self) -> None:
        # switch to False if downloading every time is not desired
        purge_cache = True
        if purge_cache:
            shutil.rmtree(
                get_model_cache_dir(self.model_id), ignore_errors=True)

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_by_direct_model_download(self):
        cache_path = snapshot_download(self.model_id)
        tokenizer = TokenClassifcationPreprocessor(cache_path)
        model = StructBertForTokenClassification(
            cache_path, tokenizer=tokenizer)
        pipeline1 = WordSegmentationPipeline(model, preprocessor=tokenizer)
        pipeline2 = pipeline(
            Tasks.word_segmentation, model=model, preprocessor=tokenizer)
        print(f'sentence: {self.sentence}\n'
              f'pipeline1:{pipeline1(input=self.sentence)}')
        print()
        print(f'pipeline2: {pipeline2(input=self.sentence)}')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_model_from_modelhub(self):
        model = Model.from_pretrained(self.model_id)
        tokenizer = TokenClassifcationPreprocessor(model.model_dir)
        pipeline_ins = pipeline(
            task=Tasks.word_segmentation, model=model, preprocessor=tokenizer)
        print(pipeline_ins(input=self.sentence))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_model_name(self):
        pipeline_ins = pipeline(
            task=Tasks.word_segmentation, model=self.model_id)
        print(pipeline_ins(input=self.sentence))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_default_model(self):
        pipeline_ins = pipeline(task=Tasks.word_segmentation)
        print(pipeline_ins(input=self.sentence))


if __name__ == '__main__':
    unittest.main()
20
tests/preprocessors/test_image.py
Normal file
@@ -0,0 +1,20 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import unittest

import PIL

from modelscope.preprocessors import load_image
from modelscope.utils.logger import get_logger


class ImagePreprocessorTest(unittest.TestCase):

    def test_load(self):
        img = load_image('data/test/images/image_matting.png')
        self.assertTrue(isinstance(img, PIL.Image.Image))
        self.assertEqual(img.size, (948, 533))


if __name__ == '__main__':
    unittest.main()
@@ -7,6 +7,11 @@ import sys
import unittest
from fnmatch import fnmatch

from modelscope.utils.logger import get_logger
from modelscope.utils.test_utils import set_test_level, test_level

logger = get_logger()


def gather_test_cases(test_dir, pattern, list_tests):
    case_list = []
@@ -49,5 +54,9 @@ if __name__ == '__main__':
        '--pattern', default='test_*.py', help='test file pattern')
    parser.add_argument(
        '--test_dir', default='tests', help='directory to be tested')
    parser.add_argument(
        '--level', default=0, help='2 -- all, 1 -- p1, 0 -- p0')
    args = parser.parse_args()
    set_test_level(int(args.level))
    logger.info(f'TEST LEVEL: {test_level()}')
    main(args)
@@ -1,11 +1,8 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import argparse
import os.path as osp
import tempfile
import unittest
from pathlib import Path

from modelscope.fileio import dump, load
from modelscope.utils.config import Config

obj = {'a': 1, 'b': {'c': [1, 2, 3], 'd': 'dd'}}
@@ -14,25 +11,25 @@ obj = {'a': 1, 'b': {'c': [1, 2, 3], 'd': 'dd'}}
class ConfigTest(unittest.TestCase):

    def test_json(self):
        config_file = 'configs/examples/config.json'
        config_file = 'configs/examples/configuration.json'
        cfg = Config.from_file(config_file)
        self.assertEqual(cfg.a, 1)
        self.assertEqual(cfg.b, obj['b'])

    def test_yaml(self):
        config_file = 'configs/examples/config.yaml'
        config_file = 'configs/examples/configuration.yaml'
        cfg = Config.from_file(config_file)
        self.assertEqual(cfg.a, 1)
        self.assertEqual(cfg.b, obj['b'])

    def test_py(self):
        config_file = 'configs/examples/config.py'
        config_file = 'configs/examples/configuration.py'
        cfg = Config.from_file(config_file)
        self.assertEqual(cfg.a, 1)
        self.assertEqual(cfg.b, obj['b'])

    def test_dump(self):
        config_file = 'configs/examples/config.py'
        config_file = 'configs/examples/configuration.py'
        cfg = Config.from_file(config_file)
        self.assertEqual(cfg.a, 1)
        self.assertEqual(cfg.b, obj['b'])
@@ -53,7 +50,7 @@ class ConfigTest(unittest.TestCase):
        self.assertEqual(yaml_str, infile.read())

    def test_to_dict(self):
        config_file = 'configs/examples/config.json'
        config_file = 'configs/examples/configuration.json'
        cfg = Config.from_file(config_file)
        d = cfg.to_dict()
        print(d)