Space intent and modeling (generation) are ready
3	.gitattributes vendored Normal file
@@ -0,0 +1,3 @@
*.png filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
*.mp4 filter=lfs diff=lfs merge=lfs -text
1	.gitignore vendored
@@ -104,7 +104,6 @@ venv.bak/
# mypy
.mypy_cache/

-data
.vscode
.idea
67	Makefile.docker Normal file
@@ -0,0 +1,67 @@
DOCKER_REGISTRY = registry.cn-shanghai.aliyuncs.com
DOCKER_ORG = modelscope
DOCKER_IMAGE = modelscope
DOCKER_FULL_NAME = $(DOCKER_REGISTRY)/$(DOCKER_ORG)/$(DOCKER_IMAGE)

# CUDA_VERSION = 11.3
# CUDNN_VERSION = 8
BASE_RUNTIME = reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04
# BASE_DEVEL = reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04
BASE_DEVEL = pytorch/pytorch:1.10.0-cuda11.3-cudnn8-devel


MODELSCOPE_VERSION = $(shell git describe --tags --always)

# Can be either official / dev
BUILD_TYPE = dev
BUILD_PROGRESS = auto
BUILD_ARGS = --build-arg BASE_IMAGE=$(BASE_IMAGE)

EXTRA_DOCKER_BUILD_FLAGS ?= --network=host
# DOCKER_BUILD = DOCKER_BUILDKIT=1 \
#	docker build \
#		--progress=$(BUILD_PROGRESS) \
#		$(EXTRA_DOCKER_BUILD_FLAGS) \
#		--target $(BUILD_TYPE) \
#		-t $(DOCKER_FULL_NAME):$(DOCKER_TAG) \
#		$(BUILD_ARGS) \
#		-f docker/pytorch.dockerfile .
DOCKER_BUILD = DOCKER_BUILDKIT=1 \
	docker build \
	$(EXTRA_DOCKER_BUILD_FLAGS) \
	-t $(DOCKER_FULL_NAME):$(DOCKER_TAG) \
	$(BUILD_ARGS) \
	-f docker/pytorch.dockerfile .
DOCKER_PUSH = docker push $(DOCKER_FULL_NAME):$(DOCKER_TAG)

.PHONY: all
all: devel-image

.PHONY: devel-image
devel-image: BASE_IMAGE := $(BASE_DEVEL)
devel-image: DOCKER_TAG := $(MODELSCOPE_VERSION)-devel
devel-image:
	$(DOCKER_BUILD)

.PHONY: devel-push
devel-push: BASE_IMAGE := $(BASE_DEVEL)
devel-push: DOCKER_TAG := $(MODELSCOPE_VERSION)-devel
devel-push:
	$(DOCKER_PUSH)

.PHONY: runtime-image
runtime-image: BASE_IMAGE := $(BASE_RUNTIME)
runtime-image: DOCKER_TAG := $(MODELSCOPE_VERSION)-runtime
runtime-image:
	$(DOCKER_BUILD)
	docker tag $(DOCKER_FULL_NAME):$(DOCKER_TAG) $(DOCKER_FULL_NAME):latest

.PHONY: runtime-push
runtime-push: BASE_IMAGE := $(BASE_RUNTIME)
runtime-push: DOCKER_TAG := $(MODELSCOPE_VERSION)-runtime
runtime-push:
	$(DOCKER_PUSH)

.PHONY: clean
clean:
	-docker rmi -f $(shell docker images -q $(DOCKER_FULL_NAME))
3	data/test/images/image1.jpg Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:78094cc48fbcfd9b6d321fe13619ecc72b65e006fc1b4c4458409ade9979486d
size 129862
3	data/test/images/image_matting.png Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:af83a94899a6d23339c3ecc5c4c58c57c835af57b531a2f4c50461184f820141
size 603621
4	docker/.dockerignore Normal file
@@ -0,0 +1,4 @@
*.sh
*.md
*.dockerfile
*.zip
53	docker/pytorch.dockerfile Normal file
@@ -0,0 +1,53 @@
# syntax = docker/dockerfile:experimental
#
# NOTE: To build this you will need a docker version > 18.06 with
#       experimental enabled and DOCKER_BUILDKIT=1
#
# If you do not use buildkit you are not going to have a good time
#
# For reference:
#     https://docs.docker.com/develop/develop-images/build_enhancements/

# ARG BASE_IMAGE=reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04
# FROM ${BASE_IMAGE} as dev-base

# FROM reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04 as dev-base
FROM pytorch/pytorch:1.10.0-cuda11.3-cudnn8-devel
# FROM pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime
# configure the pip source
RUN mkdir /root/.pip
COPY docker/rcfiles/pip.conf.tsinghua /root/.pip/pip.conf
COPY docker/rcfiles/sources.list.aliyun /etc/apt/sources.list

# Install essential Ubuntu packages
RUN apt-get update &&\
    apt-get install -y software-properties-common \
                       build-essential \
                       git \
                       wget \
                       vim \
                       curl \
                       zip \
                       zlib1g-dev \
                       unzip \
                       pkg-config

# install modelscope and its python env
WORKDIR /opt/modelscope
COPY . .
RUN pip install -r requirements.txt
# RUN --mount=type=cache,target=/opt/ccache \
#     python setup.py install

# opencv-python-headless conflicts with the opencv-python already installed
RUN python setup.py install \
    && pip uninstall -y opencv-python-headless

# prepare modelscope libs
COPY docker/scripts/install_libs.sh /tmp/
RUN bash /tmp/install_libs.sh && \
    rm -rf /tmp/install_libs.sh

ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/modelscope/lib64

WORKDIR /workspace
2	docker/rcfiles/pip.conf.tsinghua Normal file
@@ -0,0 +1,2 @@
[global]
index-url=https://pypi.tuna.tsinghua.edu.cn/simple
25	docker/rcfiles/sources.list.aliyun Normal file
@@ -0,0 +1,25 @@
deb http://mirrors.aliyun.com/ubuntu/ bionic main restricted
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic main restricted

deb http://mirrors.aliyun.com/ubuntu/ bionic-updates main restricted
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic-updates main restricted

deb http://mirrors.aliyun.com/ubuntu/ bionic universe
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic universe
deb http://mirrors.aliyun.com/ubuntu/ bionic-updates universe
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic-updates universe

deb http://mirrors.aliyun.com/ubuntu/ bionic multiverse
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic multiverse
deb http://mirrors.aliyun.com/ubuntu/ bionic-updates multiverse
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic-updates multiverse

deb http://mirrors.aliyun.com/ubuntu/ bionic-backports main restricted universe multiverse
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic-backports main restricted universe multiverse

deb http://mirrors.aliyun.com/ubuntu bionic-security main restricted
# deb-src http://mirrors.aliyun.com/ubuntu bionic-security main restricted
deb http://mirrors.aliyun.com/ubuntu bionic-security universe
# deb-src http://mirrors.aliyun.com/ubuntu bionic-security universe
deb http://mirrors.aliyun.com/ubuntu bionic-security multiverse
# deb-src http://mirrors.aliyun.com/ubuntu bionic-security multiverse
10	docker/rcfiles/user.vimrc Normal file
@@ -0,0 +1,10 @@
set nocompatible
set encoding=utf-8
set hlsearch
set smartindent
set ruler
set number
set ts=2
set sw=2
set expandtab
autocmd FileType make setlocal noexpandtab
12	docker/scripts/install_libs.sh Normal file
@@ -0,0 +1,12 @@
#!/bin/bash

set -eo pipefail

ModelScopeLib=/usr/local/modelscope/lib64

if [ ! -d /usr/local/modelscope ]; then
    mkdir -p $ModelScopeLib
fi

# audio libs
wget "http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/release/maas/libs/audio/libmitaec_pyio.so" -O ${ModelScopeLib}/libmitaec_pyio.so
@@ -76,7 +76,7 @@ exclude_patterns = ['build', 'Thumbs.db', '.DS_Store']
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
-html_theme = 'sphinx_rtd_theme'
+html_theme = 'sphinx_book_theme'
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
html_theme_options = {}
@@ -34,13 +34,111 @@ make linter
```

## 2. Test

-### 2.1 Unit test
+### 2.1 Test level

There are mainly three test levels:

* level 0: tests for the basic interfaces and functions of the framework, such as `tests/trainers/test_trainer_base.py`
* level 1: important functional tests that cover end-to-end workflows, such as `tests/pipelines/test_image_matting.py`
* level 2: scenario tests for all the implemented modules, such as models and pipelines in the different algorithm fields.

The default test level is 0, which runs only the level-0 cases. You can set the test level
via the environment variable `TEST_LEVEL`. For more details, refer to [test-doc](https://alidocs.dingtalk.com/i/nodes/mdvQnONayjBJKLXy1Bp38PY2MeXzp5o0?dontjump=true&nav=spaces&navQuery=spaceId%3Dnb9XJNlZxbgrOXyA).

```bash
# run all tests
TEST_LEVEL=2 make test

# run important functional tests
TEST_LEVEL=1 make test

# run core UT and basic functional tests
make test
```

-### 2.2 Test data
-TODO
When writing test cases, you should assign a test level to your test case using the
following code. If left at the default, the test level will be 0 and the case will run in each
test stage.

File `test_module.py`:
```python
import unittest

from modelscope.utils.test_utils import test_level


class ImageCartoonTest(unittest.TestCase):

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_by_direct_model_download(self):
        pass
```

### 2.2 Run tests

1. Run your own single test case to test your self-implemented function. You can run your
test file directly; if it fails to run, please check whether the variable `TEST_LEVEL`
exists in the environment and unset it.
```bash
python tests/path/to/your_test.py
```

2. Remember to run the core tests in your local environment before starting a code review; by default this
only runs test cases with level 0.
```bash
make test
```

3. After you start a code review, CI tests will be triggered, which run the test cases with level 1.

4. Daily regression tests run all cases at 0 am each day on the master branch.

### 2.3 Test data storage

As we need a lot of data for testing, including images, videos and models, we use git-lfs
to store those large files.

1. Install git-lfs.
For Mac:
```bash
brew install git-lfs
git lfs install
```

For CentOS, please download the rpm from the git-lfs GitHub release [website](https://github.com/git-lfs/git-lfs/releases/tag/v3.2.0):
```bash
wget http://101374-public.oss-cn-hangzhou-zmf.aliyuncs.com/git-lfs-3.2.0-1.el7.x86_64.rpm
sudo rpm -ivh git-lfs-3.2.0-1.el7.x86_64.rpm
git lfs install
```

For Ubuntu:
```bash
curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
sudo apt-get install git-lfs
git lfs install
```

2. Track your data type using git-lfs; for example, to track png files:
```bash
git lfs track "*.png"
```

3. Add your test files to the `data/test/` folder; you can create directories if you need them.
```bash
git add data/test/test.png
```

4. Commit your test data to the remote branch:
```bash
git commit -m "xxx"
```

To pull data from the remote repo, do it just the same way you pull git files:
```bash
git pull origin branch_name
```

## Code Review

@@ -93,3 +191,22 @@ TODO
```bash
make whl
```

## Build docker

Build the development docker image:
```bash
sudo make -f Makefile.docker devel-image
```

Push the development docker image (for the password, please ask wenmeng.zwm):
```bash
sudo docker login --username=mass_test@test.aliyunid.com registry.cn-shanghai.aliyuncs.com
Password:
sudo make -f Makefile.docker devel-push
```

To build the runtime image, just replace `devel` with `runtime` in the commands above:
```bash
sudo make -f Makefile.docker runtime-image runtime-push
```
@@ -2,4 +2,4 @@

from .base import Model
from .builder import MODELS, build_model
-from .nlp import BertForSequenceClassification
+from .nlp import BertForSequenceClassification, SbertForSentenceSimilarity
0	modelscope/models/audio/__init__.py Normal file
0	modelscope/models/audio/layers/__init__.py Normal file
60	modelscope/models/audio/layers/activations.py Normal file
@@ -0,0 +1,60 @@
import torch.nn as nn

from .layer_base import LayerBase


class RectifiedLinear(LayerBase):

    def __init__(self, input_dim, output_dim):
        super(RectifiedLinear, self).__init__()
        self.dim = input_dim
        self.relu = nn.ReLU()

    def forward(self, input):
        return self.relu(input)

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<RectifiedLinear> %d %d\n' % (self.dim, self.dim)
        return re_str

    def load_kaldi_nnet(self, instr):
        return instr


class LogSoftmax(LayerBase):

    def __init__(self, input_dim, output_dim):
        super(LogSoftmax, self).__init__()
        self.dim = input_dim
        self.ls = nn.LogSoftmax()

    def forward(self, input):
        return self.ls(input)

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<Softmax> %d %d\n' % (self.dim, self.dim)
        return re_str

    def load_kaldi_nnet(self, instr):
        return instr


class Sigmoid(LayerBase):

    def __init__(self, input_dim, output_dim):
        super(Sigmoid, self).__init__()
        self.dim = input_dim
        self.sig = nn.Sigmoid()

    def forward(self, input):
        return self.sig(input)

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<Sigmoid> %d %d\n' % (self.dim, self.dim)
        return re_str

    def load_kaldi_nnet(self, instr):
        return instr
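
For orientation, a minimal usage sketch of the activation layers above (not part of the commit); the import path is assumed from the file's location in this diff:

```python
# Hypothetical usage of the Kaldi-exportable activation layers; the import
# path below is an assumption based on where the file lives in this commit.
import torch
from modelscope.models.audio.layers.activations import RectifiedLinear

relu = RectifiedLinear(input_dim=4, output_dim=4)
y = relu(torch.randn(2, 4))    # forward is a plain element-wise ReLU
print(relu.to_kaldi_nnet())    # -> "<RectifiedLinear> 4 4"
```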
78	modelscope/models/audio/layers/affine_transform.py Normal file
@@ -0,0 +1,78 @@
import numpy as np
import torch as th
import torch.nn as nn

from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number,
                         to_kaldi_matrix)


class AffineTransform(LayerBase):

    def __init__(self, input_dim, output_dim):
        super(AffineTransform, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, input):
        return self.linear(input)

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<AffineTransform> %d %d\n' % (self.output_dim,
                                                 self.input_dim)
        re_str += '<LearnRateCoef> 1 <BiasLearnRateCoef> 1 <MaxNorm> 0\n'
        linear_weights = self.state_dict()['linear.weight']
        x = linear_weights.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        linear_bias = self.state_dict()['linear.bias']
        x = linear_bias.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        return re_str

    def to_raw_nnet(self, fid):
        linear_weights = self.state_dict()['linear.weight']
        x = linear_weights.squeeze().numpy()
        x.tofile(fid)

        linear_bias = self.state_dict()['linear.bias']
        x = linear_bias.squeeze().numpy()
        x.tofile(fid)

    def load_kaldi_nnet(self, instr):
        output = expect_token_number(
            instr,
            '<LearnRateCoef>',
        )
        if output is None:
            raise Exception('AffineTransform format error for <LearnRateCoef>')
        instr, lr = output

        output = expect_token_number(instr, '<BiasLearnRateCoef>')
        if output is None:
            raise Exception(
                'AffineTransform format error for <BiasLearnRateCoef>')
        instr, lr = output

        output = expect_token_number(instr, '<MaxNorm>')
        if output is None:
            raise Exception('AffineTransform format error for <MaxNorm>')
        instr, lr = output

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('AffineTransform format error for parsing matrix')
        instr, mat = output

        print(mat.shape)
        self.linear.weight = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('AffineTransform format error for parsing matrix')
        instr, mat = output
        mat = np.squeeze(mat)
        self.linear.bias = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))
        return instr
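
A short, hedged sketch of how this layer's Kaldi text export might be exercised (import path assumed from the file location above):

```python
# Hypothetical sketch: exporting an AffineTransform to Kaldi nnet1 text.
# The import path mirrors this file's location in the diff; treat it as an assumption.
import torch as th
from modelscope.models.audio.layers.affine_transform import AffineTransform

layer = AffineTransform(input_dim=3, output_dim=2)
out = layer(th.randn(5, 3))          # a plain nn.Linear forward: shape (5, 2)
text = layer.to_kaldi_nnet()         # header line plus weight and bias matrices
print(text.splitlines()[0])          # -> <AffineTransform> 2 3
```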
178	modelscope/models/audio/layers/deep_fsmn.py Normal file
@@ -0,0 +1,178 @@
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F

from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number,
                         to_kaldi_matrix)


class DeepFsmn(LayerBase):

    def __init__(self,
                 input_dim,
                 output_dim,
                 lorder=None,
                 rorder=None,
                 hidden_size=None,
                 layer_norm=False,
                 dropout=0):
        super(DeepFsmn, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim

        if lorder is None:
            return

        self.lorder = lorder
        self.rorder = rorder
        self.hidden_size = hidden_size
        self.layer_norm = layer_norm

        self.linear = nn.Linear(input_dim, hidden_size)
        self.norm = nn.LayerNorm(hidden_size)
        self.drop1 = nn.Dropout(p=dropout)
        self.drop2 = nn.Dropout(p=dropout)
        self.project = nn.Linear(hidden_size, output_dim, bias=False)

        self.conv1 = nn.Conv2d(
            output_dim,
            output_dim, [lorder, 1], [1, 1],
            groups=output_dim,
            bias=False)
        self.conv2 = nn.Conv2d(
            output_dim,
            output_dim, [rorder, 1], [1, 1],
            groups=output_dim,
            bias=False)

    def forward(self, input):

        f1 = F.relu(self.linear(input))

        f1 = self.drop1(f1)
        if self.layer_norm:
            f1 = self.norm(f1)

        p1 = self.project(f1)

        x = th.unsqueeze(p1, 1)

        x_per = x.permute(0, 3, 2, 1)

        y = F.pad(x_per, [0, 0, self.lorder - 1, 0])
        yr = F.pad(x_per, [0, 0, 0, self.rorder])
        yr = yr[:, :, 1:, :]

        out = x_per + self.conv1(y) + self.conv2(yr)
        out = self.drop2(out)

        out1 = out.permute(0, 3, 2, 1)

        return input + out1.squeeze()

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<UniDeepFsmn> %d %d\n'\
                  % (self.output_dim, self.input_dim)
        re_str += '<LearnRateCoef> %d <HidSize> %d <LOrder> %d <LStride> %d <MaxNorm> 0\n'\
                  % (1, self.hidden_size, self.lorder, 1)
        lfiters = self.state_dict()['conv1.weight']
        x = np.flipud(lfiters.squeeze().numpy().T)
        re_str += to_kaldi_matrix(x)
        proj_weights = self.state_dict()['project.weight']
        x = proj_weights.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        linear_weights = self.state_dict()['linear.weight']
        x = linear_weights.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        linear_bias = self.state_dict()['linear.bias']
        x = linear_bias.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        return re_str

    def load_kaldi_nnet(self, instr):
        output = expect_token_number(
            instr,
            '<LearnRateCoef>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <LearnRateCoef>')
        instr, lr = output

        output = expect_token_number(
            instr,
            '<HidSize>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <HidSize>')
        instr, hiddensize = output
        self.hidden_size = int(hiddensize)

        output = expect_token_number(
            instr,
            '<LOrder>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <LOrder>')
        instr, lorder = output
        self.lorder = int(lorder)

        output = expect_token_number(
            instr,
            '<LStride>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <LStride>')
        instr, lstride = output
        self.lstride = lstride

        output = expect_token_number(
            instr,
            '<MaxNorm>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <MaxNorm>')

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        mat1 = np.fliplr(mat.T).copy()
        self.conv1 = nn.Conv2d(
            self.output_dim,
            self.output_dim, [self.lorder, 1], [1, 1],
            groups=self.output_dim,
            bias=False)
        mat_th = th.from_numpy(mat1).type(th.FloatTensor)
        mat_th = mat_th.unsqueeze(1)
        mat_th = mat_th.unsqueeze(3)
        self.conv1.weight = th.nn.Parameter(mat_th)

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output

        self.project = nn.Linear(self.hidden_size, self.output_dim, bias=False)
        self.linear = nn.Linear(self.input_dim, self.hidden_size)

        self.project.weight = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        self.linear.weight = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        self.linear.bias = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        return instr
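
As a quick shape sanity check, a hedged sketch (not part of the commit) of one DeepFsmn forward pass; the import path is assumed from the file location above:

```python
# Hedged sketch: one forward pass through DeepFsmn on dummy features.
# lorder/rorder are the left/right memory orders of the two depthwise convolutions.
import torch as th
from modelscope.models.audio.layers.deep_fsmn import DeepFsmn  # assumed path

layer = DeepFsmn(input_dim=16, output_dim=16, lorder=3, rorder=2, hidden_size=32)
x = th.randn(4, 50, 16)    # [batch, frames, features]
y = layer(x)               # note: the residual add requires output_dim == input_dim
print(y.shape)             # torch.Size([4, 50, 16])
```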
50	modelscope/models/audio/layers/layer_base.py Normal file
@@ -0,0 +1,50 @@
import abc
import re

import numpy as np
import torch.nn as nn


def expect_token_number(instr, token):
    first_token = re.match(r'^\s*' + token, instr)
    if first_token is None:
        return None
    instr = instr[first_token.end():]
    lr = re.match(r'^\s*(-?\d+\.?\d*e?-?\d*?)', instr)
    if lr is None:
        return None
    return instr[lr.end():], lr.groups()[0]


def expect_kaldi_matrix(instr):
    pos2 = instr.find('[', 0)
    pos3 = instr.find(']', pos2)
    mat = []
    for stt in instr[pos2 + 1:pos3].split('\n'):
        tmp_mat = np.fromstring(stt, dtype=np.float32, sep=' ')
        if tmp_mat.size > 0:
            mat.append(tmp_mat)
    return instr[pos3 + 1:], np.array(mat)


def to_kaldi_matrix(np_mat):
    """Transform a numpy matrix into a standard Kaldi text-format matrix string.

    :param np_mat: numpy matrix
    :return: str
    """
    np.set_printoptions(threshold=np.inf, linewidth=np.nan, suppress=True)
    out_str = str(np_mat)
    out_str = out_str.replace('[', '')
    out_str = out_str.replace(']', '')
    return '[ %s ]\n' % out_str


class LayerBase(nn.Module, metaclass=abc.ABCMeta):

    def __init__(self):
        super(LayerBase, self).__init__()

    @abc.abstractmethod
    def to_kaldi_nnet(self):
        pass
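
A minimal round-trip sketch of the parsing helpers above (import path assumed): it serializes a matrix with `to_kaldi_matrix` and reads it back with `expect_token_number` / `expect_kaldi_matrix`:

```python
# Hedged sketch of the Kaldi text helpers; the import path is an assumption
# based on this file's location in the commit.
import numpy as np
from modelscope.models.audio.layers.layer_base import (
    expect_kaldi_matrix, expect_token_number, to_kaldi_matrix)

mat = np.arange(6, dtype=np.float32).reshape(2, 3)
text = '<LearnRateCoef> 1 ' + to_kaldi_matrix(mat)

rest, lr = expect_token_number(text, '<LearnRateCoef>')  # -> lr == '1'
rest, parsed = expect_kaldi_matrix(rest)                 # parses the "[ ... ]" block
assert np.allclose(parsed, mat)
```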
482	modelscope/models/audio/layers/uni_deep_fsmn.py Normal file
@@ -0,0 +1,482 @@
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F

from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number,
                         to_kaldi_matrix)


class SepConv(nn.Module):

    def __init__(self,
                 in_channels,
                 filters,
                 out_channels,
                 kernel_size=(5, 2),
                 dilation=(1, 1)):
        """ :param kernel_size: (time, frequency)
        """
        super(SepConv, self).__init__()
        # depthwise + pointwise
        self.dconv = nn.Conv2d(
            in_channels,
            in_channels * filters,
            kernel_size,
            dilation=dilation,
            groups=in_channels)
        self.pconv = nn.Conv2d(
            in_channels * filters, out_channels, kernel_size=1)
        self.padding = dilation[0] * (kernel_size[0] - 1)

    def forward(self, input):
        ''' input: [B, C, T, F]
        '''
        x = F.pad(input, [0, 0, self.padding, 0])
        x = self.dconv(x)
        x = self.pconv(x)
        return x


class Conv2d(nn.Module):

    def __init__(self,
                 input_dim,
                 output_dim,
                 lorder=20,
                 rorder=0,
                 groups=1,
                 bias=False,
                 skip_connect=True):
        super(Conv2d, self).__init__()
        self.lorder = lorder
        self.conv = nn.Conv2d(
            input_dim, output_dim, [lorder, 1], groups=groups, bias=bias)
        self.rorder = rorder
        if self.rorder:
            self.conv2 = nn.Conv2d(
                input_dim, output_dim, [rorder, 1], groups=groups, bias=bias)
        self.skip_connect = skip_connect

    def forward(self, input):
        # [B, 1, T, F]
        x = th.unsqueeze(input, 1)
        # [B, F, T, 1]
        x_per = x.permute(0, 3, 2, 1)
        y = F.pad(x_per, [0, 0, self.lorder - 1, 0])
        out = self.conv(y)
        if self.rorder:
            yr = F.pad(x_per, [0, 0, 0, self.rorder])
            yr = yr[:, :, 1:, :]
            out += self.conv2(yr)
        out = out.permute(0, 3, 2, 1).squeeze(1)
        if self.skip_connect:
            out = out + input
        return out


class SelfAttLayer(nn.Module):

    def __init__(self, input_dim, output_dim, lorder=None, hidden_size=None):
        super(SelfAttLayer, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim

        if lorder is None:
            return

        self.lorder = lorder
        self.hidden_size = hidden_size

        self.linear = nn.Linear(input_dim, hidden_size)

        self.project = nn.Linear(hidden_size, output_dim, bias=False)

        self.att = nn.Linear(input_dim, lorder, bias=False)

    def forward(self, input):

        f1 = F.relu(self.linear(input))

        p1 = self.project(f1)

        x = th.unsqueeze(p1, 1)

        x_per = x.permute(0, 3, 2, 1)

        y = F.pad(x_per, [0, 0, self.lorder - 1, 0])

        # z [B, F, T, lorder]
        z = x_per
        for i in range(1, self.lorder):
            z = th.cat([z, y[:, :, self.lorder - 1 - i:-i, :]], axis=-1)

        # [B, T, lorder]
        att = F.softmax(self.att(input), dim=-1)
        att = th.unsqueeze(att, 1)
        z = th.sum(z * att, axis=-1)

        out1 = z.permute(0, 2, 1)

        return input + out1


class TFFsmn(nn.Module):

    def __init__(self,
                 input_dim,
                 output_dim,
                 lorder=None,
                 hidden_size=None,
                 dilation=1,
                 layer_norm=False,
                 dropout=0,
                 skip_connect=True):
        super(TFFsmn, self).__init__()

        self.skip_connect = skip_connect

        self.linear = nn.Linear(input_dim, hidden_size)
        self.norm = nn.Identity()
        if layer_norm:
            self.norm = nn.LayerNorm(input_dim)
        self.act = nn.ReLU()
        self.project = nn.Linear(hidden_size, output_dim, bias=False)

        self.conv1 = nn.Conv2d(
            output_dim,
            output_dim, [lorder, 1],
            dilation=[dilation, 1],
            groups=output_dim,
            bias=False)
        self.padding_left = dilation * (lorder - 1)
        dorder = 5
        self.conv2 = nn.Conv2d(1, 1, [dorder, 1], bias=False)
        self.padding_freq = dorder - 1

    def forward(self, input):
        return self.compute1(input)

    def compute1(self, input):
        ''' linear-dconv-relu(norm)-linear-dconv
        '''
        x = self.linear(input)
        # [B, 1, F, T]
        x = th.unsqueeze(x, 1).permute(0, 1, 3, 2)
        z = F.pad(x, [0, 0, self.padding_freq, 0])
        z = self.conv2(z) + x
        x = z.permute(0, 3, 2, 1).squeeze(-1)
        x = self.act(x)
        x = self.norm(x)
        x = self.project(x)
        x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
        # [B, F, T+lorder-1, 1]
        y = F.pad(x, [0, 0, self.padding_left, 0])
        out = self.conv1(y)
        if self.skip_connect:
            out = out + x
        out = out.permute(0, 3, 2, 1).squeeze()

        return input + out


class CNNFsmn(nn.Module):
    ''' use cnn to reduce parameters
    '''

    def __init__(self,
                 input_dim,
                 output_dim,
                 lorder=None,
                 hidden_size=None,
                 dilation=1,
                 layer_norm=False,
                 dropout=0,
                 skip_connect=True):
        super(CNNFsmn, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.skip_connect = skip_connect

        if lorder is None:
            return

        self.lorder = lorder
        self.hidden_size = hidden_size

        self.linear = nn.Linear(input_dim, hidden_size)
        self.act = nn.ReLU()
        kernel_size = (3, 8)
        stride = (1, 4)
        self.conv = nn.Sequential(
            nn.ConstantPad2d((stride[1], 0, kernel_size[0] - 1, 0), 0),
            nn.Conv2d(1, stride[1], kernel_size=kernel_size, stride=stride))

        self.dconv = nn.Conv2d(
            output_dim,
            output_dim, [lorder, 1],
            dilation=[dilation, 1],
            groups=output_dim,
            bias=False)
        self.padding_left = dilation * (lorder - 1)

    def forward(self, input):
        return self.compute2(input)

    def compute1(self, input):
        ''' linear-relu(norm)-conv2d-relu?-dconv
        '''
        # [B, T, F]
        x = self.linear(input)
        x = self.act(x)
        x = th.unsqueeze(x, 1)
        x = self.conv(x)
        # [B, C, T, F] -> [B, 1, T, F]
        b, c, t, f = x.shape
        x = x.view([b, 1, t, -1])
        x = x.permute(0, 3, 2, 1)
        # [B, F, T+lorder-1, 1]
        y = F.pad(x, [0, 0, self.padding_left, 0])
        out = self.dconv(y)
        if self.skip_connect:
            out = out + x
        out = out.permute(0, 3, 2, 1).squeeze()
        return input + out

    def compute2(self, input):
        ''' conv2d-relu-linear-relu?-dconv
        '''
        x = th.unsqueeze(input, 1)
        x = self.conv(x)
        x = self.act(x)
        # [B, C, T, F] -> [B, T, F]
        b, c, t, f = x.shape
        x = x.view([b, t, -1])
        x = self.linear(x)
        x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
        y = F.pad(x, [0, 0, self.padding_left, 0])
        out = self.dconv(y)
        if self.skip_connect:
            out = out + x
        out = out.permute(0, 3, 2, 1).squeeze()
        return input + out


class UniDeepFsmn(LayerBase):

    def __init__(self,
                 input_dim,
                 output_dim,
                 lorder=None,
                 hidden_size=None,
                 dilation=1,
                 layer_norm=False,
                 dropout=0,
                 skip_connect=True):
        super(UniDeepFsmn, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.skip_connect = skip_connect

        if lorder is None:
            return

        self.lorder = lorder
        self.hidden_size = hidden_size

        self.linear = nn.Linear(input_dim, hidden_size)
        self.norm = nn.Identity()
        if layer_norm:
            self.norm = nn.LayerNorm(input_dim)
        self.act = nn.ReLU()
        self.project = nn.Linear(hidden_size, output_dim, bias=False)

        self.conv1 = nn.Conv2d(
            output_dim,
            output_dim, [lorder, 1],
            dilation=[dilation, 1],
            groups=output_dim,
            bias=False)
        self.padding_left = dilation * (lorder - 1)

    def forward(self, input):
        return self.compute1(input)

    def compute1(self, input):
        ''' linear-relu(norm)-linear-dconv
        '''
        # [B, T, F]
        x = self.linear(input)
        x = self.act(x)
        x = self.norm(x)
        x = self.project(x)
        x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
        # [B, F, T+lorder-1, 1]
        y = F.pad(x, [0, 0, self.padding_left, 0])
        out = self.conv1(y)
        if self.skip_connect:
            out = out + x
        out = out.permute(0, 3, 2, 1).squeeze()

        return input + out

    def compute2(self, input):
        ''' linear-dconv-linear-relu(norm)
        '''
        x = self.project(input)
        x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
        y = F.pad(x, [0, 0, self.padding_left, 0])
        out = self.conv1(y)
        if self.skip_connect:
            out = out + x
        out = out.permute(0, 3, 2, 1).squeeze()
        x = self.linear(out)
        x = self.act(x)
        x = self.norm(x)

        return input + x

    def compute3(self, input):
        ''' dconv-linear-relu(norm)-linear
        '''
        x = th.unsqueeze(input, 1).permute(0, 3, 2, 1)
        y = F.pad(x, [0, 0, self.padding_left, 0])
        out = self.conv1(y)
        if self.skip_connect:
            out = out + x
        out = out.permute(0, 3, 2, 1).squeeze()
        x = self.linear(out)
        x = self.act(x)
        x = self.norm(x)
        x = self.project(x)

        return input + x

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<UniDeepFsmn> %d %d\n' \
                  % (self.output_dim, self.input_dim)
        re_str += '<LearnRateCoef> %d <HidSize> %d <LOrder> %d <LStride> %d <MaxNorm> 0\n' \
                  % (1, self.hidden_size, self.lorder, 1)
        lfiters = self.state_dict()['conv1.weight']
        x = np.flipud(lfiters.squeeze().numpy().T)
        re_str += to_kaldi_matrix(x)
        proj_weights = self.state_dict()['project.weight']
        x = proj_weights.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        linear_weights = self.state_dict()['linear.weight']
        x = linear_weights.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        linear_bias = self.state_dict()['linear.bias']
        x = linear_bias.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        return re_str

    def to_raw_nnet(self, fid):
        lfiters = self.state_dict()['conv1.weight']
        x = np.flipud(lfiters.squeeze().numpy().T)
        x.tofile(fid)

        proj_weights = self.state_dict()['project.weight']
        x = proj_weights.squeeze().numpy()
        x.tofile(fid)

        linear_weights = self.state_dict()['linear.weight']
        x = linear_weights.squeeze().numpy()
        x.tofile(fid)

        linear_bias = self.state_dict()['linear.bias']
        x = linear_bias.squeeze().numpy()
        x.tofile(fid)

    def load_kaldi_nnet(self, instr):
        output = expect_token_number(
            instr,
            '<LearnRateCoef>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <LearnRateCoef>')
        instr, lr = output

        output = expect_token_number(
            instr,
            '<HidSize>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <HidSize>')
        instr, hiddensize = output
        self.hidden_size = int(hiddensize)

        output = expect_token_number(
            instr,
            '<LOrder>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <LOrder>')
        instr, lorder = output
        self.lorder = int(lorder)

        output = expect_token_number(
            instr,
            '<LStride>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <LStride>')
        instr, lstride = output
        self.lstride = lstride

        output = expect_token_number(
            instr,
            '<MaxNorm>',
        )
        if output is None:
            raise Exception('UniDeepFsmn format error for <MaxNorm>')

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        mat1 = np.fliplr(mat.T).copy()

        self.conv1 = nn.Conv2d(
            self.output_dim,
            self.output_dim, [self.lorder, 1], [1, 1],
            groups=self.output_dim,
            bias=False)

        mat_th = th.from_numpy(mat1).type(th.FloatTensor)
        mat_th = mat_th.unsqueeze(1)
        mat_th = mat_th.unsqueeze(3)
        self.conv1.weight = th.nn.Parameter(mat_th)

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output

        self.project = nn.Linear(self.hidden_size, self.output_dim, bias=False)
        self.linear = nn.Linear(self.input_dim, self.hidden_size)

        self.project.weight = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        self.linear.weight = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        mat = np.squeeze(mat)
        self.linear.bias = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        return instr
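
For orientation, a hedged sketch of the default `compute1` path through `UniDeepFsmn` (import path assumed from the file location above):

```python
# Hedged sketch: the causal FSMN block on a random batch; not part of the commit.
import torch as th
from modelscope.models.audio.layers.uni_deep_fsmn import UniDeepFsmn  # assumed path

block = UniDeepFsmn(input_dim=16, output_dim=16, lorder=20, hidden_size=32)
x = th.randn(4, 100, 16)   # [batch, frames, features]
y = block(x)               # compute1: linear -> relu -> project -> causal dconv, residual
print(y.shape)             # torch.Size([4, 100, 16])
```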
0	modelscope/models/audio/network/__init__.py Normal file
394	modelscope/models/audio/network/loss.py Normal file
@@ -0,0 +1,394 @@
import torch
import torch.nn.functional as F

from .modulation_loss import (GaborSTRFConv, MelScale,
                              ModulationDomainLossModule)

EPS = 1e-8


def compute_mask(mixed_spec, clean_spec, mask_type='psmiam', clip=1):
    '''
    stft: (batch, ..., 2) or complex (batch, ...)
    y = x + n
    '''
    if torch.is_complex(mixed_spec):
        yr, yi = mixed_spec.real, mixed_spec.imag
    else:
        yr, yi = mixed_spec[..., 0], mixed_spec[..., 1]
    if torch.is_complex(clean_spec):
        xr, xi = clean_spec.real, clean_spec.imag
    else:
        xr, xi = clean_spec[..., 0], clean_spec[..., 1]

    if mask_type == 'iam':
        ymag = torch.sqrt(yr**2 + yi**2)
        xmag = torch.sqrt(xr**2 + xi**2)
        iam = xmag / (ymag + EPS)
        return torch.clamp(iam, 0, 1)

    elif mask_type == 'psm':
        ypow = yr**2 + yi**2
        psm = (xr * yr + xi * yi) / (ypow + EPS)
        return torch.clamp(psm, 0, 1)

    elif mask_type == 'psmiam':
        ypow = yr**2 + yi**2
        psm = (xr * yr + xi * yi) / (ypow + EPS)
        ymag = torch.sqrt(yr**2 + yi**2)
        xmag = torch.sqrt(xr**2 + xi**2)
        iam = xmag / (ymag + EPS)
        psmiam = psm * iam
        return torch.clamp(psmiam, 0, 1)

    elif mask_type == 'crm':
        ypow = yr**2 + yi**2
        mr = (xr * yr + xi * yi) / (ypow + EPS)
        mi = (xi * yr - xr * yi) / (ypow + EPS)
        mr = torch.clamp(mr, -clip, clip)
        mi = torch.clamp(mi, -clip, clip)
        return mr, mi


def energy_vad(spec,
               thdhigh=320 * 600 * 600 * 2,
               thdlow=320 * 300 * 300 * 2,
               int16=True):
    '''
    energy-based vad should be accurate enough
    spec: (batch, bins, frames, 2)
    returns (batch, frames)
    '''
    energy = torch.sum(spec[..., 0]**2 + spec[..., 1]**2, dim=1)
    vad = energy > thdhigh
    idx = torch.logical_and(vad == 0, energy > thdlow)
    vad[idx] = 0.5
    return vad


def modulation_loss_init(n_fft):
    gabor_strf_parameters = torch.load(
        './network/gabor_strf_parameters.pt')['state_dict']
    gabor_modulation_kernels = GaborSTRFConv(supn=30, supk=30, nkern=60)
    gabor_modulation_kernels.load_state_dict(gabor_strf_parameters)

    modulation_loss_module = ModulationDomainLossModule(
        gabor_modulation_kernels.eval())
    for param in modulation_loss_module.parameters():
        param.requires_grad = False

    stft2mel = MelScale(
        n_mels=80, sample_rate=16000, n_stft=n_fft // 2 + 1).cuda()

    return modulation_loss_module, stft2mel


def mask_loss_function(
        loss_func='psm_loss',
        loss_type='mse',  # ['mse', 'mae', 'comb']
        mask_type='psmiam',
        use_mod_loss=False,
        use_wav2vec_loss=False,
        n_fft=640,
        hop_length=320,
        EPS=1e-8,
        weight=None):
    if weight is not None:
        print(f'Use loss weight: {weight}')
    winlen = n_fft
    window = torch.hamming_window(winlen, periodic=False)

    def stft(x, return_complex=False):
        # returns [batch, bins, frames, 2]
        return torch.stft(
            x,
            n_fft,
            hop_length,
            winlen,
            window=window.to(x.device),
            center=False,
            return_complex=return_complex)

    def istft(x, slen):
        return torch.istft(
            x,
            n_fft,
            hop_length,
            winlen,
            window=window.to(x.device),
            center=False,
            length=slen)

    def mask_loss(targets, masks, nframes):
        ''' [Batch, Time, Frequency]
        '''
        with torch.no_grad():
            mask_for_loss = torch.ones_like(targets)
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        masks = masks * mask_for_loss
        targets = targets * mask_for_loss

        if weight is None:
            alpha = 1
        else:  # for aec ST
            alpha = weight - targets

        if loss_type == 'mse':
            loss = 0.5 * torch.sum(alpha * torch.pow(targets - masks, 2))
        elif loss_type == 'mae':
            loss = torch.sum(alpha * torch.abs(targets - masks))
        else:  # mse(mask) : mae(mask) is approx 1:2
            loss = 0.5 * torch.sum(alpha * torch.pow(targets - masks, 2)
                                   + 0.1 * alpha * torch.abs(targets - masks))
        loss /= torch.sum(nframes)
        return loss

    def spectrum_loss(targets, spec, nframes):
        ''' [Batch, Time, Frequency, 2]
        '''
        with torch.no_grad():
            mask_for_loss = torch.ones_like(targets[..., 0])
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        xr = spec[..., 0] * mask_for_loss
        xi = spec[..., 1] * mask_for_loss
        yr = targets[..., 0] * mask_for_loss
        yi = targets[..., 1] * mask_for_loss
        xmag = torch.sqrt(spec[..., 0]**2 + spec[..., 1]**2) * mask_for_loss
        ymag = torch.sqrt(targets[..., 0]**2
                          + targets[..., 1]**2) * mask_for_loss

        loss1 = torch.sum(torch.pow(xr - yr, 2) + torch.pow(xi - yi, 2))
        loss2 = torch.sum(torch.pow(xmag - ymag, 2))

        loss = (loss1 + loss2) / torch.sum(nframes)
        return loss

    def sa_loss_dlen(mixed, clean, masks, nframes):
        yspec = stft(mixed).permute([0, 2, 1, 3]) / 32768
        xspec = stft(clean).permute([0, 2, 1, 3]) / 32768
        with torch.no_grad():
            mask_for_loss = torch.ones_like(xspec[..., 0])
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        emag = ((yspec[..., 0]**2 + yspec[..., 1]**2)**0.15) * (masks**0.3)
        xmag = (xspec[..., 0]**2 + xspec[..., 1]**2)**0.15
        emag = emag * mask_for_loss
        xmag = xmag * mask_for_loss

        loss = torch.sum(torch.pow(emag - xmag, 2)) / torch.sum(nframes)
        return loss

    def psm_vad_loss_dlen(mixed, clean, masks, nframes, subtask=None):
        mixed_spec = stft(mixed)
        clean_spec = stft(clean)
        targets = compute_mask(mixed_spec, clean_spec, mask_type)
        # [B, T, F]
        targets = targets.permute(0, 2, 1)

        loss = mask_loss(targets, masks, nframes)

        if subtask is not None:
            vadtargets = energy_vad(clean_spec)
            with torch.no_grad():
                mask_for_loss = torch.ones_like(targets[:, :, 0])
                for idx, num in enumerate(nframes):
                    mask_for_loss[idx, num:] = 0
            subtask = subtask[:, :, 0] * mask_for_loss
            vadtargets = vadtargets * mask_for_loss

            loss_vad = F.binary_cross_entropy(subtask, vadtargets)
            return loss + loss_vad
        return loss

    def modulation_loss(mixed, clean, masks, nframes, subtask=None):
        # NOTE: relies on modulation_loss_module and stft2mel (e.g. as returned
        # by modulation_loss_init) being available in the enclosing scope.
        mixed_spec = stft(mixed, True)
        clean_spec = stft(clean, True)
        enhanced_mag = torch.abs(mixed_spec)
        clean_mag = torch.abs(clean_spec)
        with torch.no_grad():
            mask_for_loss = torch.ones_like(clean_mag)
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, :, num:] = 0
        clean_mag = clean_mag * mask_for_loss
        enhanced_mag = enhanced_mag * mask_for_loss * masks.permute([0, 2, 1])

        # Convert to the log-mel representation
        # (B, T, #mel_channels)
        clean_log_mel = torch.log(
            torch.transpose(stft2mel(clean_mag**2), 2, 1) + 1e-8)
        enhanced_log_mel = torch.log(
            torch.transpose(stft2mel(enhanced_mag**2), 2, 1) + 1e-8)

        alpha = compute_mask(mixed_spec, clean_spec, mask_type)
        alpha = alpha.permute(0, 2, 1)
        loss = 0.05 * modulation_loss_module(enhanced_log_mel, clean_log_mel,
                                             alpha)
        loss2 = psm_vad_loss_dlen(mixed, clean, masks, nframes, subtask)
        # print(loss.item(), loss2.item())  # approx 1:4
        loss = loss + loss2
        return loss

    def wav2vec_loss(mixed, clean, masks, nframes, subtask=None):
        # NOTE: relies on a wav2vec_loss_module being defined in the enclosing scope.
        mixed /= 32768
        clean /= 32768
        mixed_spec = stft(mixed)
        with torch.no_grad():
            mask_for_loss = torch.ones_like(masks)
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        masks_est = masks * mask_for_loss

        estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3)
        est_clean = istft(estimate, clean.shape[1])
        loss = wav2vec_loss_module(est_clean, clean)
        return loss

    def sisdr_loss_dlen(mixed,
                        clean,
                        masks,
                        nframes,
                        subtask=None,
                        zero_mean=True):
        mixed_spec = stft(mixed)
        with torch.no_grad():
            mask_for_loss = torch.ones_like(masks)
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        masks_est = masks * mask_for_loss

        estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3)
        est_clean = istft(estimate, clean.shape[1])
        flen = min(clean.shape[1], est_clean.shape[1])
        clean = clean[:, :flen]
        est_clean = est_clean[:, :flen]

        # follow asteroid/losses/sdr.py
        if zero_mean:
            clean = clean - torch.mean(clean, dim=1, keepdim=True)
            est_clean = est_clean - torch.mean(est_clean, dim=1, keepdim=True)

        dot = torch.sum(est_clean * clean, dim=1, keepdim=True)
        s_clean_energy = torch.sum(clean**2, dim=1, keepdim=True) + EPS
        scaled_clean = dot * clean / s_clean_energy
        e_noise = est_clean - scaled_clean

        # [batch]
        sisdr = torch.sum(
            scaled_clean**2, dim=1) / (
                torch.sum(e_noise**2, dim=1) + EPS)
        sisdr = -10 * torch.log10(sisdr + EPS)
        loss = sisdr.mean()
        return loss

    def sisdr_freq_loss_dlen(mixed, clean, masks, nframes, subtask=None):
        mixed_spec = stft(mixed)
        clean_spec = stft(clean)
        with torch.no_grad():
            mask_for_loss = torch.ones_like(masks)
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        masks_est = masks * mask_for_loss

        estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3)

        dot_real = estimate[..., 0] * clean_spec[..., 0] + \
            estimate[..., 1] * clean_spec[..., 1]
        dot_imag = estimate[..., 0] * clean_spec[..., 1] - \
            estimate[..., 1] * clean_spec[..., 0]
        dot = torch.cat([dot_real.unsqueeze(3), dot_imag.unsqueeze(3)], dim=-1)
        s_clean_energy = clean_spec[..., 0] ** 2 + \
            clean_spec[..., 1] ** 2 + EPS
        scaled_clean = dot * clean_spec / s_clean_energy.unsqueeze(3)
        e_noise = estimate - scaled_clean

        # [batch]
        scaled_clean_energy = torch.sum(
            scaled_clean[..., 0]**2 + scaled_clean[..., 1]**2, dim=1)
        e_noise_energy = torch.sum(
            e_noise[..., 0]**2 + e_noise[..., 1]**2, dim=1)
        sisdr = torch.sum(
            scaled_clean_energy, dim=1) / (
                torch.sum(e_noise_energy, dim=1) + EPS)
        sisdr = -10 * torch.log10(sisdr + EPS)
        loss = sisdr.mean()
        return loss

    def crm_loss_dlen(mixed, clean, masks, nframes, subtask=None):
        mixed_spec = stft(mixed).permute([0, 2, 1, 3])
        clean_spec = stft(clean).permute([0, 2, 1, 3])
        mixed_spec = mixed_spec / 32768
        clean_spec = clean_spec / 32768
        tgt_mr, tgt_mi = compute_mask(mixed_spec, clean_spec, mask_type='crm')

        D = int(masks.shape[2] / 2)
        with torch.no_grad():
            mask_for_loss = torch.ones_like(clean_spec[..., 0])
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        mr = masks[..., :D] * mask_for_loss
        mi = masks[..., D:] * mask_for_loss
        tgt_mr = tgt_mr * mask_for_loss
        tgt_mi = tgt_mi * mask_for_loss

        if weight is None:
            alpha = 1
        else:
            alpha = weight - tgt_mr
        # signal approximation
        yr = mixed_spec[..., 0]
        yi = mixed_spec[..., 1]
        loss1 = torch.sum(alpha * torch.pow((mr * yr - mi * yi) - clean_spec[..., 0], 2)) \
            + torch.sum(alpha * torch.pow((mr * yi + mi * yr) - clean_spec[..., 1], 2))
        # mask approximation
        loss2 = torch.sum(alpha * torch.pow(mr - tgt_mr, 2)) \
            + torch.sum(alpha * torch.pow(mi - tgt_mi, 2))
        loss = 0.5 * (loss1 + loss2) / torch.sum(nframes)
        return loss

    def crm_miso_loss_dlen(mixed, clean, masks, nframes):
        return crm_loss_dlen(mixed[..., 0], clean[..., 0], masks, nframes)

    def mimo_loss_dlen(mixed, clean, masks, nframes):
        chs = mixed.shape[-1]
        D = masks.shape[2] // chs
        loss = psm_vad_loss_dlen(mixed[..., 0], clean[..., 0], masks[..., :D],
                                 nframes)
        for ch in range(1, chs):
            loss1 = psm_vad_loss_dlen(mixed[..., ch], clean[..., ch],
                                      masks[..., ch * D:ch * D + D], nframes)
            loss = loss + loss1
        return loss / chs

    def spec_loss_dlen(mixed, clean, spec, nframes):
        clean_spec = stft(clean).permute([0, 2, 1, 3])
        clean_spec = clean_spec / 32768

        D = spec.shape[2] // 2
        spec_est = torch.cat([spec[..., :D, None], spec[..., D:, None]],
                             dim=-1)
        loss = spectrum_loss(clean_spec, spec_est, nframes)
        return loss

    if loss_func == 'psm_vad_loss_dlen':
        return psm_vad_loss_dlen
    elif loss_func == 'sisdr_loss_dlen':
        return sisdr_loss_dlen
    elif loss_func == 'sisdr_freq_loss_dlen':
        return sisdr_freq_loss_dlen
    elif loss_func == 'crm_loss_dlen':
        return crm_loss_dlen
    elif loss_func == 'modulation_loss':
        return modulation_loss
    elif loss_func == 'wav2vec_loss':
        return wav2vec_loss
    elif loss_func == 'mimo_loss_dlen':
        return mimo_loss_dlen
    elif loss_func == 'spec_loss_dlen':
        return spec_loss_dlen
    elif loss_func == 'sa_loss_dlen':
        return sa_loss_dlen
    else:
        print('error loss func')
        return None
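
A hedged sketch of how the factory above might be exercised on dummy tensors, under the commit's PyTorch 1.10 environment; the import path and shapes are assumptions taken from the code's own comments:

```python
# Hypothetical smoke test of the PSM mask loss; not part of the commit.
import torch
from modelscope.models.audio.network.loss import mask_loss_function  # assumed path

loss_fn = mask_loss_function(loss_func='psm_vad_loss_dlen', n_fft=640, hop_length=320)

mixed = torch.randn(2, 16000)            # [batch, samples]
clean = 0.5 * mixed                      # stand-in for the clean reference signal
n_frames = (16000 - 640) // 320 + 1      # 49 frames with center=False framing
masks = torch.rand(2, n_frames, 321)     # [batch, frames, bins], bins = n_fft // 2 + 1
loss = loss_fn(mixed, clean, masks, torch.tensor([n_frames, n_frames]))
print(loss.item())
```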
248
modelscope/models/audio/network/modulation_loss.py
Normal file
248
modelscope/models/audio/network/modulation_loss.py
Normal file
@@ -0,0 +1,248 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchaudio.transforms import MelScale


class ModulationDomainLossModule(torch.nn.Module):
    """Modulation-domain loss function developed in [1] for supervised speech enhancement.

    In our paper, we used the Gabor-based STRF kernels as the modulation kernels and the
    log-mel spectrogram as the input spectrogram representation.
    Specific parameter details are in the paper and in the example below.

    Parameters
    ----------
    modulation_kernels: nn.Module
        Differentiable module that transforms a spectrogram representation to the modulation domain

        modulation_domain = modulation_kernels(input_tf_representation)
        Input spectrogram representation (B, T, F) ---> |(M) modulation_kernels| ---> modulation domain (B, M, T', F')

    norm: boolean
        Normalizes the modulation-domain representation to be zero-mean across time

    [1] T. Vuong, Y. Xia, and R. M. Stern, "A modulation-domain loss for neural-network-based
        real-time speech enhancement", accepted at ICASSP 2021, https://arxiv.org/abs/2102.07330
    """

    def __init__(self, modulation_kernels, norm=True):
        super(ModulationDomainLossModule, self).__init__()

        self.modulation_kernels = modulation_kernels
        self.mse = nn.MSELoss(reduce=False)
        self.norm = norm

    def forward(self, enhanced_spect, clean_spect, weight=None):
        """Calculate the modulation-domain loss.

        Args:
            enhanced_spect (Tensor): spectrogram representation of the enhanced signal (B, #frames, #freq_channels).
            clean_spect (Tensor): spectrogram representation of the clean ground-truth signal (B, #frames, #freq_channels).

        Returns:
            Tensor: modulation-domain loss value.
        """
        clean_mod = self.modulation_kernels(clean_spect)
        enhanced_mod = self.modulation_kernels(enhanced_spect)

        if self.norm:
            mean_clean_mod = torch.mean(clean_mod, dim=2)
            mean_enhanced_mod = torch.mean(enhanced_mod, dim=2)

            clean_mod = clean_mod - mean_clean_mod.unsqueeze(2)
            enhanced_mod = enhanced_mod - mean_enhanced_mod.unsqueeze(2)

        if weight is None:
            alpha = 1
        else:  # TF-mask weight
            alpha = 1 + torch.sum(weight, dim=-1, keepdim=True).unsqueeze(1)
        mod_mse_loss = self.mse(enhanced_mod, clean_mod) * alpha
        mod_mse_loss = torch.mean(
            torch.sum(mod_mse_loss, dim=(1, 2, 3))
            / torch.sum(clean_mod**2, dim=(1, 2, 3)))

        return mod_mse_loss


class ModulationDomainNCCLossModule(torch.nn.Module):
    """Modulation-domain NCC loss function developed in [1] for supervised speech enhancement.

    # Based on "Speech Intelligibility Prediction Using Spectro-Temporal Modulation Analysis"

    In our paper, we used the Gabor-based STRF kernels as the modulation kernels and the
    log-mel spectrogram as the input spectrogram representation.
    Specific parameter details are in the paper and in the example below.

    Parameters
    ----------
    modulation_kernels: nn.Module
        Differentiable module that transforms a spectrogram representation to the modulation domain

        modulation_domain = modulation_kernels(input_tf_representation)
        Input spectrogram representation (B, T, F) --- (M) modulation_kernels ---> modulation domain (B, M, T', F')

    [1]
    """

    def __init__(self, modulation_kernels):
        super(ModulationDomainNCCLossModule, self).__init__()

        self.modulation_kernels = modulation_kernels
        self.mse = nn.MSELoss(reduce=False)

    def forward(self, enhanced_spect, clean_spect):
        """Calculate the modulation-domain loss.

        Args:
            enhanced_spect (Tensor): spectrogram representation of the enhanced signal (B, #frames, #freq_channels).
            clean_spect (Tensor): spectrogram representation of the clean ground-truth signal (B, #frames, #freq_channels).

        Returns:
            Tensor: modulation-domain loss value.
        """
        clean_mod = self.modulation_kernels(clean_spect)
        enhanced_mod = self.modulation_kernels(enhanced_spect)
        mean_clean_mod = torch.mean(clean_mod, dim=2)
        mean_enhanced_mod = torch.mean(enhanced_mod, dim=2)

        normalized_clean = clean_mod - mean_clean_mod.unsqueeze(2)
        normalized_enhanced = enhanced_mod - mean_enhanced_mod.unsqueeze(2)

        inner_product = torch.sum(
            normalized_clean * normalized_enhanced, dim=2)
        normalized_denom = (torch.sum(
            normalized_clean * normalized_clean, dim=2))**.5 * (torch.sum(
                normalized_enhanced * normalized_enhanced, dim=2))**.5

        ncc = inner_product / normalized_denom
        mod_mse_loss = torch.mean((ncc - 1.0)**2)

        return mod_mse_loss


class GaborSTRFConv(nn.Module):
    """Gabor-STRF-based cross-correlation kernel."""

    def __init__(self,
                 supn,
                 supk,
                 nkern,
                 rates=None,
                 scales=None,
                 norm_strf=True,
                 real_only=False):
        """Instantiate a Gabor-based STRF convolution layer.

        Parameters
        ----------
        supn: int
            Time support in number of frames. Also the window length.
        supk: int
            Frequency support in number of channels. Also the window length.
        nkern: int
            Number of kernels, each with a learnable rate and scale.
        rates: list of float, None
            Initial values for temporal modulation.
        scales: list of float, None
            Initial values for spectral modulation.
        norm_strf: boolean
            Normalize STRF kernels to unit length.
        real_only: boolean
            If True, nkern REAL Gabor-STRF kernels.
            If False, nkern//2 REAL and nkern//2 IMAGINARY Gabor-STRF kernels.
        """
        super(GaborSTRFConv, self).__init__()
        self.numN = supn
        self.numK = supk
        self.numKern = nkern
        self.real_only = real_only
        self.norm_strf = norm_strf

        if not real_only:
            nkern = nkern // 2

        if supk % 2 == 0:  # force odd number
            supk += 1
        self.supk = torch.arange(supk, dtype=torch.float32)
        if supn % 2 == 0:  # force odd number
            supn += 1
        self.supn = torch.arange(supn, dtype=self.supk.dtype)
        self.padding = (supn // 2, supk // 2)
        # Set up learnable parameters
        # for param in (rates, scales):
        #     assert (not param) or len(param) == nkern
        if not rates:
            rates = torch.rand(nkern) * math.pi / 2.0
        if not scales:
            scales = (torch.rand(nkern) * 2.0 - 1.0) * math.pi / 2.0

        self.rates_ = nn.Parameter(torch.Tensor(rates))
        self.scales_ = nn.Parameter(torch.Tensor(scales))

    def strfs(self):
        """Make STRFs using the current parameters."""
        if self.supn.device != self.rates_.device:  # for first run
            self.supn = self.supn.to(self.rates_.device)
            self.supk = self.supk.to(self.rates_.device)
        n0, k0 = self.padding

        nwind = .5 - .5 * \
            torch.cos(2 * math.pi * (self.supn + 1) / (len(self.supn) + 1))
        kwind = .5 - .5 * \
            torch.cos(2 * math.pi * (self.supk + 1) / (len(self.supk) + 1))

        new_wind = torch.matmul(nwind.unsqueeze(-1), kwind.unsqueeze(0))

        n_n_0 = self.supn - n0
        k_k_0 = self.supk - k0
        n_mult = torch.matmul(
            n_n_0.unsqueeze(1),
            torch.ones((1, len(self.supk))).type(torch.FloatTensor).to(
                self.rates_.device))
        k_mult = torch.matmul(
            torch.ones((len(self.supn),
                        1)).type(torch.FloatTensor).to(self.rates_.device),
            k_k_0.unsqueeze(0))

        inside = self.rates_.unsqueeze(1).unsqueeze(
            1) * n_mult + self.scales_.unsqueeze(1).unsqueeze(1) * k_mult
        real_strf = torch.cos(inside) * new_wind.unsqueeze(0)

        if self.real_only:
            final_strf = real_strf
        else:
            imag_strf = torch.sin(inside) * new_wind.unsqueeze(0)
            final_strf = torch.cat([real_strf, imag_strf], dim=0)

        if self.norm_strf:
            final_strf = final_strf / (torch.sum(
                final_strf**2, dim=(1, 2)).unsqueeze(1).unsqueeze(2))**.5

        return final_strf

    def forward(self, sigspec):
        """Forward pass a batch of (real) spectra [Batch x Time x Frequency]."""
        if len(sigspec.shape) == 2:  # expand batch dimension if single example
            sigspec = sigspec.unsqueeze(0)
        strfs = self.strfs().unsqueeze(1).type_as(sigspec)
        out = F.conv2d(sigspec.unsqueeze(1), strfs, padding=self.padding)
        return out

    def __repr__(self):
        """Gabor filter."""
        report = """
+++++ Gabor Filter Kernels [{}], supn [{}], supk [{}], real only [{}], norm strf [{}] +++++

""".format(self.numKern, self.numN, self.numK, self.real_only,
           self.norm_strf)
        return report
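A minimal usage sketch of the modules above (the "example below" that the docstring refers to is not in the diff, so this is an illustration; the kernel count, supports, and 80-channel log-mel shape are assumed values, not ones taken from the paper):

import torch

kernels = GaborSTRFConv(supn=15, supk=5, nkern=32)   # 32 learnable Gabor-STRF kernels
criterion = ModulationDomainLossModule(kernels, norm=True)
clean = torch.randn(4, 100, 80)       # (B, T, F) log-mel spectrogram of clean speech
enhanced = torch.randn(4, 100, 80)    # (B, T, F) log-mel spectrogram of enhanced speech
loss = criterion(enhanced, clean)     # scalar tensor, differentiable w.r.t. the kernels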
483  modelscope/models/audio/network/se_net.py  Normal file
@@ -0,0 +1,483 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from ..layers.activations import RectifiedLinear, Sigmoid
from ..layers.affine_transform import AffineTransform
from ..layers.deep_fsmn import DeepFsmn
from ..layers.uni_deep_fsmn import Conv2d, UniDeepFsmn


class MaskNet(nn.Module):

    def __init__(self,
                 indim,
                 outdim,
                 layers=9,
                 hidden_dim=128,
                 hidden_dim2=None,
                 lorder=20,
                 rorder=0,
                 dilation=1,
                 layer_norm=False,
                 dropout=0,
                 crm=False,
                 vad=False,
                 linearout=False):
        super(MaskNet, self).__init__()

        self.linear1 = AffineTransform(indim, hidden_dim)
        self.relu = RectifiedLinear(hidden_dim, hidden_dim)
        if hidden_dim2 is None:
            hidden_dim2 = hidden_dim

        if rorder == 0:
            repeats = [
                UniDeepFsmn(
                    hidden_dim,
                    hidden_dim,
                    lorder,
                    hidden_dim2,
                    dilation=dilation,
                    layer_norm=layer_norm,
                    dropout=dropout) for i in range(layers)
            ]
        else:
            repeats = [
                DeepFsmn(
                    hidden_dim,
                    hidden_dim,
                    lorder,
                    rorder,
                    hidden_dim2,
                    layer_norm=layer_norm,
                    dropout=dropout) for i in range(layers)
            ]
        self.deepfsmn = nn.Sequential(*repeats)

        self.linear2 = AffineTransform(hidden_dim, outdim)

        self.crm = crm
        if self.crm:
            self.sig = nn.Tanh()
        else:
            self.sig = Sigmoid(outdim, outdim)

        self.vad = vad
        if self.vad:
            self.linear3 = AffineTransform(hidden_dim, 1)

        self.layers = layers
        self.linearout = linearout
        if self.linearout and self.vad:
            print('Warning: linearout and vad are not supported together')

    def forward(self, feat, ctl=None):
        x1 = self.linear1(feat)
        x2 = self.relu(x1)
        if ctl is not None:
            ctl = min(ctl, self.layers - 1)
            for i in range(ctl):
                x2 = self.deepfsmn[i](x2)
            mask = self.sig(self.linear2(x2))
            if self.vad:
                vad = torch.sigmoid(self.linear3(x2))
                return mask, vad
            else:
                return mask
        x3 = self.deepfsmn(x2)
        if self.linearout:
            return self.linear2(x3)
        mask = self.sig(self.linear2(x3))
        if self.vad:
            vad = torch.sigmoid(self.linear3(x3))
            return mask, vad
        else:
            return mask

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<Nnet>\n'
        re_str += self.linear1.to_kaldi_nnet()
        re_str += self.relu.to_kaldi_nnet()
        for dfsmn in self.deepfsmn:
            re_str += dfsmn.to_kaldi_nnet()
        re_str += self.linear2.to_kaldi_nnet()
        re_str += self.sig.to_kaldi_nnet()
        re_str += '</Nnet>\n'

        return re_str

    def to_raw_nnet(self, fid):
        self.linear1.to_raw_nnet(fid)
        for dfsmn in self.deepfsmn:
            dfsmn.to_raw_nnet(fid)
        self.linear2.to_raw_nnet(fid)


class StageNet(nn.Module):

    def __init__(self,
                 indim,
                 outdim,
                 layers=9,
                 layers2=6,
                 hidden_dim=128,
                 lorder=20,
                 rorder=0,
                 layer_norm=False,
                 dropout=0,
                 crm=False,
                 vad=False,
                 linearout=False):
        super(StageNet, self).__init__()

        self.stage1 = nn.ModuleList()
        self.stage2 = nn.ModuleList()
        layer = nn.Sequential(nn.Linear(indim, hidden_dim), nn.ReLU())
        self.stage1.append(layer)
        for i in range(layers):
            layer = UniDeepFsmn(
                hidden_dim,
                hidden_dim,
                lorder,
                hidden_dim,
                layer_norm=layer_norm,
                dropout=dropout)
            self.stage1.append(layer)
        layer = nn.Sequential(nn.Linear(hidden_dim, 321), nn.Sigmoid())
        self.stage1.append(layer)
        # stage2
        layer = nn.Sequential(nn.Linear(321 + indim, hidden_dim), nn.ReLU())
        self.stage2.append(layer)
        for i in range(layers2):
            layer = UniDeepFsmn(
                hidden_dim,
                hidden_dim,
                lorder,
                hidden_dim,
                layer_norm=layer_norm,
                dropout=dropout)
            self.stage2.append(layer)
        layer = nn.Sequential(
            nn.Linear(hidden_dim, outdim),
            nn.Sigmoid() if not crm else nn.Tanh())
        self.stage2.append(layer)
        self.crm = crm
        self.vad = vad
        self.linearout = linearout
        self.window = torch.hamming_window(640, periodic=False).cuda()
        self.freezed = False

    def freeze(self):
        if not self.freezed:
            for param in self.stage1.parameters():
                param.requires_grad = False
            self.freezed = True
            print('froze stage1')

    def forward(self, feat, mixture, ctl=None):
        if ctl == 'off':
            x = feat
            for i in range(len(self.stage1)):
                x = self.stage1[i](x)
            return x
        else:
            self.freeze()
            x = feat
            for i in range(len(self.stage1)):
                x = self.stage1[i](x)

            spec = torch.stft(
                mixture / 32768,
                640,
                320,
                640,
                self.window,
                center=False,
                return_complex=True)
            spec = torch.view_as_real(spec).permute([0, 2, 1, 3])
            specmag = torch.sqrt(spec[..., 0]**2 + spec[..., 1]**2)
            est = x * specmag
            y = torch.cat([est, feat], dim=-1)
            for i in range(len(self.stage2)):
                y = self.stage2[i](y)
            return y


class Unet(nn.Module):

    def __init__(self,
                 indim,
                 outdim,
                 layers=9,
                 dims=[256] * 4,
                 lorder=20,
                 rorder=0,
                 dilation=1,
                 layer_norm=False,
                 dropout=0,
                 crm=False,
                 vad=False,
                 linearout=False):
        super(Unet, self).__init__()

        self.linear1 = AffineTransform(indim, dims[0])
        self.relu = RectifiedLinear(dims[0], dims[0])

        self.encoder = nn.ModuleList()
        self.decoder = nn.ModuleList()
        for i in range(len(dims) - 1):
            layer = nn.Sequential(
                nn.Linear(dims[i], dims[i + 1]), nn.ReLU(),
                nn.Linear(dims[i + 1], dims[i + 1], bias=False),
                Conv2d(
                    dims[i + 1],
                    dims[i + 1],
                    lorder,
                    groups=dims[i + 1],
                    skip_connect=True))
            self.encoder.append(layer)
        for i in range(len(dims) - 1, 0, -1):
            layer = nn.Sequential(
                nn.Linear(dims[i] * 2, dims[i - 1]), nn.ReLU(),
                nn.Linear(dims[i - 1], dims[i - 1], bias=False),
                Conv2d(
                    dims[i - 1],
                    dims[i - 1],
                    lorder,
                    groups=dims[i - 1],
                    skip_connect=True))
            self.decoder.append(layer)
        self.tf = nn.ModuleList()
        for i in range(layers - 2 * (len(dims) - 1)):
            layer = nn.Sequential(
                nn.Linear(dims[-1], dims[-1]), nn.ReLU(),
                nn.Linear(dims[-1], dims[-1], bias=False),
                Conv2d(
                    dims[-1],
                    dims[-1],
                    lorder,
                    groups=dims[-1],
                    skip_connect=True))
            self.tf.append(layer)

        self.linear2 = AffineTransform(dims[0], outdim)
        self.crm = crm
        self.act = nn.Tanh() if self.crm else nn.Sigmoid()
        self.vad = False
        self.layers = layers
        self.linearout = linearout

    def forward(self, x, ctl=None):
        x = self.linear1(x)
        x = self.relu(x)

        encoder_out = []
        for i in range(len(self.encoder)):
            x = self.encoder[i](x)
            encoder_out.append(x)
        for i in range(len(self.tf)):
            x = self.tf[i](x)
        for i in range(len(self.decoder)):
            x = torch.cat([x, encoder_out[-1 - i]], dim=-1)
            x = self.decoder[i](x)

        x = self.linear2(x)
        if self.linearout:
            return x
        return self.act(x)


class BranchNet(nn.Module):

    def __init__(self,
                 indim,
                 outdim,
                 layers=9,
                 hidden_dim=256,
                 lorder=20,
                 rorder=0,
                 dilation=1,
                 layer_norm=False,
                 dropout=0,
                 crm=False,
                 vad=False,
                 linearout=False):
        super(BranchNet, self).__init__()

        self.linear1 = AffineTransform(indim, hidden_dim)
        self.relu = RectifiedLinear(hidden_dim, hidden_dim)

        self.convs = nn.ModuleList()
        self.deepfsmn = nn.ModuleList()
        self.FREQ = nn.ModuleList()
        self.TIME = nn.ModuleList()
        self.br1 = nn.ModuleList()
        self.br2 = nn.ModuleList()
        for i in range(layers):
            '''
            layer = nn.Sequential(
                nn.Linear(hidden_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, hidden_dim, bias=False),
                Conv2d(hidden_dim, hidden_dim, lorder,
                       groups=hidden_dim, skip_connect=True)
            )
            self.deepfsmn.append(layer)
            '''
            layer = nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.ReLU())
            self.FREQ.append(layer)
            '''
            layer = nn.GRU(hidden_dim, hidden_dim,
                           batch_first=True,
                           bidirectional=False)
            self.TIME.append(layer)

            layer = nn.Sequential(
                nn.Linear(hidden_dim, hidden_dim//2, bias=False),
                Conv2d(hidden_dim//2, hidden_dim//2, lorder,
                       groups=hidden_dim//2, skip_connect=True)
            )
            self.br1.append(layer)
            layer = nn.GRU(hidden_dim, hidden_dim//2,
                           batch_first=True,
                           bidirectional=False)
            self.br2.append(layer)
            '''

        self.linear2 = AffineTransform(hidden_dim, outdim)
        self.crm = crm
        self.act = nn.Tanh() if self.crm else nn.Sigmoid()
        self.vad = False
        self.layers = layers
        self.linearout = linearout

    def forward(self, x, ctl=None):
        return self.forward_branch(x)

    def forward_sepconv(self, x):
        x = torch.unsqueeze(x, 1)
        for i in range(len(self.convs)):
            x = self.convs[i](x)
            x = F.relu(x)
        B, C, H, W = x.shape
        x = x.permute(0, 2, 1, 3)
        x = torch.reshape(x, [B, H, C * W])
        x = self.linear1(x)
        x = self.relu(x)
        for i in range(self.layers):
            x = self.deepfsmn[i](x) + x
        x = self.linear2(x)
        return self.act(x)

    def forward_branch(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        for i in range(self.layers):
            z = self.FREQ[i](x)
            x = z + x
        x = self.linear2(x)
        if self.linearout:
            return x
        return self.act(x)


class TACNet(nn.Module):
    '''Transform-average-concatenate for ad hoc distributed arrays.
    '''

    def __init__(self,
                 indim,
                 outdim,
                 layers=9,
                 hidden_dim=128,
                 lorder=20,
                 rorder=0,
                 crm=False,
                 vad=False,
                 linearout=False):
        super(TACNet, self).__init__()

        self.linear1 = AffineTransform(indim, hidden_dim)
        self.relu = RectifiedLinear(hidden_dim, hidden_dim)

        if rorder == 0:
            repeats = [
                UniDeepFsmn(hidden_dim, hidden_dim, lorder, hidden_dim)
                for i in range(layers)
            ]
        else:
            repeats = [
                DeepFsmn(hidden_dim, hidden_dim, lorder, rorder, hidden_dim)
                for i in range(layers)
            ]
        self.deepfsmn = nn.Sequential(*repeats)

        self.ch_transform = nn.ModuleList([])
        self.ch_average = nn.ModuleList([])
        self.ch_concat = nn.ModuleList([])
        for i in range(layers):
            self.ch_transform.append(
                nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.PReLU()))
            self.ch_average.append(
                nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.PReLU()))
            self.ch_concat.append(
                nn.Sequential(
                    nn.Linear(hidden_dim * 2, hidden_dim), nn.PReLU()))

        self.linear2 = AffineTransform(hidden_dim, outdim)

        self.crm = crm
        if self.crm:
            self.sig = nn.Tanh()
        else:
            self.sig = Sigmoid(outdim, outdim)

        self.vad = vad
        if self.vad:
            self.linear3 = AffineTransform(hidden_dim, 1)

        self.layers = layers
        self.linearout = linearout
        if self.linearout and self.vad:
            print('Warning: linearout and vad are not supported together')

    def forward(self, feat, ctl=None):
        B, T, F = feat.shape
        # assume 4 channels
        ch = 4
        zlist = []
        for c in range(ch):
            z = self.linear1(feat[..., c * (F // 4):(c + 1) * (F // 4)])
            z = self.relu(z)
            zlist.append(z)
        for i in range(self.layers):
            # forward
            for c in range(ch):
                zlist[c] = self.deepfsmn[i](zlist[c])

            # transform
            olist = []
            for c in range(ch):
                z = self.ch_transform[i](zlist[c])
                olist.append(z)
            # average
            avg = 0
            for c in range(ch):
                avg = avg + olist[c]
            avg = avg / ch
            avg = self.ch_average[i](avg)
            # concatenate
            for c in range(ch):
                tac = torch.cat([olist[c], avg], dim=-1)
                tac = self.ch_concat[i](tac)
                zlist[c] = zlist[c] + tac

        for c in range(ch):
            zlist[c] = self.sig(self.linear2(zlist[c]))
        mask = torch.cat(zlist, dim=-1)
        return mask

    def to_kaldi_nnet(self):
        pass
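A shape-level sketch of MaskNet, the main mask estimator above (illustrative only; the 120-dim fbank input and 321-bin mask are assumed values, not taken from a shipped config):

net = MaskNet(indim=120, outdim=321, layers=9, hidden_dim=128, vad=True)
feat = torch.randn(2, 50, 120)   # (batch, frames, feature_dim)
mask, vad = net(feat)            # mask: (2, 50, 321) in (0, 1); vad: (2, 50, 1)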
@@ -2,14 +2,13 @@
 import os.path as osp
 from abc import ABC, abstractmethod
-from typing import Dict, List, Tuple, Union
+from typing import Dict, Union
 
 from maas_hub.file_download import model_file_download
 from maas_hub.snapshot_download import snapshot_download
 
 from modelscope.models.builder import build_model
 from modelscope.utils.config import Config
-from modelscope.utils.constant import CONFIGFILE
+from modelscope.utils.constant import ModelFile
 from modelscope.utils.hub import get_model_cache_dir
 
 Tensor = Union['torch.Tensor', 'tf.Tensor']
@@ -21,16 +20,24 @@ class Model(ABC):
         self.model_dir = model_dir
 
     def __call__(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
-        return self.post_process(self.forward(input))
+        return self.postprocess(self.forward(input))
 
     @abstractmethod
     def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
         pass
 
-    def post_process(self, input: Dict[str, Tensor],
-                     **kwargs) -> Dict[str, Tensor]:
-        # model specific postprocess, implementation is optional
-        # will be called in Pipeline and evaluation loop (in the future)
+    def postprocess(self, input: Dict[str, Tensor],
+                    **kwargs) -> Dict[str, Tensor]:
+        """Model-specific postprocess that converts model output to
+        standard model outputs.
+
+        Args:
+            input: input data
+
+        Returns:
+            dict of results: a dict containing the outputs of the model; each
+            output should use the standard output name.
+        """
         return input
 
     @classmethod
@@ -47,7 +54,8 @@ class Model(ABC):
         # raise ValueError(
         #     'Remote model repo {model_name_or_path} does not exists')
 
-        cfg = Config.from_file(osp.join(local_model_dir, CONFIGFILE))
+        cfg = Config.from_file(
+            osp.join(local_model_dir, ModelFile.CONFIGURATION))
         task_name = cfg.task
         model_cfg = cfg.model
         # TODO @wenmeng.zwm: maybe the model should be manually initialized after building
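The contract this hunk establishes — `__call__` chains `forward` into `postprocess` — can be seen with a toy subclass (illustrative only, not a model that ships with the repo):

class EchoModel(Model):

    def forward(self, input):
        return {'logits': input['x']}

    def postprocess(self, input, **kwargs):
        return {'predictions': input['logits']}

m = EchoModel('/tmp/echo')               # model_dir is unused by this toy model
assert m({'x': 1}) == {'predictions': 1}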
@@ -1,4 +1,6 @@
-from .sequence_classification_model import *  # noqa F403
+from .bert_for_sequence_classification import *  # noqa F403
+from .palm_for_text_generation import *  # noqa F403
+from .sbert_for_sentence_similarity import *  # noqa F403
+from .sbert_for_token_classification import *  # noqa F403
 from .space.dialog_intent_prediction_model import *  # noqa F403
 from .space.dialog_modeling_model import *  # noqa F403
-from .text_generation_model import *  # noqa F403
@@ -1,5 +1,7 @@
 import os
 from typing import Any, Dict
 
 import json
 import numpy as np
 
 from modelscope.utils.constant import Tasks
@@ -34,6 +36,11 @@ class BertForSequenceClassification(Model):
                      ('token_type_ids', torch.LongTensor)],
         output_keys=['predictions', 'probabilities', 'logits'])
 
+        self.label_path = os.path.join(self.model_dir, 'label_mapping.json')
+        with open(self.label_path) as f:
+            self.label_mapping = json.load(f)
+        self.id2label = {idx: name for name, idx in self.label_mapping.items()}
 
     def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
         """return the result by the model
@@ -50,3 +57,13 @@ class BertForSequenceClassification(Model):
         }
         """
         return self.model.predict(input)
+
+    def postprocess(self, inputs: Dict[str, np.ndarray],
+                    **kwargs) -> Dict[str, np.ndarray]:
+        # N x num_classes
+        probs = inputs['probabilities']
+        result = {
+            'probs': probs,
+        }
+
+        return result
43  modelscope/models/nlp/palm_for_text_generation.py  Normal file
@@ -0,0 +1,43 @@
from typing import Dict

from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['PalmForTextGeneration']


@MODELS.register_module(Tasks.text_generation, module_name=r'palm2.0')
class PalmForTextGeneration(Model):

    def __init__(self, model_dir: str, *args, **kwargs):
        """initialize the text generation model from the `model_dir` path.

        Args:
            model_dir (str): the model path.
            model_cls (Optional[Any], optional): model loader; if None, use the
                default loader to load model weights, by default None.
        """
        super().__init__(model_dir, *args, **kwargs)
        self.model_dir = model_dir

        from sofa.models.palm_v2 import PalmForConditionalGeneration, Translator
        model = PalmForConditionalGeneration.from_pretrained(model_dir)
        self.tokenizer = model.tokenizer
        self.generator = Translator(model)

    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
        """return the result by the model

        Args:
            input (Dict[str, Tensor]): the preprocessed data

        Returns:
            Dict[str, Tensor]: results
            Example:
                {
                    'predictions': Tensor([[1377, 4959, 2785, 6392...]]),  # tokens to be decoded by the tokenizer
                }
        """
        return self.generator(**input)
88  modelscope/models/nlp/sbert_for_sentence_similarity.py  Normal file
@@ -0,0 +1,88 @@
import os
from typing import Any, Dict

import json
import numpy as np
import torch
from sofa import SbertModel
from sofa.models.sbert.modeling_sbert import SbertPreTrainedModel
from torch import nn

from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['SbertForSentenceSimilarity']


class SbertTextClassifier(SbertPreTrainedModel):

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config
        self.encoder = SbertModel(config, add_pooling_layer=True)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, input_ids=None, token_type_ids=None):
        outputs = self.encoder(
            input_ids,
            token_type_ids=token_type_ids,
            return_dict=None,
        )
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits


@MODELS.register_module(
    Tasks.sentence_similarity,
    module_name=r'sbert-base-chinese-sentence-similarity')
class SbertForSentenceSimilarity(Model):

    def __init__(self, model_dir: str, *args, **kwargs):
        """initialize the sentence similarity model from the `model_dir` path.

        Args:
            model_dir (str): the model path.
            model_cls (Optional[Any], optional): model loader; if None, use the
                default loader to load model weights, by default None.
        """
        super().__init__(model_dir, *args, **kwargs)
        self.model_dir = model_dir

        self.model = SbertTextClassifier.from_pretrained(
            model_dir, num_labels=2)
        self.model.eval()
        self.label_path = os.path.join(self.model_dir, 'label_mapping.json')
        with open(self.label_path) as f:
            self.label_mapping = json.load(f)
        self.id2label = {idx: name for name, idx in self.label_mapping.items()}

    def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
        """return the result by the model

        Args:
            input (Dict[str, Any]): the preprocessed data

        Returns:
            Dict[str, np.ndarray]: results
            Example:
                {
                    'predictions': array([1]),  # label: 0-negative, 1-positive
                    'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32),
                    'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32)  # raw logits
                }
        """
        input_ids = torch.tensor(input['input_ids'], dtype=torch.long)
        token_type_ids = torch.tensor(
            input['token_type_ids'], dtype=torch.long)
        with torch.no_grad():
            logits = self.model(input_ids, token_type_ids)
        probs = logits.softmax(-1).numpy()
        pred = logits.argmax(-1).numpy()
        logits = logits.numpy()
        res = {'predictions': pred, 'probabilities': probs, 'logits': logits}
        return res
56  modelscope/models/nlp/sbert_for_token_classification.py  Normal file
@@ -0,0 +1,56 @@
from typing import Any, Dict, Union

import numpy as np
import torch
from sofa import SbertConfig, SbertForTokenClassification

from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['StructBertForTokenClassification']


@MODELS.register_module(
    Tasks.word_segmentation,
    module_name=r'structbert-chinese-word-segmentation')
class StructBertForTokenClassification(Model):

    def __init__(self, model_dir: str, *args, **kwargs):
        """initialize the word segmentation model from the `model_dir` path.

        Args:
            model_dir (str): the model path.
            model_cls (Optional[Any], optional): model loader; if None, use the
                default loader to load model weights, by default None.
        """
        super().__init__(model_dir, *args, **kwargs)
        self.model_dir = model_dir
        self.model = SbertForTokenClassification.from_pretrained(
            self.model_dir)
        self.config = SbertConfig.from_pretrained(self.model_dir)

    def forward(self, input: Dict[str,
                                  Any]) -> Dict[str, Union[str, np.ndarray]]:
        """return the result by the model

        Args:
            input (Dict[str, Any]): the preprocessed data

        Returns:
            Dict[str, Union[str, np.ndarray]]: results
            Example:
                {
                    'predictions': array([1, 4]),  # label ids per token
                    'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32),  # raw logits
                    'text': '今天',
                }
        """
        input_ids = torch.tensor(input['input_ids']).unsqueeze(0)
        output = self.model(input_ids)
        logits = output.logits
        pred = torch.argmax(logits[0], dim=-1)
        pred = pred.numpy()

        rst = {'predictions': pred, 'logits': logits, 'text': input['text']}
        return rst
@@ -1,52 +0,0 @@
|
||||
from typing import Any, Dict
|
||||
|
||||
from modelscope.utils.constant import Tasks
|
||||
from ..base import Model, Tensor
|
||||
from ..builder import MODELS
|
||||
|
||||
__all__ = ['PalmForTextGenerationModel']
|
||||
|
||||
|
||||
@MODELS.register_module(Tasks.text_generation, module_name=r'palm')
|
||||
class PalmForTextGenerationModel(Model):
|
||||
|
||||
def __init__(self, model_dir: str, *args, **kwargs):
|
||||
"""initialize the text generation model from the `model_dir` path.
|
||||
|
||||
Args:
|
||||
model_dir (str): the model path.
|
||||
model_cls (Optional[Any], optional): model loader, if None, use the
|
||||
default loader to load model weights, by default None.
|
||||
"""
|
||||
from sofa import PalmTokenizer
|
||||
|
||||
super().__init__(model_dir, *args, **kwargs)
|
||||
self.model_dir = model_dir
|
||||
|
||||
from sofa.models.palm import PalmForConditionalGeneration, TextGenerator
|
||||
tokenizer = kwargs.pop('tokenizer',
|
||||
PalmTokenizer.from_pretrained(model_dir))
|
||||
model = PalmForConditionalGeneration.from_pretrained(model_dir)
|
||||
self.generator = TextGenerator(model, tokenizer)
|
||||
|
||||
def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
|
||||
"""return the result by the model
|
||||
|
||||
Args:
|
||||
input (Dict[str, Any]): the preprocessed data
|
||||
|
||||
Returns:
|
||||
Dict[str, np.ndarray]: results
|
||||
Example:
|
||||
{
|
||||
'predictions': array([1]), # lable 0-negative 1-positive
|
||||
'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32),
|
||||
'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value
|
||||
}
|
||||
"""
|
||||
|
||||
encoder_inputs = [
|
||||
input['input_ids'], input['token_type_ids'],
|
||||
input['attention_mask']
|
||||
]
|
||||
return self.generator(encoder_inputs)
|
||||
@@ -1,4 +1,4 @@
-from .audio import *  # noqa F403
+from .audio import LinearAECPipeline
 from .base import Pipeline
 from .builder import pipeline
 from .cv import *  # noqa F403

@@ -0,0 +1 @@
+from .linear_aec_pipeline import LinearAECPipeline
160  modelscope/pipelines/audio/linear_aec_pipeline.py  Normal file
@@ -0,0 +1,160 @@
import importlib
import os
from typing import Any, Dict

import numpy as np
import scipy.io.wavfile as wav
import torch
import yaml

from modelscope.preprocessors.audio import LinearAECAndFbank
from modelscope.utils.constant import ModelFile, Tasks
from ..base import Pipeline
from ..builder import PIPELINES

FEATURE_MVN = 'feature.DEY.mvn.txt'

CONFIG_YAML = 'dey_mini.yaml'


def initialize_config(module_cfg):
    r"""According to the config items, dynamically load a specific module with params.

    1. Load the module corresponding to the "module" param.
    2. Call the function (or instantiate the class) corresponding to the "main" param.
    3. Send the params (in "args") into the function (or class) when calling (or instantiating).

    Args:
        module_cfg (dict): config items, e.g.:
            {
                "module": "models.model",
                "main": "Model",
                "args": {...}
            }

    Returns:
        the loaded module.
    """
    module = importlib.import_module(module_cfg['module'])
    return getattr(module, module_cfg['main'])(**module_cfg['args'])


@PIPELINES.register_module(
    Tasks.speech_signal_process, module_name=r'speech_dfsmn_aec_psm_16k')
class LinearAECPipeline(Pipeline):
    r"""AEC inference pipeline; only a 16000 Hz sample rate is supported.

    When invoking the class via pipeline.__call__(), provide two params:
        Dict[str, Any]
            the paths of the wav files, e.g.: {
                "nearend_mic": "/your/data/near_end_mic_audio.wav",
                "farend_speech": "/your/data/far_end_speech_audio.wav"}
        output_path (str, optional): "/your/output/audio_after_aec.wav"
            the file path to write the generated audio to.
    """

    def __init__(self, model):
        r"""
        Args:
            model: model id on the ModelScope hub.
        """
        super().__init__(model=model)
        self.use_cuda = torch.cuda.is_available()
        with open(
                os.path.join(self.model, CONFIG_YAML), encoding='utf-8') as f:
            self.config = yaml.full_load(f.read())
        self.config['io']['mvn'] = os.path.join(self.model, FEATURE_MVN)
        self._init_model()
        self.preprocessor = LinearAECAndFbank(self.config['io'])

        n_fft = self.config['loss']['args']['n_fft']
        hop_length = self.config['loss']['args']['hop_length']
        winlen = n_fft
        window = torch.hamming_window(winlen, periodic=False)

        def stft(x):
            return torch.stft(
                x,
                n_fft,
                hop_length,
                winlen,
                center=False,
                window=window.to(x.device),
                return_complex=False)

        def istft(x, slen):
            return torch.istft(
                x,
                n_fft,
                hop_length,
                winlen,
                window=window.to(x.device),
                center=False,
                length=slen)

        self.stft = stft
        self.istft = istft

    def _init_model(self):
        checkpoint = torch.load(
            os.path.join(self.model, ModelFile.TORCH_MODEL_BIN_FILE),
            map_location='cpu')
        self.model = initialize_config(self.config['nnet'])
        if self.use_cuda:
            self.model = self.model.cuda()
        self.model.load_state_dict(checkpoint)

    def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        r"""The AEC process.

        Args:
            inputs: dict={'feature': Tensor, 'base': Tensor}
                'feature': the features of the input audio.
                'base': the base audio that the mask is applied to.

        Returns:
            dict:
                {
                    'output_pcm': generated audio array
                }
        """
        output_data = self._process(inputs['feature'], inputs['base'])
        return {'output_pcm': output_data}

    def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
        r"""The post process. Saves the audio to file if output_path is given.

        Args:
            inputs: dict:
                {
                    'output_pcm': generated audio array
                }
            kwargs: accepts 'output_path', the path to write the generated audio to

        Returns:
            dict:
                {
                    'output_pcm': generated audio array
                }
        """
        if 'output_path' in kwargs.keys():
            wav.write(kwargs['output_path'], self.preprocessor.SAMPLE_RATE,
                      inputs['output_pcm'].astype(np.int16))
        inputs['output_pcm'] = inputs['output_pcm'] / 32768.0
        return inputs

    def _process(self, fbanks, mixture):
        if self.use_cuda:
            fbanks = fbanks.cuda()
            mixture = mixture.cuda()
        if self.model.vad:
            with torch.no_grad():
                masks, vad = self.model(fbanks.unsqueeze(0))
                masks = masks.permute([2, 1, 0])
        else:
            with torch.no_grad():
                masks = self.model(fbanks.unsqueeze(0))
                masks = masks.permute([2, 1, 0])
        spectrum = self.stft(mixture)
        masked_spec = spectrum * masks
        masked_sig = self.istft(masked_spec, len(mixture)).cpu().numpy()
        return masked_sig
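A hypothetical end-to-end invocation of the pipeline above; the model id is a placeholder, and `output_path` is honored because `postprocess` writes the wav when it is given:

aec = LinearAECPipeline(model='damo/speech_dfsmn_aec_psm_16k')   # placeholder model id
result = aec(
    {'nearend_mic': '/your/data/near_end_mic_audio.wav',
     'farend_speech': '/your/data/far_end_speech_audio.wav'},
    output_path='/your/output/audio_after_aec.wav')
print(result['output_pcm'])   # float32 samples scaled to [-1, 1]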
@@ -12,10 +12,11 @@ from modelscope.pydatasets import PyDataset
 from modelscope.utils.config import Config
 from modelscope.utils.hub import get_model_cache_dir
 from modelscope.utils.logger import get_logger
+from .outputs import TASK_OUTPUTS
 from .util import is_model_name
 
 Tensor = Union['torch.Tensor', 'tf.Tensor']
-Input = Union[str, PyDataset, Dict, 'PIL.Image.Image', 'numpy.ndarray']
+Input = Union[str, tuple, dict, PyDataset, 'PIL.Image.Image', 'numpy.ndarray']
 InputModel = Union[str, Model]
 
 output_keys = [
@@ -106,8 +107,25 @@ class Pipeline(ABC):
         out = self.preprocess(input, **post_kwargs)
         out = self.forward(out)
         out = self.postprocess(out, **post_kwargs)
+        self._check_output(out)
         return out
 
+    def _check_output(self, input):
+        # this attribute is dynamically attached by the registry
+        # when cls is registered in the registry using the task name
+        task_name = self.group_key
+        if task_name not in TASK_OUTPUTS:
+            logger.warning(f'task {task_name} output keys are missing')
+            return
+        output_keys = TASK_OUTPUTS[task_name]
+        missing_keys = []
+        for k in output_keys:
+            if k not in input:
+                missing_keys.append(k)
+        if len(missing_keys) > 0:
+            raise ValueError(f'expected output keys are {output_keys}, '
+                             f'those {missing_keys} are missing')
+
     def preprocess(self, inputs: Input) -> Dict[str, Any]:
         """ Provide a default implementation based on preprocess_cfg; users can reimplement it
         """
@@ -125,4 +143,14 @@
 
     @abstractmethod
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        """ If the current pipeline supports model reuse, common postprocess
+        code should be written here.
+
+        Args:
+            inputs: input data
+
+        Return:
+            dict of results: a dict containing the outputs of the model; each
+            output should use the standard output name.
+        """
        raise NotImplementedError('postprocess')
@@ -3,24 +3,27 @@
 import os.path as osp
 from typing import List, Union
 
 import json
 from maas_hub.file_download import model_file_download
 
 from modelscope.models.base import Model
 from modelscope.utils.config import Config, ConfigDict
-from modelscope.utils.constant import CONFIGFILE, Tasks
+from modelscope.utils.constant import Tasks
 from modelscope.utils.registry import Registry, build_from_cfg
 from .base import Pipeline
 from .util import is_model_name
 
 PIPELINES = Registry('pipelines')
 
 DEFAULT_MODEL_FOR_PIPELINE = {
     # TaskName: (pipeline_module_name, model_repo)
-    Tasks.image_matting: ('image-matting', 'damo/image-matting-person'),
     Tasks.word_segmentation:
     ('structbert-chinese-word-segmentation',
      'damo/nlp_structbert_word-segmentation_chinese-base'),
+    Tasks.sentence_similarity:
+    ('sbert-base-chinese-sentence-similarity',
+     'damo/nlp_structbert_sentence-similarity_chinese-base'),
+    Tasks.image_matting: ('image-matting', 'damo/cv_unet_image-matting'),
     Tasks.text_classification:
     ('bert-sentiment-analysis', 'damo/bert-base-sst2'),
-    Tasks.text_generation: ('palm', 'damo/nlp_palm_text-generation_chinese'),
+    Tasks.text_generation: ('palm2.0',
+                            'damo/nlp_palm2.0_text-generation_chinese-base'),
     Tasks.image_captioning: ('ofa', None),
     Tasks.image_generation:
     ('person-image-cartoon',
@@ -1,5 +1,5 @@
 import os.path as osp
-from typing import Any, Dict, List, Tuple, Union
+from typing import Any, Dict
 
 import cv2
 import numpy as np
@@ -7,7 +7,7 @@ import PIL
 
 from modelscope.pipelines.base import Input
 from modelscope.preprocessors import load_image
-from modelscope.utils.constant import TF_GRAPH_FILE, Tasks
+from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.logger import get_logger
 from ..base import Pipeline
 from ..builder import PIPELINES
@@ -24,7 +24,7 @@ class ImageMattingPipeline(Pipeline):
         import tensorflow as tf
         if tf.__version__ >= '2.0':
             tf = tf.compat.v1
-        model_path = osp.join(self.model, TF_GRAPH_FILE)
+        model_path = osp.join(self.model, ModelFile.TF_GRAPH_FILE)
 
         config = tf.ConfigProto(allow_soft_placement=True)
         config.gpu_options.allow_growth = True
@@ -1 +1 @@
-from .image_captioning import ImageCaptionPipeline
+from .image_caption_pipeline import ImageCaptionPipeline

@@ -84,8 +84,11 @@ class ImageCaptionPipeline(Pipeline):
             s = torch.cat([s, self.eos_item])
             return s
 
-        patch_image = self.patch_resize_transform(
-            load_image(input)).unsqueeze(0)
+        if isinstance(input, Image.Image):
+            patch_image = self.patch_resize_transform(input).unsqueeze(0)
+        else:
+            patch_image = self.patch_resize_transform(
+                load_image(input)).unsqueeze(0)
         patch_mask = torch.tensor([True])
         text = 'what does the image describe?'
         src_text = encode_text(
@@ -1,4 +1,6 @@
+from .sentence_similarity_pipeline import *  # noqa F403
 from .sequence_classification_pipeline import *  # noqa F403
 from .space.dialog_intent_prediction_pipeline import *  # noqa F403
 from .space.dialog_modeling_pipeline import *  # noqa F403
 from .text_generation_pipeline import *  # noqa F403
+from .word_segmentation_pipeline import *  # noqa F403
62  modelscope/pipelines/nlp/sentence_similarity_pipeline.py  Normal file
@@ -0,0 +1,62 @@
from typing import Any, Dict, Union

import numpy as np

from modelscope.models.nlp import SbertForSentenceSimilarity
from modelscope.preprocessors import SequenceClassificationPreprocessor
from modelscope.utils.constant import Tasks
from ...models import Model
from ..base import Input, Pipeline
from ..builder import PIPELINES

__all__ = ['SentenceSimilarityPipeline']


@PIPELINES.register_module(
    Tasks.sentence_similarity,
    module_name=r'sbert-base-chinese-sentence-similarity')
class SentenceSimilarityPipeline(Pipeline):

    def __init__(self,
                 model: Union[SbertForSentenceSimilarity, str],
                 preprocessor: SequenceClassificationPreprocessor = None,
                 **kwargs):
        """use `model` and `preprocessor` to create an NLP sentence similarity pipeline for prediction

        Args:
            model (SbertForSentenceSimilarity): a model instance
            preprocessor (SequenceClassificationPreprocessor): a preprocessor instance
        """
        assert isinstance(model, str) or isinstance(model, SbertForSentenceSimilarity), \
            'model must be a single str or SbertForSentenceSimilarity'
        sc_model = model if isinstance(
            model,
            SbertForSentenceSimilarity) else Model.from_pretrained(model)
        if preprocessor is None:
            preprocessor = SequenceClassificationPreprocessor(
                sc_model.model_dir,
                first_sequence='first_sequence',
                second_sequence='second_sequence')
        super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)

        assert hasattr(self.model, 'id2label'), \
            'id2label map should be initialized in the init function.'

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
        """process the prediction results

        Args:
            inputs (Dict[str, Any]): the model prediction outputs

        Returns:
            Dict[str, str]: the prediction results
        """
        probs = inputs['probabilities'][0]
        num_classes = probs.shape[0]
        top_indices = np.argpartition(probs, -num_classes)[-num_classes:]
        cls_ids = top_indices[np.argsort(-probs[top_indices], axis=-1)]
        probs = probs[cls_ids].tolist()
        cls_names = [self.model.id2label[cid] for cid in cls_ids]
        b = 0
        return {'scores': probs[b], 'labels': cls_names[b]}
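A hypothetical invocation sketch; the model id mirrors the default registered in builder.py, and the two-sentence tuple input is an assumption based on the widened `Input` union:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

p = pipeline(
    Tasks.sentence_similarity,
    model='damo/nlp_structbert_sentence-similarity_chinese-base')
print(p(('今天天气不错', '今天天气很好')))   # -> {'scores': ..., 'labels': ...}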
@@ -1,8 +1,5 @@
 import os
-import uuid
 from typing import Any, Dict, Union
 
-import json
 import numpy as np
 
 from modelscope.models.nlp import BertForSequenceClassification
@@ -41,50 +38,29 @@ class SequenceClassificationPipeline(Pipeline):
                 second_sequence=None)
         super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)
 
-        from easynlp.utils import io
-        self.label_path = os.path.join(sc_model.model_dir,
-                                       'label_mapping.json')
-        with io.open(self.label_path) as f:
-            self.label_mapping = json.load(f)
-        self.label_id_to_name = {
-            idx: name
-            for name, idx in self.label_mapping.items()
-        }
+        assert hasattr(self.model, 'id2label'), \
+            'id2label map should be initialized in the init function.'
 
-    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
+    def postprocess(self,
+                    inputs: Dict[str, Any],
+                    topk: int = 5) -> Dict[str, str]:
         """process the prediction results
 
         Args:
-            inputs (Dict[str, Any]): _description_
+            inputs (Dict[str, Any]): input data dict
+            topk (int): return the topk classification results.
 
         Returns:
             Dict[str, str]: the prediction results
         """
+        # NxC np.ndarray
+        probs = inputs['probs'][0]
+        num_classes = probs.shape[0]
+        topk = min(topk, num_classes)
+        top_indices = np.argpartition(probs, -topk)[-topk:]
+        cls_ids = top_indices[np.argsort(probs[top_indices])]
+        probs = probs[cls_ids].tolist()
 
-        probs = inputs['probabilities']
-        logits = inputs['logits']
-        predictions = np.argsort(-probs, axis=-1)
-        preds = predictions[0]
-        b = 0
-        new_result = list()
-        for pred in preds:
-            new_result.append({
-                'pred': self.label_id_to_name[pred],
-                'prob': float(probs[b][pred]),
-                'logit': float(logits[b][pred])
-            })
-        new_results = list()
-        new_results.append({
-            'id':
-            inputs['id'][b] if 'id' in inputs else str(uuid.uuid4()),
-            'output':
-            new_result,
-            'predictions':
-            new_result[0]['pred'],
-            'probabilities':
-            ','.join([str(t) for t in inputs['probabilities'][b]]),
-            'logits':
-            ','.join([str(t) for t in inputs['logits'][b]])
-        })
+        cls_names = [self.model.id2label[cid] for cid in cls_ids]
 
-        return new_results[0]
+        return {'scores': probs, 'labels': cls_names}
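The topk selection introduced above, traced on a toy vector (illustrative only):

import numpy as np

probs = np.array([0.05, 0.6, 0.1, 0.25])
topk = 2
top_indices = np.argpartition(probs, -topk)[-topk:]    # indices of the 2 largest, unordered
cls_ids = top_indices[np.argsort(probs[top_indices])]  # ascending by probability: [3, 1]
print(probs[cls_ids].tolist())                         # [0.25, 0.6]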
@@ -1,7 +1,7 @@
 from typing import Dict, Optional, Union
 
 from modelscope.models import Model
-from modelscope.models.nlp import PalmForTextGenerationModel
+from modelscope.models.nlp import PalmForTextGeneration
 from modelscope.preprocessors import TextGenerationPreprocessor
 from modelscope.utils.constant import Tasks
 from ..base import Pipeline, Tensor
@@ -10,11 +10,11 @@ from ..builder import PIPELINES
 __all__ = ['TextGenerationPipeline']
 
 
-@PIPELINES.register_module(Tasks.text_generation, module_name=r'palm')
+@PIPELINES.register_module(Tasks.text_generation, module_name=r'palm2.0')
 class TextGenerationPipeline(Pipeline):
 
     def __init__(self,
-                 model: Union[PalmForTextGenerationModel, str],
+                 model: Union[PalmForTextGeneration, str],
                  preprocessor: Optional[TextGenerationPreprocessor] = None,
                  **kwargs):
         """use `model` and `preprocessor` to create an NLP text generation pipeline for prediction
@@ -23,16 +23,16 @@ class TextGenerationPipeline(Pipeline):
             model (PalmForTextGeneration): a model instance
             preprocessor (TextGenerationPreprocessor): a preprocessor instance
         """
-        sc_model = model if isinstance(
-            model,
-            PalmForTextGenerationModel) else Model.from_pretrained(model)
+        model = model if isinstance(
+            model, PalmForTextGeneration) else Model.from_pretrained(model)
         if preprocessor is None:
             preprocessor = TextGenerationPreprocessor(
-                sc_model.model_dir,
+                model.model_dir,
+                model.tokenizer,
                 first_sequence='sentence',
                 second_sequence=None)
-        super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)
-        self.tokenizer = preprocessor.tokenizer
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+        self.tokenizer = model.tokenizer
 
     def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, str]:
         """process the prediction results
@@ -43,17 +43,20 @@ class TextGenerationPipeline(Pipeline):
         Returns:
             Dict[str, str]: the prediction results
         """
+        replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''),
+                               ('[unused1]', ''), (r' +', ' '), ('[SEP]', ''),
+                               ('[unused2]', ''), ('[CLS]', ''), ('[UNK]', ''))
+        replace_tokens_roberta = ((r' +', ' '), ('<mask>', '<q>'),
+                                  ('<pad>', ''), ('<s>', ''), ('</s>', ''),
+                                  ('<unk>', ' '))
+
         vocab_size = len(self.tokenizer.vocab)
         pred_list = inputs['predictions']
         pred_ids = pred_list[0][0].cpu().numpy().tolist()
         for j in range(len(pred_ids)):
             if pred_ids[j] >= vocab_size:
                 pred_ids[j] = 100
-        pred = self.tokenizer.convert_ids_to_tokens(pred_ids)
-        pred_string = ''.join(pred).replace(
-            '##',
-            '').split('[SEP]')[0].replace('[CLS]',
-                                          '').replace('[SEP]',
-                                                      '').replace('[UNK]', '')
-        return {'pred_string': pred_string}
+        pred_string = self.tokenizer.decode(pred_ids)
+        for _old, _new in replace_tokens_bert:
+            pred_string = pred_string.replace(_old, _new)
+        pred_string = pred_string.strip()
+        for _old, _new in replace_tokens_roberta:
+            pred_string = pred_string.replace(_old, _new)
+        pred_string = pred_string.strip()
+        return {'text': pred_string}
69  modelscope/pipelines/nlp/word_segmentation_pipeline.py  Normal file
@@ -0,0 +1,69 @@
from typing import Any, Dict, Optional, Union

from modelscope.models import Model
from modelscope.models.nlp import StructBertForTokenClassification
from modelscope.preprocessors import TokenClassifcationPreprocessor
from modelscope.utils.constant import Tasks
from ..base import Pipeline, Tensor
from ..builder import PIPELINES

__all__ = ['WordSegmentationPipeline']


@PIPELINES.register_module(
    Tasks.word_segmentation,
    module_name=r'structbert-chinese-word-segmentation')
class WordSegmentationPipeline(Pipeline):

    def __init__(self,
                 model: Union[StructBertForTokenClassification, str],
                 preprocessor: Optional[TokenClassifcationPreprocessor] = None,
                 **kwargs):
        """use `model` and `preprocessor` to create an NLP word segmentation pipeline for prediction

        Args:
            model (StructBertForTokenClassification): a model instance
            preprocessor (TokenClassifcationPreprocessor): a preprocessor instance
        """
        model = model if isinstance(
            model,
            StructBertForTokenClassification) else Model.from_pretrained(model)
        if preprocessor is None:
            preprocessor = TokenClassifcationPreprocessor(model.model_dir)
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
        self.tokenizer = preprocessor.tokenizer
        self.config = model.config
        self.id2label = self.config.id2label

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
        """process the prediction results

        Args:
            inputs (Dict[str, Any]): the model prediction outputs

        Returns:
            Dict[str, str]: the prediction results
        """
        pred_list = inputs['predictions']
        labels = []
        for pre in pred_list:
            labels.append(self.id2label[pre])
        labels = labels[1:-1]
        chunks = []
        chunk = ''
        assert len(inputs['text']) == len(labels)
        for token, label in zip(inputs['text'], labels):
            if label[0] == 'B' or label[0] == 'I':
                chunk += token
            else:
                chunk += token
                chunks.append(chunk)
                chunk = ''
        if chunk:
            chunks.append(chunk)
        seg_result = ' '.join(chunks)
        rst = {
            'output': seg_result,
        }
        return rst
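The label-to-chunk loop above, traced on a toy sentence (the BIES-style tags here are assumed; only B and I extend the current word, any other tag closes it):

text = '今天天气不错'
labels = ['B', 'E', 'B', 'E', 'B', 'E']   # hypothetical per-character tags
chunks, chunk = [], ''
for token, label in zip(text, labels):
    chunk += token
    if label[0] not in ('B', 'I'):        # 'E'/'S' close the current word
        chunks.append(chunk)
        chunk = ''
if chunk:
    chunks.append(chunk)
print(' '.join(chunks))                   # -> 今天 天气 不错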
117  modelscope/pipelines/outputs.py  Normal file
@@ -0,0 +1,117 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
TASK_OUTPUTS = {
|
||||
|
||||
# ============ vision tasks ===================
|
||||
|
||||
# image classification result for single sample
|
||||
# {
|
||||
# "labels": ["dog", "horse", "cow", "cat"],
|
||||
# "scores": [0.9, 0.1, 0.05, 0.05]
|
||||
# }
|
||||
Tasks.image_classification: ['scores', 'labels'],
|
||||
Tasks.image_tagging: ['scores', 'labels'],
|
||||
|
||||
# object detection result for single sample
|
||||
# {
|
||||
# "boxes": [
|
||||
# [x1, y1, x2, y2],
|
||||
# [x1, y1, x2, y2],
|
||||
# [x1, y1, x2, y2],
|
||||
# ],
|
||||
# "labels": ["dog", "horse", "cow", "cat"],
|
||||
# "scores": [0.9, 0.1, 0.05, 0.05]
|
||||
# }
|
||||
Tasks.object_detection: ['scores', 'labels', 'boxes'],
|
||||
|
||||
# instance segmentation result for single sample
|
||||
# {
|
||||
# "masks": [
|
||||
# np.array in bgr channel order
|
||||
# ],
|
||||
# "labels": ["dog", "horse", "cow", "cat"],
|
||||
# "scores": [0.9, 0.1, 0.05, 0.05]
|
||||
# }
|
||||
Tasks.image_segmentation: ['scores', 'labels', 'boxes'],
|
||||
|
||||
    # image generation/editing/matting result for single sample
    # {
    #   "output_png": np.array with shape (h, w, 4)
    #                 for matting, or (h, w, 3) for general purpose
    # }
    Tasks.image_editing: ['output_png'],
    Tasks.image_matting: ['output_png'],
    Tasks.image_generation: ['output_png'],

    # pose estimation result for single sample
    # {
    #   "poses": np.array with shape [num_pose, num_keypoint, 3],
    #            each keypoint is an array [x, y, score]
    #   "boxes": np.array with shape [num_pose, 4], each box is
    #            [x1, y1, x2, y2]
    # }
    Tasks.pose_estimation: ['poses', 'boxes'],

    # ============ nlp tasks ===================

    # text classification result for single sample
    # {
    #   "labels": ["happy", "sad", "calm", "angry"],
    #   "scores": [0.9, 0.1, 0.05, 0.05]
    # }
    Tasks.text_classification: ['scores', 'labels'],

    # text generation result for single sample
    # {
    #   "text": "this is text generated by a model."
    # }
    Tasks.text_generation: ['text'],

    # word segmentation result for single sample
    # {
    #   "output": "今天 天气 不错 , 适合 出去 游玩"
    # }
    Tasks.word_segmentation: ['output'],

    # sentence similarity result for single sample
    # {
    #   "labels": "1",
    #   "scores": 0.9
    # }
    Tasks.sentence_similarity: ['scores', 'labels'],

    # ============ audio tasks ===================

    # audio processed for single file in PCM format
    # {
    #   "output_pcm": np.array with shape (samples,) and dtype float32
    # }
    Tasks.speech_signal_process: ['output_pcm'],

    # ============ multi-modal tasks ===================

    # image caption result for single sample
    # {
    #   "caption": "this is an image caption text."
    # }
    Tasks.image_captioning: ['caption'],

    # visual grounding result for single sample
    # {
    #   "boxes": [
    #     [x1, y1, x2, y2],
    #     [x1, y1, x2, y2],
    #     [x1, y1, x2, y2],
    #   ],
    #   "scores": [0.9, 0.1, 0.05, 0.05]
    # }
    Tasks.visual_grounding: ['boxes', 'scores'],

    # text_to_image result for a single sample
    # {
    #   "image": np.ndarray with shape [height, width, 3]
    # }
    Tasks.text_to_image_synthesis: ['image']
}
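TASK_OUTPUTS is a plain mapping from task name to the keys a pipeline result must contain, so output conformance can be checked in a few lines. A minimal sketch (the result dict is a hypothetical pipeline output, not real model output):

from modelscope.pipelines.outputs import TASK_OUTPUTS
from modelscope.utils.constant import Tasks

result = {'output': '今天 天气 不错'}  # hypothetical word-segmentation result
missing = [k for k in TASK_OUTPUTS[Tasks.word_segmentation] if k not in result]
assert not missing, f'pipeline output is missing keys: {missing}'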
@@ -1,12 +1,23 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import os.path as osp
from typing import List, Union

import json
from maas_hub.file_download import model_file_download

from modelscope.utils.constant import CONFIGFILE
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile
from modelscope.utils.logger import get_logger

logger = get_logger()


def is_config_has_model(cfg_file):
    try:
        cfg = Config.from_file(cfg_file)
        return hasattr(cfg, 'model')
    except Exception as e:
        logger.error(f'parse config file {cfg_file} failed: {e}')
        return False


def is_model_name(model: Union[str, List]):
@@ -15,24 +26,17 @@ def is_model_name(model: Union[str, List]):

    def is_model_name_impl(model):
        if osp.exists(model):
            if osp.exists(osp.join(model, CONFIGFILE)):
                return True
            cfg_file = osp.join(model, ModelFile.CONFIGURATION)
            if osp.exists(cfg_file):
                return is_config_has_model(cfg_file)
            else:
                return False
        else:
            # try:
            #     cfg_file = model_file_download(model, CONFIGFILE)
            # except Exception:
            #     cfg_file = None
            # TODO @wenmeng.zwm use exception instead of
            # following tricky logic
            cfg_file = model_file_download(model, CONFIGFILE)
            with open(cfg_file, 'r') as infile:
                cfg = json.load(infile)
            if 'Code' in cfg:
                try:
                    cfg_file = model_file_download(model, ModelFile.CONFIGURATION)
                    return is_config_has_model(cfg_file)
                except Exception:
                    return False
            else:
                return True

    if isinstance(model, str):
        return is_model_name_impl(model)
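A short usage sketch of the helper above (both candidates are hypothetical): a local path qualifies when it contains a configuration.json that parses and declares a model section; a hub id qualifies when that file can be downloaded and parsed.

for candidate in ('/path/to/local/model_dir', 'damo/some-model-id'):
    if is_model_name(candidate):
        print(f'{candidate}: usable as a model name or model directory')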
@@ -1,10 +1,10 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from .audio import LinearAECAndFbank
from .base import Preprocessor
from .builder import PREPROCESSORS, build_preprocessor
from .common import Compose
from .image import LoadImage, load_image
from .nlp import *  # noqa F403
from .nlp import TextGenerationPreprocessor
from .space.dialog_intent_prediction_preprocessor import *  # noqa F403
from .space.dialog_modeling_preprocessor import *  # noqa F403
230
modelscope/preprocessors/audio.py
Normal file
@@ -0,0 +1,230 @@
import ctypes
import os
from typing import Any, Dict

import numpy as np
import scipy.io.wavfile as wav
import torch
import torchaudio.compliance.kaldi as kaldi
from numpy.ctypeslib import ndpointer

from modelscope.utils.constant import Fields
from .builder import PREPROCESSORS


def load_wav(path):
    samp_rate, data = wav.read(path)
    return np.float32(data), samp_rate


def load_library(libaec):
    libaec_in_cwd = os.path.join('.', libaec)
    if os.path.exists(libaec_in_cwd):
        libaec = libaec_in_cwd
    mitaec = ctypes.cdll.LoadLibrary(libaec)
    fe_process = mitaec.fe_process_inst
    fe_process.argtypes = [
        ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
        ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'), ctypes.c_int,
        ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
        ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
        ndpointer(ctypes.c_float, flags='C_CONTIGUOUS')
    ]
    return fe_process


def do_linear_aec(fe_process, mic, ref, int16range=True):
    mic = np.float32(mic)
    ref = np.float32(ref)
    if len(mic) > len(ref):
        mic = mic[:len(ref)]
    out_mic = np.zeros_like(mic)
    out_linear = np.zeros_like(mic)
    out_echo = np.zeros_like(mic)
    out_ref = np.zeros_like(mic)
    if int16range:
        mic /= 32768
        ref /= 32768
    fe_process(mic, ref, len(mic), out_mic, out_linear, out_echo)
    # out_ref not in use here
    if int16range:
        out_mic *= 32768
        out_linear *= 32768
        out_echo *= 32768
    return out_mic, out_ref, out_linear, out_echo
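# A minimal, hypothetical usage sketch of the three helpers above; the library
# name and wav paths are assumptions rather than shipped defaults:
#
#     fe_process = load_library('libmitaec_pyio.so')
#     mic, fs = load_wav('nearend_mic.wav')
#     ref, _ = load_wav('farend_speech.wav')
#     out_mic, _, out_linear, out_echo = do_linear_aec(fe_process, mic, ref)
#     # out_linear holds the linearly filtered (echo-reduced) mic signal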
def load_kaldi_feature_transform(filename):
    fp = open(filename, 'r')
    all_str = fp.read()
    pos1 = all_str.find('AddShift')
    pos2 = all_str.find('[', pos1)
    pos3 = all_str.find(']', pos2)
    mean = np.fromstring(all_str[pos2 + 1:pos3], dtype=np.float32, sep=' ')
    pos1 = all_str.find('Rescale')
    pos2 = all_str.find('[', pos1)
    pos3 = all_str.find(']', pos2)
    scale = np.fromstring(all_str[pos2 + 1:pos3], dtype=np.float32, sep=' ')
    fp.close()
    return mean, scale


class Feature:
    r"""Extract features from one utterance.
    """

    def __init__(self,
                 fbank_config,
                 feat_type='spec',
                 mvn_file=None,
                 cuda=False):
        r"""

        Args:
            fbank_config (dict): kwargs for kaldi.fbank; frame_length and
                frame_shift (in ms) together with sample_frequency are also
                used to derive the STFT parameters
            feat_type (str):
                raw: do nothing
                fbank: use kaldi.fbank
                spec: Real/Imag
                logpow: log(1+|x|^2)
            mvn_file (str): the path of the data file for mean variance normalization
            cuda (bool): move the window and mvn tensors to GPU
        """
        self.fbank_config = fbank_config
        self.feat_type = feat_type
        self.n_fft = fbank_config['frame_length'] * fbank_config[
            'sample_frequency'] // 1000
        self.hop_length = fbank_config['frame_shift'] * fbank_config[
            'sample_frequency'] // 1000
        self.window = torch.hamming_window(self.n_fft, periodic=False)

        self.mvn = False
        if mvn_file is not None and os.path.exists(mvn_file):
            print(f'loading mvn file: {mvn_file}')
            shift, scale = load_kaldi_feature_transform(mvn_file)
            self.shift = torch.from_numpy(shift)
            self.scale = torch.from_numpy(scale)
            self.mvn = True
        if cuda:
            self.window = self.window.cuda()
            if self.mvn:
                self.shift = self.shift.cuda()
                self.scale = self.scale.cuda()

    def compute(self, utt):
        r"""

        Args:
            utt: in [-32768, 32767] range

        Returns:
            [..., T, F]
        """
        if self.feat_type == 'raw':
            return utt
        elif self.feat_type == 'fbank':
            if len(utt.shape) == 1:
                utt = utt.unsqueeze(0)
            feat = kaldi.fbank(utt, **self.fbank_config)
        elif self.feat_type == 'spec':
            spec = torch.stft(
                utt / 32768,
                self.n_fft,
                self.hop_length,
                self.n_fft,
                self.window,
                center=False,
                return_complex=True)
            feat = torch.cat([spec.real, spec.imag], dim=-2).permute(-1, -2)
        elif self.feat_type == 'logpow':
            spec = torch.stft(
                utt,
                self.n_fft,
                self.hop_length,
                self.n_fft,
                self.window,
                center=False,
                return_complex=True)
            abspow = torch.abs(spec)**2
            feat = torch.log(1 + abspow).permute(-1, -2)
        return feat

    def normalize(self, feat):
        if self.mvn:
            feat = feat + self.shift
            feat = feat * self.scale
        return feat
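# A minimal sketch of using Feature on its own, with a hypothetical fbank
# config; frame_length/frame_shift are in milliseconds and, together with
# sample_frequency, determine the n_fft and hop_length computed above:
#
#     fbank_config = dict(num_mel_bins=80, frame_length=25, frame_shift=10,
#                         sample_frequency=16000)
#     feature = Feature(fbank_config, feat_type='fbank')
#     utt = torch.randn(16000) * 32768                # int16-range audio
#     feat = feature.normalize(feature.compute(utt))  # -> [T, num_mel_bins]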
@PREPROCESSORS.register_module(Fields.audio)
class LinearAECAndFbank:
    SAMPLE_RATE = 16000

    def __init__(self, io_config):
        self.trunc_length = 7200 * self.SAMPLE_RATE
        self.linear_aec_delay = io_config['linear_aec_delay']
        self.feature = Feature(io_config['fbank_config'],
                               io_config['feat_type'], io_config['mvn'])
        self.mitaec = load_library(io_config['mitaec_library'])
        self.mask_on_mic = io_config['mask_on'] == 'nearend_mic'

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Linearly filter the near-end mic and far-end audio, then extract the feature.

        :param data: dict with two keys and the corresponding audio files: "nearend_mic" and "farend_speech"
        :return: dict with Tensor values: "base" (the linearly filtered audio), "target" and "feature"
        """
        # read files
        nearend_mic, fs = load_wav(data['nearend_mic'])
        assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
        farend_speech, fs = load_wav(data['farend_speech'])
        assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
        if 'nearend_speech' in data:
            nearend_speech, fs = load_wav(data['nearend_speech'])
            assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
        else:
            nearend_speech = np.zeros_like(nearend_mic)

        out_mic, out_ref, out_linear, out_echo = do_linear_aec(
            self.mitaec, nearend_mic, farend_speech)
        # fix 20ms linear aec delay by delaying the target speech
        extra_zeros = np.zeros([int(self.linear_aec_delay * fs)])
        nearend_speech = np.concatenate([extra_zeros, nearend_speech])
        # truncate files to the same length
        flen = min(
            len(out_mic), len(out_ref), len(out_linear), len(out_echo),
            len(nearend_speech))
        fstart = 0
        flen = min(flen, self.trunc_length)
        nearend_mic, out_ref, out_linear, out_echo, nearend_speech = (
            out_mic[fstart:flen], out_ref[fstart:flen],
            out_linear[fstart:flen], out_echo[fstart:flen],
            nearend_speech[fstart:flen])

        # extract features (frames, [mic, linear, ref, aes?])
        feat = torch.FloatTensor()

        nearend_mic = torch.from_numpy(np.float32(nearend_mic))
        fbank_nearend_mic = self.feature.compute(nearend_mic)
        feat = torch.cat([feat, fbank_nearend_mic], dim=1)

        out_linear = torch.from_numpy(np.float32(out_linear))
        fbank_out_linear = self.feature.compute(out_linear)
        feat = torch.cat([feat, fbank_out_linear], dim=1)

        out_echo = torch.from_numpy(np.float32(out_echo))
        fbank_out_echo = self.feature.compute(out_echo)
        feat = torch.cat([feat, fbank_out_echo], dim=1)

        # feature transform
        feat = self.feature.normalize(feat)

        # prepare target
        if nearend_speech is not None:
            nearend_speech = torch.from_numpy(np.float32(nearend_speech))

        if self.mask_on_mic:
            base = nearend_mic
        else:
            base = out_linear
        out_data = {'base': base, 'target': nearend_speech, 'feature': feat}
        return out_data
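A minimal usage sketch of the preprocessor above, with a hypothetical io_config (real configurations ship with the model; the wav files are the AEC samples used in the tests further below):

io_config = {
    'linear_aec_delay': 0.02,  # seconds, see the delay compensation above
    'fbank_config': dict(num_mel_bins=80, frame_length=25, frame_shift=10,
                         sample_frequency=16000),
    'feat_type': 'fbank',
    'mvn': None,
    'mitaec_library': 'libmitaec_pyio.so',
    'mask_on': 'nearend_mic',
}
preprocessor = LinearAECAndFbank(io_config)
out = preprocessor({'nearend_mic': 'nearend_mic.wav',
                    'farend_speech': 'farend_speech.wav'})
print(out['base'].shape, out['feature'].shape)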
@@ -9,7 +9,7 @@ from modelscope.utils.constant import Fields
from .builder import PREPROCESSORS


@PREPROCESSORS.register_module(Fields.image)
@PREPROCESSORS.register_module(Fields.cv)
class LoadImage:
    """Load an image from file or url.
    Added or updated keys are "filename", "img", "img_shape",
@@ -11,8 +11,8 @@ from .base import Preprocessor
from .builder import PREPROCESSORS

__all__ = [
    'Tokenize',
    'SequenceClassificationPreprocessor',
    'Tokenize', 'SequenceClassificationPreprocessor',
    'TextGenerationPreprocessor', 'TokenClassifcationPreprocessor'
]


@@ -31,7 +31,7 @@ class Tokenize(Preprocessor):


@PREPROCESSORS.register_module(
    Fields.nlp, module_name=r'bert-sentiment-analysis')
    Fields.nlp, module_name=r'bert-sequence-classification')
class SequenceClassificationPreprocessor(Preprocessor):

    def __init__(self, model_dir: str, *args, **kwargs):
@@ -51,21 +51,42 @@ class SequenceClassificationPreprocessor(Preprocessor):
        self.sequence_length = kwargs.pop('sequence_length', 128)

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
        print(f'this is the tokenizer {self.tokenizer}')

    @type_assert(object, str)
    def __call__(self, data: str) -> Dict[str, Any]:
    @type_assert(object, (str, tuple))
    def __call__(self, data: Union[str, tuple]) -> Dict[str, Any]:
        """Process the raw input data.

        Args:
            data (str): a sentence
                Example:
                    'you are so handsome.'
            data (str or tuple):
                sentence1 (str): a sentence
                    Example:
                        'you are so handsome.'
                or
                (sentence1, sentence2)
                    sentence1 (str): a sentence
                        Example:
                            'you are so handsome.'
                    sentence2 (str): a sentence
                        Example:
                            'you are so beautiful.'

        Returns:
            Dict[str, Any]: the preprocessed data
        """

        new_data = {self.first_sequence: data}
        if not isinstance(data, tuple):
            data = (
                data,
                None,
            )

        sentence1, sentence2 = data
        new_data = {
            self.first_sequence: sentence1,
            self.second_sequence: sentence2
        }

        # preprocess the data for the model input

        rst = {
@@ -94,17 +115,15 @@ class SequenceClassificationPreprocessor(Preprocessor):
        return rst
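# A short usage sketch of the single- and two-sentence inputs accepted above
# (the model_dir path is hypothetical; real ones come from snapshot_download):
#
#     preprocessor = SequenceClassificationPreprocessor('/path/to/model_dir')
#     single = preprocessor('今天天气不错')                    # one sentence
#     pair = preprocessor(('今天气温高么?', '今天湿度高么?'))  # sentence pair
#     # both return the preprocessed model inputs as a dict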
@PREPROCESSORS.register_module(Fields.nlp, module_name=r'palm')
@PREPROCESSORS.register_module(Fields.nlp, module_name=r'palm2.0')
class TextGenerationPreprocessor(Preprocessor):

    def __init__(self, model_dir: str, *args, **kwargs):
    def __init__(self, model_dir: str, tokenizer, *args, **kwargs):
        """Preprocess the data using the vocab.txt from the `model_dir` path.

        Args:
            model_dir (str): model path
        """
        from sofa import PalmTokenizer

        super().__init__(*args, **kwargs)

        self.model_dir: str = model_dir
@@ -113,7 +132,7 @@ class TextGenerationPreprocessor(Preprocessor):
        self.second_sequence: str = kwargs.pop('second_sequence',
                                               'second_sequence')
        self.sequence_length: int = kwargs.pop('sequence_length', 128)
        self.tokenizer = PalmTokenizer.from_pretrained(model_dir)
        self.tokenizer = tokenizer

    @type_assert(object, str)
    def __call__(self, data: str) -> Dict[str, Any]:
@@ -132,7 +151,7 @@ class TextGenerationPreprocessor(Preprocessor):
        new_data = {self.first_sequence: data}
        # preprocess the data for the model input

        rst = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []}
        rst = {'input_ids': [], 'attention_mask': []}

        max_seq_length = self.sequence_length

@@ -147,6 +166,53 @@ class TextGenerationPreprocessor(Preprocessor):

        rst['input_ids'].append(feature['input_ids'])
        rst['attention_mask'].append(feature['attention_mask'])
        rst['token_type_ids'].append(feature['token_type_ids'])

        return {k: torch.tensor(v) for k, v in rst.items()}


@PREPROCESSORS.register_module(
    Fields.nlp, module_name=r'bert-token-classification')
class TokenClassifcationPreprocessor(Preprocessor):

    def __init__(self, model_dir: str, *args, **kwargs):
        """Preprocess the data via the vocab.txt from the `model_dir` path.

        Args:
            model_dir (str): model path
        """

        super().__init__(*args, **kwargs)

        from sofa import SbertTokenizer
        self.model_dir: str = model_dir
        self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir)

    @type_assert(object, str)
    def __call__(self, data: str) -> Dict[str, Any]:
        """Process the raw input data.

        Args:
            data (str): a sentence
                Example:
                    'you are so handsome.'

        Returns:
            Dict[str, Any]: the preprocessed data
        """
        # preprocess the data for the model input

        text = data.replace(' ', '').strip()
        tokens = []
        for token in text:
            token = self.tokenizer.tokenize(token)
            tokens.extend(token)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_ids = self.tokenizer.build_inputs_with_special_tokens(input_ids)
        attention_mask = [1] * len(input_ids)
        token_type_ids = [0] * len(input_ids)
        return {
            'text': text,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids
        }
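The preprocessor above tokenizes character by character, matching the per-character labels the word-segmentation postprocessing expects. A small sketch of its output shape (the model_dir path is hypothetical):

preprocessor = TokenClassifcationPreprocessor('/path/to/model_dir')
out = preprocessor('今天 天气 不错')
# 'text' is the whitespace-stripped input; input_ids carries one id per
# character plus the special tokens added by build_inputs_with_special_tokens
assert out['text'] == '今天天气不错'
assert len(out['input_ids']) == len(out['attention_mask'])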
@@ -74,17 +74,17 @@ class Config:
        {'c': [1, 2, 3], 'd': 'dd'}
        >>> cfg.b.d
        'dd'
        >>> cfg = Config.from_file('configs/examples/config.json')
        >>> cfg = Config.from_file('configs/examples/configuration.json')
        >>> cfg.filename
        'configs/examples/config.json'
        'configs/examples/configuration.json'
        >>> cfg.b
        {'c': [1, 2, 3], 'd': 'dd'}
        >>> cfg = Config.from_file('configs/examples/config.py')
        >>> cfg = Config.from_file('configs/examples/configuration.py')
        >>> cfg.filename
        "configs/examples/config.py"
        >>> cfg = Config.from_file('configs/examples/config.yaml')
        "configs/examples/configuration.py"
        >>> cfg = Config.from_file('configs/examples/configuration.yaml')
        >>> cfg.filename
        "configs/examples/config.yaml"
        "configs/examples/configuration.yaml"
    """

    @staticmethod
@@ -4,8 +4,8 @@
class Fields(object):
    """ Names for different application fields
    """
    image = 'image'
    video = 'video'
    # image = 'image'
    # video = 'video'
    cv = 'cv'
    nlp = 'nlp'
    audio = 'audio'
@@ -30,7 +30,9 @@ class Tasks(object):
    image_matting = 'image-matting'

    # nlp tasks
    word_segmentation = 'word-segmentation'
    sentiment_analysis = 'sentiment-analysis'
    sentence_similarity = 'sentence-similarity'
    text_classification = 'text-classification'
    relation_extraction = 'relation-extraction'
    zero_shot = 'zero-shot'
@@ -52,7 +54,7 @@ class Tasks(object):
    text_to_speech = 'text-to-speech'
    speech_signal_process = 'speech-signal-process'

    # multi-media
    # multi-modal tasks
    image_captioning = 'image-captioning'
    visual_grounding = 'visual-grounding'
    text_to_image_synthesis = 'text-to-image-synthesis'
@@ -73,16 +75,16 @@ class Hubs(object):
    huggingface = 'huggingface'


# configuration filename
# in order to avoid conflict with huggingface
# config file we use maas_config instead
CONFIGFILE = 'maas_config.json'
class ModelFile(object):
    CONFIGURATION = 'configuration.json'
    README = 'README.md'
    TF_SAVED_MODEL_FILE = 'saved_model.pb'
    TF_GRAPH_FILE = 'tf_graph.pb'
    TF_CHECKPOINT_FOLDER = 'tf_ckpts'
    TF_CKPT_PREFIX = 'ckpt-'
    TORCH_MODEL_FILE = 'pytorch_model.pt'
    TORCH_MODEL_BIN_FILE = 'pytorch_model.bin'


README_FILE = 'README.md'
TF_SAVED_MODEL_FILE = 'saved_model.pb'
TF_GRAPH_FILE = 'tf_graph.pb'
TF_CHECKPOINT_FOLDER = 'tf_ckpts'
TF_CHECKPOINT_FILE = 'checkpoint'
TORCH_MODEL_FILE = 'pytorch_model.bin'

TENSORFLOW = 'tensorflow'
PYTORCH = 'pytorch'
@@ -1,7 +1,6 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import inspect
from email.policy import default

from modelscope.utils.logger import get_logger

@@ -70,6 +69,7 @@ class Registry(object):
                             f'{self._name}[{group_key}]')

        self._modules[group_key][module_name] = module_cls
        module_cls.group_key = group_key

        if module_name in self._modules[default_group]:
            if id(self._modules[default_group][module_name]) == id(module_cls):
20
modelscope/utils/test_utils.py
Normal file
@@ -0,0 +1,20 @@
#!/usr/bin/env python
# Copyright (c) Alibaba, Inc. and its affiliates.

import os

TEST_LEVEL = 2
TEST_LEVEL_STR = 'TEST_LEVEL'


def test_level():
    global TEST_LEVEL
    if TEST_LEVEL_STR in os.environ:
        TEST_LEVEL = int(os.environ[TEST_LEVEL_STR])

    return TEST_LEVEL


def set_test_level(level: int):
    global TEST_LEVEL
    TEST_LEVEL = level
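The two helpers above gate test cases by level; the pattern used throughout the test files below looks like this (the test class is illustrative):

import unittest

from modelscope.utils.test_utils import test_level


class ExampleTest(unittest.TestCase):

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_expensive_path(self):
        pass  # runs only when TEST_LEVEL (env var or the default) is >= 1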
@@ -1,6 +1,7 @@
docutils==0.16.0
recommonmark
sphinx==4.0.2
sphinx-book-theme
sphinx-copybutton
sphinx_markdown_tables
sphinx_rtd_theme==0.5.2

@@ -1 +1 @@
https://alinlp.alibaba-inc.com/pypi/sofa-1.0.1.3-py3-none-any.whl
https://alinlp.alibaba-inc.com/pypi/sofa-1.0.2-py3-none-any.whl
@@ -1,12 +1,13 @@
addict
datasets
easydict
https://maashub.oss-cn-hangzhou.aliyuncs.com/releases/maas_hub-0.1.0.dev0-py2.py3-none-any.whl
https://mindscope.oss-cn-hangzhou.aliyuncs.com/sdklib/maas_hub-0.2.4.dev0-py3-none-any.whl
numpy
opencv-python-headless
Pillow
Pillow>=6.2.0
pyyaml
requests
scipy
tokenizers<=0.10.3
transformers<=4.16.2
yapf
@@ -11,6 +11,7 @@ default_section = THIRDPARTY
BASED_ON_STYLE = pep8
BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true
SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true
SPLIT_BEFORE_ARITHMETIC_OPERATOR = true

[codespell]
skip = *.ipynb
@@ -20,5 +21,5 @@ ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids
[flake8]
select = B,C,E,F,P,T4,W,B9
max-line-length = 120
ignore = F401,F821
ignore = F401,F821,W503
exclude = docs/src,*.pyi,.git
@@ -35,9 +35,10 @@ class CustomPipelineTest(unittest.TestCase):
            CustomPipeline1()

    def test_custom(self):
        dummy_task = 'dummy-task'

        @PIPELINES.register_module(
            group_key=Tasks.image_tagging, module_name='custom-image')
            group_key=dummy_task, module_name='custom-image')
        class CustomImagePipeline(Pipeline):

            def __init__(self,
@@ -67,32 +68,28 @@ class CustomPipelineTest(unittest.TestCase):
                outputs['filename'] = inputs['url']
                img = inputs['img']
                new_image = img.resize((img.width // 2, img.height // 2))
                outputs['resize_image'] = np.array(new_image)
                outputs['dummy_result'] = 'dummy_result'
                outputs['output_png'] = np.array(new_image)
                return outputs

            def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
                return inputs

        self.assertTrue('custom-image' in PIPELINES.modules[default_group])
        add_default_pipeline_info(Tasks.image_tagging, 'custom-image')
        add_default_pipeline_info(dummy_task, 'custom-image', overwrite=True)
        pipe = pipeline(pipeline_name='custom-image')
        pipe2 = pipeline(Tasks.image_tagging)
        pipe2 = pipeline(dummy_task)
        self.assertTrue(type(pipe) is type(pipe2))

        img_url = 'http://pai-vision-data-hz.oss-cn-zhangjiakou.' \
            'aliyuncs.com/data/test/images/image1.jpg'
        img_url = 'data/test/images/image1.jpg'
        output = pipe(img_url)
        self.assertEqual(output['filename'], img_url)
        self.assertEqual(output['resize_image'].shape, (318, 512, 3))
        self.assertEqual(output['dummy_result'], 'dummy_result')
        self.assertEqual(output['output_png'].shape, (318, 512, 3))

        outputs = pipe([img_url for i in range(4)])
        self.assertEqual(len(outputs), 4)
        for out in outputs:
            self.assertEqual(out['filename'], img_url)
            self.assertEqual(out['resize_image'].shape, (318, 512, 3))
            self.assertEqual(out['dummy_result'], 'dummy_result')
            self.assertEqual(out['output_png'].shape, (318, 512, 3))


if __name__ == '__main__':
@@ -7,11 +7,12 @@ import unittest
from modelscope.fileio import File
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.test_utils import test_level


class ImageCaptionTest(unittest.TestCase):

    @unittest.skip('skip long test')
    @unittest.skip('skip before model is restored in model hub')
    def test_run(self):
        model = 'https://ofa-beijing.oss-cn-beijing.aliyuncs.com/checkpoints/caption_large_best_clean.pt'

@@ -26,9 +27,7 @@ class ImageCaptionTest(unittest.TestCase):
        img_captioning = pipeline(
            Tasks.image_captioning, model=ofile.name, bpe_dir=bpe_dir)

        result = img_captioning(
            'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
        )
        result = img_captioning('data/test/images/image_matting.png')
        print(result['caption'])
@@ -9,14 +9,15 @@ import cv2
from modelscope.fileio import File
from modelscope.pipelines import pipeline
from modelscope.pydatasets import PyDataset
from modelscope.utils.constant import Tasks
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.hub import get_model_cache_dir
from modelscope.utils.test_utils import test_level


class ImageMattingTest(unittest.TestCase):

    def setUp(self) -> None:
        self.model_id = 'damo/cv_unet_image-matting_damo'
        self.model_id = 'damo/cv_unet_image-matting'
        # switch to False if downloading every time is not desired
        purge_cache = True
        if purge_cache:
@@ -28,20 +29,17 @@ class ImageMattingTest(unittest.TestCase):
        model_path = 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs' \
            '.com/data/test/maas/image_matting/matting_person.pb'
        with tempfile.TemporaryDirectory() as tmp_dir:
            model_file = osp.join(tmp_dir, 'matting_person.pb')
            model_file = osp.join(tmp_dir, ModelFile.TF_GRAPH_FILE)
            with open(model_file, 'wb') as ofile:
                ofile.write(File.read(model_path))
            img_matting = pipeline(Tasks.image_matting, model=tmp_dir)

            result = img_matting(
                'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
            )
            result = img_matting('data/test/images/image_matting.png')
            cv2.imwrite('result.png', result['output_png'])

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_with_dataset(self):
        input_location = [
            'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
        ]
        input_location = ['data/test/images/image_matting.png']
        # alternatively:
        # input_location = '/dir/to/images'

@@ -52,21 +50,19 @@ class ImageMattingTest(unittest.TestCase):
        cv2.imwrite('result.png', next(result)['output_png'])
        print(f'Output written to {osp.abspath("result.png")}')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_modelhub(self):
        img_matting = pipeline(Tasks.image_matting, model=self.model_id)

        result = img_matting(
            'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
        )
        result = img_matting('data/test/images/image_matting.png')
        cv2.imwrite('result.png', result['output_png'])
        print(f'Output written to {osp.abspath("result.png")}')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_modelhub_default_model(self):
        img_matting = pipeline(Tasks.image_matting)

        result = img_matting(
            'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
        )
        result = img_matting('data/test/images/image_matting.png')
        cv2.imwrite('result.png', result['output_png'])
        print(f'Output written to {osp.abspath("result.png")}')
@@ -8,6 +8,7 @@ import cv2
from modelscope.pipelines import pipeline
from modelscope.pipelines.base import Pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.test_utils import test_level


class ImageCartoonTest(unittest.TestCase):
@@ -36,10 +37,12 @@ class ImageCartoonTest(unittest.TestCase):
        img_cartoon = pipeline(Tasks.image_generation, model=model_dir)
        self.pipeline_inference(img_cartoon, self.test_image)

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_modelhub(self):
        img_cartoon = pipeline(Tasks.image_generation, model=self.model_id)
        self.pipeline_inference(img_cartoon, self.test_image)

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_modelhub_default_model(self):
        img_cartoon = pipeline(Tasks.image_generation)
        self.pipeline_inference(img_cartoon, self.test_image)
67
tests/pipelines/test_sentence_similarity.py
Normal file
@@ -0,0 +1,67 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import shutil
import unittest

from maas_hub.snapshot_download import snapshot_download

from modelscope.models import Model
from modelscope.models.nlp import SbertForSentenceSimilarity
from modelscope.pipelines import SentenceSimilarityPipeline, pipeline
from modelscope.preprocessors import SequenceClassificationPreprocessor
from modelscope.utils.constant import Tasks
from modelscope.utils.hub import get_model_cache_dir
from modelscope.utils.test_utils import test_level


class SentenceSimilarityTest(unittest.TestCase):
    model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
    sentence1 = '今天气温比昨天高么?'
    sentence2 = '今天湿度比昨天高么?'

    def setUp(self) -> None:
        # switch to False if downloading every time is not desired
        purge_cache = True
        if purge_cache:
            shutil.rmtree(
                get_model_cache_dir(self.model_id), ignore_errors=True)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run(self):
        cache_path = snapshot_download(self.model_id)
        tokenizer = SequenceClassificationPreprocessor(cache_path)
        model = SbertForSentenceSimilarity(cache_path, tokenizer=tokenizer)
        pipeline1 = SentenceSimilarityPipeline(model, preprocessor=tokenizer)
        pipeline2 = pipeline(
            Tasks.sentence_similarity, model=model, preprocessor=tokenizer)
        print('test1')
        print(f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n'
              f'pipeline1:{pipeline1(input=(self.sentence1, self.sentence2))}')
        print()
        print(
            f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n'
            f'pipeline2: {pipeline2(input=(self.sentence1, self.sentence2))}')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_model_from_modelhub(self):
        model = Model.from_pretrained(self.model_id)
        tokenizer = SequenceClassificationPreprocessor(model.model_dir)
        pipeline_ins = pipeline(
            task=Tasks.sentence_similarity,
            model=model,
            preprocessor=tokenizer)
        print(pipeline_ins(input=(self.sentence1, self.sentence2)))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_model_name(self):
        pipeline_ins = pipeline(
            task=Tasks.sentence_similarity, model=self.model_id)
        print(pipeline_ins(input=(self.sentence1, self.sentence2)))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_default_model(self):
        pipeline_ins = pipeline(task=Tasks.sentence_similarity)
        print(pipeline_ins(input=(self.sentence1, self.sentence2)))


if __name__ == '__main__':
    unittest.main()
56
tests/pipelines/test_speech_signal_process.py
Normal file
@@ -0,0 +1,56 @@
import os.path
import shutil
import unittest

from modelscope.fileio import File
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.hub import get_model_cache_dir

NEAREND_MIC_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/AEC/sample_audio/nearend_mic.wav'
FAREND_SPEECH_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/AEC/sample_audio/farend_speech.wav'
NEAREND_MIC_FILE = 'nearend_mic.wav'
FAREND_SPEECH_FILE = 'farend_speech.wav'

AEC_LIB_URL = 'http://isv-data.oss-cn-hangzhou.aliyuncs.com/ics%2FMaaS%2FAEC%2Flib%2Flibmitaec_pyio.so' \
    '?Expires=1664085465&OSSAccessKeyId=LTAIxjQyZNde90zh&Signature=Y7gelmGEsQAJRK4yyHSYMrdWizk%3D'
AEC_LIB_FILE = 'libmitaec_pyio.so'


def download(remote_path, local_path):
    local_dir = os.path.dirname(local_path)
    if len(local_dir) > 0:
        if not os.path.exists(local_dir):
            os.makedirs(local_dir)
    with open(local_path, 'wb') as ofile:
        ofile.write(File.read(remote_path))


class SpeechSignalProcessTest(unittest.TestCase):

    def setUp(self) -> None:
        self.model_id = 'damo/speech_dfsmn_aec_psm_16k'
        # switch to False if downloading every time is not desired
        purge_cache = True
        if purge_cache:
            shutil.rmtree(
                get_model_cache_dir(self.model_id), ignore_errors=True)
        # A temporary hack to provide the C++ lib. Download it first.
        download(AEC_LIB_URL, AEC_LIB_FILE)

    def test_run(self):
        download(NEAREND_MIC_URL, NEAREND_MIC_FILE)
        download(FAREND_SPEECH_URL, FAREND_SPEECH_FILE)
        input = {
            'nearend_mic': NEAREND_MIC_FILE,
            'farend_speech': FAREND_SPEECH_FILE
        }
        aec = pipeline(
            Tasks.speech_signal_process,
            model=self.model_id,
            pipeline_name=r'speech_dfsmn_aec_psm_16k')
        aec(input, output_path='output.wav')


if __name__ == '__main__':
    unittest.main()
@@ -12,6 +12,7 @@ from modelscope.preprocessors import SequenceClassificationPreprocessor
from modelscope.pydatasets import PyDataset
from modelscope.utils.constant import Hubs, Tasks
from modelscope.utils.hub import get_model_cache_dir
from modelscope.utils.test_utils import test_level


class SequenceClassificationTest(unittest.TestCase):
@@ -43,6 +44,7 @@ class SequenceClassificationTest(unittest.TestCase):
                break
            print(r)

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run(self):
        model_url = 'https://atp-modelzoo-sh.oss-cn-shanghai.aliyuncs.com' \
            '/release/easynlp_modelzoo/alibaba-pai/bert-base-sst2.zip'
@@ -67,6 +69,7 @@ class SequenceClassificationTest(unittest.TestCase):
            Tasks.text_classification, model=model, preprocessor=preprocessor)
        print(pipeline2('Hello world!'))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_model_from_modelhub(self):
        model = Model.from_pretrained(self.model_id)
        preprocessor = SequenceClassificationPreprocessor(
@@ -77,6 +80,7 @@ class SequenceClassificationTest(unittest.TestCase):
            preprocessor=preprocessor)
        self.predict(pipeline_ins)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_model_name(self):
        text_classification = pipeline(
            task=Tasks.text_classification, model=self.model_id)
@@ -85,6 +89,7 @@ class SequenceClassificationTest(unittest.TestCase):
            'glue', name='sst2', target='sentence', hub=Hubs.huggingface))
        self.printDataset(result)

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_with_default_model(self):
        text_classification = pipeline(task=Tasks.text_classification)
        result = text_classification(
@@ -92,6 +97,7 @@ class SequenceClassificationTest(unittest.TestCase):
            'glue', name='sst2', target='sentence', hub=Hubs.huggingface))
        self.printDataset(result)

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_with_dataset(self):
        model = Model.from_pretrained(self.model_id)
        preprocessor = SequenceClassificationPreprocessor(
@@ -4,47 +4,75 @@ import unittest
from maas_hub.snapshot_download import snapshot_download

from modelscope.models import Model
from modelscope.models.nlp import PalmForTextGenerationModel
from modelscope.models.nlp import PalmForTextGeneration
from modelscope.pipelines import TextGenerationPipeline, pipeline
from modelscope.preprocessors import TextGenerationPreprocessor
from modelscope.utils.constant import Tasks
from modelscope.utils.test_utils import test_level


class TextGenerationTest(unittest.TestCase):
    model_id = 'damo/nlp_palm_text-generation_chinese'
    input1 = "今日天气类型='晴'&温度变化趋势='大幅上升'&最低气温='28℃'&最高气温='31℃'&体感='湿热'"
    input2 = "今日天气类型='多云'&体感='舒适'&最低气温='26℃'&最高气温='30℃'"
    model_id_zh = 'damo/nlp_palm2.0_text-generation_chinese-base'
    model_id_en = 'damo/nlp_palm2.0_text-generation_english-base'
    input_zh = """
    本文总结了十个可穿戴产品的设计原则,而这些原则,同样也是笔者认为是这个行业最吸引人的地方:
    1.为人们解决重复性问题;2.从人开始,而不是从机器开始;3.要引起注意,但不要刻意;4.提升用户能力,而不是取代
    """
    input_en = """
    The Director of Public Prosecutions who let off Lord Janner over alleged child sex abuse started
    her career at a legal chambers when the disgraced Labour peer was a top QC there . Alison Saunders ,
    54 , sparked outrage last week when she decided the 86-year-old should not face a string of charges
    of paedophilia against nine children because he has dementia . Today , newly-released documents
    revealed damning evidence that abuse was covered up by police and social workers for more than 20 years .
    And now it has emerged Mrs Saunders ' law career got off to a flying start when she secured her
    pupillage -- a barrister 's training contract at 1 Garden Court Chambers in London in 1983 .
    """

    @unittest.skip('skip temporarily to save test time')
    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run(self):
        cache_path = snapshot_download(self.model_id)
        preprocessor = TextGenerationPreprocessor(
            cache_path, first_sequence='sentence', second_sequence=None)
        model = PalmForTextGenerationModel(
            cache_path, tokenizer=preprocessor.tokenizer)
        pipeline1 = TextGenerationPipeline(model, preprocessor)
        pipeline2 = pipeline(
            Tasks.text_generation, model=model, preprocessor=preprocessor)
        print(f'input: {self.input1}\npipeline1: {pipeline1(self.input1)}')
        print()
        print(f'input: {self.input2}\npipeline2: {pipeline2(self.input2)}')
        for model_id, input in ((self.model_id_zh, self.input_zh),
                                (self.model_id_en, self.input_en)):
            cache_path = snapshot_download(model_id)
            model = PalmForTextGeneration(cache_path)
            preprocessor = TextGenerationPreprocessor(
                cache_path,
                model.tokenizer,
                first_sequence='sentence',
                second_sequence=None)
            pipeline1 = TextGenerationPipeline(model, preprocessor)
            pipeline2 = pipeline(
                Tasks.text_generation, model=model, preprocessor=preprocessor)
            print(
                f'pipeline1: {pipeline1(input)}\npipeline2: {pipeline2(input)}'
            )

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_with_model_from_modelhub(self):
        model = Model.from_pretrained(self.model_id)
        preprocessor = TextGenerationPreprocessor(
            model.model_dir, first_sequence='sentence', second_sequence=None)
        pipeline_ins = pipeline(
            task=Tasks.text_generation, model=model, preprocessor=preprocessor)
        print(pipeline_ins(self.input1))
        for model_id, input in ((self.model_id_zh, self.input_zh),
                                (self.model_id_en, self.input_en)):
            model = Model.from_pretrained(model_id)
            preprocessor = TextGenerationPreprocessor(
                model.model_dir,
                model.tokenizer,
                first_sequence='sentence',
                second_sequence=None)
            pipeline_ins = pipeline(
                task=Tasks.text_generation,
                model=model,
                preprocessor=preprocessor)
            print(pipeline_ins(input))

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_with_model_name(self):
        pipeline_ins = pipeline(
            task=Tasks.text_generation, model=self.model_id)
        print(pipeline_ins(self.input2))
        for model_id, input in ((self.model_id_zh, self.input_zh),
                                (self.model_id_en, self.input_en)):
            pipeline_ins = pipeline(task=Tasks.text_generation, model=model_id)
            print(pipeline_ins(input))

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_with_default_model(self):
        pipeline_ins = pipeline(task=Tasks.text_generation)
        print(pipeline_ins(self.input2))
        print(pipeline_ins(self.input_zh))


if __name__ == '__main__':
62
tests/pipelines/test_word_segmentation.py
Normal file
@@ -0,0 +1,62 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import shutil
import unittest

from maas_hub.snapshot_download import snapshot_download

from modelscope.models import Model
from modelscope.models.nlp import StructBertForTokenClassification
from modelscope.pipelines import WordSegmentationPipeline, pipeline
from modelscope.preprocessors import TokenClassifcationPreprocessor
from modelscope.utils.constant import Tasks
from modelscope.utils.hub import get_model_cache_dir
from modelscope.utils.test_utils import test_level


class WordSegmentationTest(unittest.TestCase):
    model_id = 'damo/nlp_structbert_word-segmentation_chinese-base'
    sentence = '今天天气不错,适合出去游玩'

    def setUp(self) -> None:
        # switch to False if downloading every time is not desired
        purge_cache = True
        if purge_cache:
            shutil.rmtree(
                get_model_cache_dir(self.model_id), ignore_errors=True)

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_by_direct_model_download(self):
        cache_path = snapshot_download(self.model_id)
        tokenizer = TokenClassifcationPreprocessor(cache_path)
        model = StructBertForTokenClassification(
            cache_path, tokenizer=tokenizer)
        pipeline1 = WordSegmentationPipeline(model, preprocessor=tokenizer)
        pipeline2 = pipeline(
            Tasks.word_segmentation, model=model, preprocessor=tokenizer)
        print(f'sentence: {self.sentence}\n'
              f'pipeline1:{pipeline1(input=self.sentence)}')
        print()
        print(f'pipeline2: {pipeline2(input=self.sentence)}')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_model_from_modelhub(self):
        model = Model.from_pretrained(self.model_id)
        tokenizer = TokenClassifcationPreprocessor(model.model_dir)
        pipeline_ins = pipeline(
            task=Tasks.word_segmentation, model=model, preprocessor=tokenizer)
        print(pipeline_ins(input=self.sentence))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_model_name(self):
        pipeline_ins = pipeline(
            task=Tasks.word_segmentation, model=self.model_id)
        print(pipeline_ins(input=self.sentence))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_default_model(self):
        pipeline_ins = pipeline(task=Tasks.word_segmentation)
        print(pipeline_ins(input=self.sentence))


if __name__ == '__main__':
    unittest.main()
20
tests/preprocessors/test_image.py
Normal file
@@ -0,0 +1,20 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import unittest

import PIL

from modelscope.preprocessors import load_image
from modelscope.utils.logger import get_logger


class ImagePreprocessorTest(unittest.TestCase):

    def test_load(self):
        img = load_image('data/test/images/image_matting.png')
        self.assertTrue(isinstance(img, PIL.Image.Image))
        self.assertEqual(img.size, (948, 533))


if __name__ == '__main__':
    unittest.main()
@@ -7,6 +7,11 @@ import sys
import unittest
from fnmatch import fnmatch

from modelscope.utils.logger import get_logger
from modelscope.utils.test_utils import set_test_level, test_level

logger = get_logger()


def gather_test_cases(test_dir, pattern, list_tests):
    case_list = []
@@ -49,5 +54,9 @@ if __name__ == '__main__':
        '--pattern', default='test_*.py', help='test file pattern')
    parser.add_argument(
        '--test_dir', default='tests', help='directory to be tested')
    parser.add_argument(
        '--level', default=0, help='2 -- all, 1 -- p1, 0 -- p0')
    args = parser.parse_args()
    set_test_level(int(args.level))
    logger.info(f'TEST LEVEL: {test_level()}')
    main(args)
@@ -1,11 +1,8 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import argparse
import os.path as osp
import tempfile
import unittest
from pathlib import Path

from modelscope.fileio import dump, load
from modelscope.utils.config import Config

obj = {'a': 1, 'b': {'c': [1, 2, 3], 'd': 'dd'}}
@@ -14,25 +11,25 @@ obj = {'a': 1, 'b': {'c': [1, 2, 3], 'd': 'dd'}}
class ConfigTest(unittest.TestCase):

    def test_json(self):
        config_file = 'configs/examples/config.json'
        config_file = 'configs/examples/configuration.json'
        cfg = Config.from_file(config_file)
        self.assertEqual(cfg.a, 1)
        self.assertEqual(cfg.b, obj['b'])

    def test_yaml(self):
        config_file = 'configs/examples/config.yaml'
        config_file = 'configs/examples/configuration.yaml'
        cfg = Config.from_file(config_file)
        self.assertEqual(cfg.a, 1)
        self.assertEqual(cfg.b, obj['b'])

    def test_py(self):
        config_file = 'configs/examples/config.py'
        config_file = 'configs/examples/configuration.py'
        cfg = Config.from_file(config_file)
        self.assertEqual(cfg.a, 1)
        self.assertEqual(cfg.b, obj['b'])

    def test_dump(self):
        config_file = 'configs/examples/config.py'
        config_file = 'configs/examples/configuration.py'
        cfg = Config.from_file(config_file)
        self.assertEqual(cfg.a, 1)
        self.assertEqual(cfg.b, obj['b'])
@@ -53,7 +50,7 @@ class ConfigTest(unittest.TestCase):
        self.assertEqual(yaml_str, infile.read())

    def test_to_dict(self):
        config_file = 'configs/examples/config.json'
        config_file = 'configs/examples/configuration.json'
        cfg = Config.from_file(config_file)
        d = cfg.to_dict()
        print(d)