Merge commit 'e71057cacf6b9f977e20f41e6d767b4ad1557171'

* commit 'e71057cacf6b9f977e20f41e6d767b4ad1557171':
  feat(audio/ans): Add ZipEnhancer and related layers for acoustic noise suppression (#1019)
  Fix timestamp in docker build (#1049)
  Fix pypi mirror (#1048)
  Fix build error (#1047)
  hotfix for datasets 3.0.2 (#1046)
  Update docker scripts (#1044)
  Add docker workflow name (#1043)
This commit is contained in:
yuze.zyz
2024-10-24 20:42:11 +08:00
13 changed files with 3291 additions and 20 deletions

View File

@@ -1,24 +1,30 @@
name: Build Docker Images
name: Build Docker Image
on:
workflow_dispatch:
inputs:
workflow_name:
description: 'The specific name of this build'
required: true
default: 'build'
modelscope_branch:
description: 'ModelScope branch to build from'
description: 'ModelScope branch to build from(release/x.xx)'
required: true
image_type:
description: 'The image type to build'
description: 'The image type to build(cpu/gpu/llm)'
required: true
modelscope_version:
description: 'ModelScope version to use'
description: 'ModelScope version to use(x.xx.x)'
required: true
swift_branch:
description: 'SWIFT branch to use'
description: 'SWIFT branch to use(release/x.xx)'
required: true
other_params:
description: 'Other params in --xxx xxx'
required: false
run-name: Docker-${{ inputs.modelscope_branch }}-${{ inputs.image_type }}-${{ inputs.workflow_name }}-by-@${{ github.actor }}
jobs:
build:
runs-on: [modelscope-self-hosted-us]

View File

@@ -12,10 +12,6 @@ RUN apt-get update && \
{extra_content}
RUN pip config set global.index-url https://mirrors.aliyun.com/pypi/simple && \
pip config set install.trusted-host mirrors.aliyun.com && \
cp /tmp/resources/ubuntu2204.aliyun /etc/apt/sources.list
COPY {meta_file} /tmp/install.sh
RUN sh /tmp/install.sh {version_args}
@@ -28,6 +24,10 @@ RUN cd /tmp && GIT_LFS_SKIP_SMUDGE=1 git clone -b {modelscope_branch} --single
RUN cd /tmp && GIT_LFS_SKIP_SMUDGE=1 git clone -b {swift_branch} --single-branch https://github.com/modelscope/ms-swift.git && cd ms-swift && pip install .[all] && cd / && rm -fr /tmp/ms-swift && pip cache purge;
RUN pip config set global.index-url https://mirrors.aliyun.com/pypi/simple && \
pip config set install.trusted-host mirrors.aliyun.com && \
cp /tmp/resources/ubuntu2204.aliyun /etc/apt/sources.list
ENV SETUPTOOLS_USE_DISTUTILS=stdlib
ENV VLLM_USE_MODELSCOPE=True
ENV LMDEPLOY_USE_MODELSCOPE=True

View File

@@ -1,9 +1,12 @@
import argparse
import os
from datetime import datetime
from typing import Any
docker_registry = os.environ['DOCKER_REGISTRY']
assert docker_registry, 'You must pass a valid DOCKER_REGISTRY'
timestamp = datetime.now()
formatted_time = timestamp.strftime('%Y%m%d%H%M%S')
class Builder:
@@ -85,12 +88,16 @@ class BaseCPUImageBuilder(Builder):
return content
def build(self):
image_tag = f'{docker_registry}:ubuntu{self.args.ubuntu_version}-torch{self.args.torch_version}-base'
image_tag = (
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-{self.args.python_tag}-'
f'torch{self.args.torch_version}-base')
return os.system(
f'DOCKER_BUILDKIT=0 docker build -t {image_tag} -f Dockerfile .')
def push(self):
image_tag = f'{docker_registry}:ubuntu{self.args.ubuntu_version}-torch{self.args.torch_version}-base'
image_tag = (
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-{self.args.python_tag}-'
f'torch{self.args.torch_version}-base')
return os.system(f'docker push {image_tag}')
@@ -110,14 +117,14 @@ class BaseGPUImageBuilder(Builder):
def build(self) -> int:
image_tag = (
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-'
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-{self.args.python_tag}-'
f'torch{self.args.torch_version}-tf{self.args.tf_version}-base')
return os.system(
f'DOCKER_BUILDKIT=0 docker build -t {image_tag} -f Dockerfile .')
def push(self):
image_tag = (
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-'
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-{self.args.python_tag}-'
f'torch{self.args.torch_version}-tf{self.args.tf_version}-base')
return os.system(f'docker push {image_tag}')
@@ -129,7 +136,9 @@ class CPUImageBuilder(Builder):
version_args = (
f'{self.args.torch_version} {self.args.torchvision_version} '
f'{self.args.torchaudio_version}')
base_image = f'{docker_registry}:ubuntu{self.args.ubuntu_version}-torch{self.args.torch_version}-base'
base_image = (
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-{self.args.python_tag}'
f'-torch{self.args.torch_version}-base')
extra_content = """\nRUN pip install adaseq\nRUN pip install pai-easycv"""
with open('docker/Dockerfile.ubuntu', 'r') as f:
@@ -157,7 +166,17 @@ class CPUImageBuilder(Builder):
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-{self.args.python_tag}-'
f'torch{self.args.torch_version}-{self.args.modelscope_version}-test'
)
return os.system(f'docker push {image_tag}')
ret = os.system(f'docker push {image_tag}')
if ret != 0:
return ret
image_tag2 = (
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-{self.args.python_tag}-'
f'torch{self.args.torch_version}-{self.args.modelscope_version}-{formatted_time}-test'
)
ret = os.system(f'docker tag {image_tag} {image_tag2}')
if ret != 0:
return ret
return os.system(f'docker push {image_tag2}')
class GPUImageBuilder(Builder):
@@ -170,7 +189,7 @@ class GPUImageBuilder(Builder):
f'{self.args.vllm_version} {self.args.lmdeploy_version} {self.args.autogptq_version}'
)
base_image = (
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-'
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-{self.args.python_tag}-'
f'torch{self.args.torch_version}-tf{self.args.tf_version}-base')
with open('docker/Dockerfile.ubuntu', 'r') as f:
content = f.read()
@@ -196,7 +215,17 @@ class GPUImageBuilder(Builder):
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-'
f'{self.args.python_tag}-torch{self.args.torch_version}-tf{self.args.tf_version}-'
f'{self.args.modelscope_version}-test')
return os.system(f'docker push {image_tag}')
ret = os.system(f'docker push {image_tag}')
if ret != 0:
return ret
image_tag2 = (
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-'
f'{self.args.python_tag}-torch{self.args.torch_version}-tf{self.args.tf_version}-'
f'{self.args.modelscope_version}-{formatted_time}-test')
ret = os.system(f'docker tag {image_tag} {image_tag2}')
if ret != 0:
return ret
return os.system(f'docker push {image_tag2}')
class LLMImageBuilder(Builder):
@@ -253,7 +282,17 @@ class LLMImageBuilder(Builder):
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-'
f'{self.args.python_tag}-torch{self.args.torch_version}-{self.args.modelscope_version}-LLM-test'
)
return os.system(f'docker push {image_tag}')
ret = os.system(f'docker push {image_tag}')
if ret != 0:
return ret
image_tag2 = (
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-'
f'{self.args.python_tag}-torch{self.args.torch_version}-'
f'{self.args.modelscope_version}-LLM-{formatted_time}-test')
ret = os.system(f'docker tag {image_tag} {image_tag2}')
if ret != 0:
return ret
return os.system(f'docker push {image_tag2}')
parser = argparse.ArgumentParser()

View File

@@ -193,6 +193,7 @@ class Models(object):
# audio models
sambert_hifigan = 'sambert-hifigan'
speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
speech_zipenhancer_ans_multiloss_16k_base = 'speech_zipenhancer_ans_multiloss_16k_base'
speech_dfsmn_ans = 'speech_dfsmn_ans'
speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield'
speech_dfsmn_kws_char_farfield_iot = 'speech_dfsmn_kws_char_farfield_iot'
@@ -551,6 +552,7 @@ class Pipelines(object):
sambert_hifigan_tts = 'sambert-hifigan-tts'
speech_dfsmn_aec_psm_16k = 'speech-dfsmn-aec-psm-16k'
speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
speech_zipenhancer_ans_multiloss_16k_base = 'speech_zipenhancer_ans_multiloss_16k_base'
speech_dfsmn_ans_psm_48k_causal = 'speech_dfsmn_ans_psm_48k_causal'
speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield'
speech_separation = 'speech-separation'

View File

@@ -0,0 +1,210 @@
#!/usr/bin/env python3
#
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import random
from typing import Dict
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from modelscope.metainfo import Models
from modelscope.models import TorchModel
from modelscope.models.base import Tensor
from modelscope.models.builder import MODELS
from modelscope.utils.constant import ModelFile, Tasks
from .zipenhancer_layers.generator import (DenseEncoder, MappingDecoder,
PhaseDecoder)
from .zipenhancer_layers.scaling import ScheduledFloat
from .zipenhancer_layers.zipenhancer_layer import Zipformer2DualPathEncoder
@MODELS.register_module(
    Tasks.acoustic_noise_suppression,
    module_name=Models.speech_zipenhancer_ans_multiloss_16k_base)
class ZipenhancerDecorator(TorchModel):
    """ModelScope wrapper around the ZipEnhancer denoising generator.

    Builds the underlying :class:`ZipEnhancer` network from hyperparameters
    passed via ``kwargs`` and, when a torch checkpoint exists in
    ``model_dir``, restores its weights.
    """

    def __init__(self, model_dir: str, *args, **kwargs):
        """Initialize the wrapper and optionally load pretrained weights.

        Args:
            model_dir (str): Directory containing the model files; weights
                are read from ``ModelFile.TORCH_MODEL_BIN_FILE`` if present.
            **kwargs: Must contain ``num_tsconformers``, ``dense_channel``,
                ``former_conf``, ``batch_first`` and ``model_num_spks``.
        """
        super().__init__(model_dir, *args, **kwargs)
        h = dict(
            num_tsconformers=kwargs['num_tsconformers'],
            dense_channel=kwargs['dense_channel'],
            former_conf=kwargs['former_conf'],
            batch_first=kwargs['batch_first'],
            model_num_spks=kwargs['model_num_spks'],
        )
        # AttrDict exposes the config keys as attributes (h.dense_channel, ...).
        h = AttrDict(h)
        self.model = ZipEnhancer(h)
        model_bin_file = os.path.join(model_dir,
                                      ModelFile.TORCH_MODEL_BIN_FILE)
        if os.path.exists(model_bin_file):
            # Always load on CPU; the caller moves the model to a device later.
            checkpoint = torch.load(
                model_bin_file, map_location=torch.device('cpu'))
            if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
                # A model re-trained by a user is saved at the decorator level,
                # so keys are prefixed with 'model.'.
                self.load_state_dict(checkpoint['state_dict'])
            else:
                # The released ModelScope checkpoint stores the raw generator
                # weights under the 'generator' key.
                self.model.load_state_dict(checkpoint['generator'])

    def forward(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]:
        """Denoise a waveform.

        Args:
            inputs: Dict with key ``'noisy'`` holding the noisy waveform.
                Assumes shape (batch, num_samples) — the normalization below
                divides by ``shape[1]``; TODO confirm against the pipeline.

        Returns:
            Dict with key ``'wav_l2'`` holding the enhanced waveform at the
            original scale.
        """
        # Fixed 16 kHz analysis setup: 25 ms window, 6.25 ms hop.
        n_fft = 400
        hop_size = 100
        win_size = 400
        noisy_wav = inputs['noisy']
        # RMS-style normalization so the network sees unit-energy input;
        # the inverse factor is applied to the output below.
        norm_factor = torch.sqrt(noisy_wav.shape[1]
                                 / torch.sum(noisy_wav**2.0))
        noisy_audio = (noisy_wav * norm_factor)
        mag, pha, com = mag_pha_stft(
            noisy_audio,
            n_fft,
            hop_size,
            win_size,
            compress_factor=0.3,
            center=True)
        amp_g, pha_g, com_g, _, others = self.model.forward(mag, pha)
        wav = mag_pha_istft(
            amp_g,
            pha_g,
            n_fft,
            hop_size,
            win_size,
            compress_factor=0.3,
            center=True)
        wav = wav / norm_factor
        output = {
            'wav_l2': wav,
        }
        return output
class ZipEnhancer(nn.Module):
    """ZipEnhancer generator: dense encoder, dual-path Zipformer trunk and
    separate magnitude/phase decoders.

    Args:
        h (object): Configuration object providing ``num_tsconformers``,
            ``dense_channel``, ``former_conf``, ``model_num_spks``, ...
    """

    def __init__(self, h):
        super(ZipEnhancer, self).__init__()
        self.h = h
        self.num_tscblocks = h.num_tsconformers
        # Two input channels: stacked magnitude and phase planes.
        self.dense_encoder = DenseEncoder(h, in_channel=2)
        self.TSConformer = Zipformer2DualPathEncoder(
            output_downsampling_factor=1,
            dropout=ScheduledFloat((0.0, 0.3), (20000.0, 0.1)),
            **h.former_conf)
        self.mask_decoder = MappingDecoder(h, out_channel=h.model_num_spks)
        self.phase_decoder = PhaseDecoder(h, out_channel=h.model_num_spks)

    def forward(self, noisy_mag, noisy_pha):  # [B, F, T]
        """Denoise a (magnitude, phase) spectrogram pair.

        Args:
            noisy_mag (Tensor): Noisy magnitude, shape [B, F, T].
            noisy_pha (Tensor): Noisy phase, shape [B, F, T].

        Returns:
            Tuple: denoised magnitude [B, F, T], denoised phase [B, F, T],
            denoised complex spectrum [B, F, T, 2], a ``None`` placeholder,
            and a dict of auxiliary outputs (empty here).
        """
        aux = dict()
        # [B, F, T] -> [B, 1, T, F], then stack the two planes on channels.
        mag_in = noisy_mag.unsqueeze(-1).permute(0, 3, 2, 1)
        pha_in = noisy_pha.unsqueeze(-1).permute(0, 3, 2, 1)
        feats = torch.cat((mag_in, pha_in), dim=1)  # [B, 2, T, F]
        feats = self.dense_encoder(feats)  # [B, C, T, F]
        feats = self.TSConformer(feats)
        est_mag = self.mask_decoder(feats)  # [B, num_spks, T, F]
        est_pha = self.phase_decoder(feats)
        # Keep only the first speaker channel and return to [B, F, T].
        denoised_mag = est_mag[:, 0].transpose(1, 2)
        denoised_pha = est_pha[:, 0].transpose(1, 2)
        denoised_com = torch.stack((denoised_mag * torch.cos(denoised_pha),
                                    denoised_mag * torch.sin(denoised_pha)),
                                   dim=-1)
        return denoised_mag, denoised_pha, denoised_com, None, aux
class AttrDict(dict):
    """A ``dict`` whose entries are also reachable as attributes.

    Pointing ``__dict__`` at the mapping itself makes ``obj.key`` and
    ``obj['key']`` read and write the very same storage.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.__dict__ = self
def mag_pha_stft(y,
                 n_fft,
                 hop_size,
                 win_size,
                 compress_factor=1.0,
                 center=True):
    """Compute a magnitude/phase STFT with power-law magnitude compression.

    Args:
        y (Tensor): Waveform batch, shape (B, num_samples).
        n_fft (int): FFT size.
        hop_size (int): Hop length in samples.
        win_size (int): Hann window length in samples.
        compress_factor (float): Exponent applied to the magnitude.
        center (bool): Whether frames are centered (reflect padding).

    Returns:
        Tuple[Tensor, Tensor, Tensor]: compressed magnitude (B, F, T),
        phase (B, F, T), and the compressed complex spectrum stacked as
        real/imag in the last dim (B, F, T, 2).
    """
    window = torch.hann_window(win_size, device=y.device)
    spec = torch.stft(
        y,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=window,
        center=center,
        pad_mode='reflect',
        normalized=False,
        return_complex=True)
    real, imag = spec.real, spec.imag
    # Small epsilons keep sqrt/atan2 well-behaved in silent bins.
    mag = torch.sqrt(real * real + imag * imag + (1e-9))
    pha = torch.atan2(imag, real + (1e-5))
    # Magnitude compression.
    mag = mag.pow(compress_factor)
    com = torch.stack((mag * torch.cos(pha), mag * torch.sin(pha)), dim=-1)
    return mag, pha, com
def mag_pha_istft(mag,
                  pha,
                  n_fft,
                  hop_size,
                  win_size,
                  compress_factor=1.0,
                  center=True):
    """Inverse of :func:`mag_pha_stft`: rebuild a waveform from a compressed
    magnitude/phase spectrum.

    Args:
        mag (Tensor): Compressed magnitude, shape (B, F, T).
        pha (Tensor): Phase, shape (B, F, T).
        n_fft (int): FFT size.
        hop_size (int): Hop length in samples.
        win_size (int): Hann window length in samples.
        compress_factor (float): Compression exponent to undo.
        center (bool): Must match the forward transform.

    Returns:
        Tensor: Reconstructed waveform, shape (B, num_samples).
    """
    # Undo the power-law magnitude compression.
    mag = mag.pow(1.0 / compress_factor)
    # torch.polar(abs, angle) == abs*cos(angle) + i*abs*sin(angle).
    com = torch.polar(mag, pha)
    window = torch.hann_window(win_size, device=com.device)
    return torch.istft(
        com,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=window,
        center=center)

View File

@@ -0,0 +1,220 @@
#!/usr/bin/env python3
#
# Copyright (c) Alibaba, Inc. and its affiliates.
# Part of the implementation is borrowed and modified from MP-SENet,
# public available at https://github.com/yxlu-0102/MP-SENet
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
class SubPixelConvTranspose2d(nn.Module):
    """Width-only sub-pixel upsampling layer.

    A stride-1 conv produces ``stride[1]`` interleaved copies of each output
    channel; the copies are then folded into the width (frequency) dimension.
    ``nn.PixelShuffle`` is not used because it rearranges height AND width,
    while here only the width must grow.

    Args:
        in_channels (int): Input channel count.
        out_channels (int): Output channel count.
        kernel_size (tuple): Conv kernel, default (1, 3).
        stride (tuple): ``stride[1]`` is the width upscale factor; the conv
            itself always runs with stride 1.
        padding (tuple): Conv padding; the default keeps H and W unchanged.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=(1, 3),
                 stride=(1, 2),
                 padding=(0, 1)):
        super(SubPixelConvTranspose2d, self).__init__()
        self.upscale_width_factor = stride[1]
        self.conv1 = nn.Conv2d(
            in_channels,
            out_channels * self.upscale_width_factor,
            kernel_size=kernel_size,
            padding=padding)  # only the channel count changes, not H/W

    def forward(self, x):
        """Upsample (B, in_channels, T, F) to (B, out_channels, T, F * factor)."""
        b = x.size(0)
        t = x.size(2)
        x = self.conv1(x)  # (b, out_channels * factor, t, f)
        # Bug fix: derive the channel count from the conv OUTPUT; using the
        # input channel count (as before) breaks whenever
        # in_channels != out_channels.
        factor = self.upscale_width_factor
        c_out = x.size(1) // factor
        f = x.size(3)
        # Split channels into (c_out, factor), then interleave the factor
        # axis into the width axis.
        x = x.view(b, c_out, factor, t, f).permute(0, 1, 3, 4, 2).contiguous()
        x = x.view(b, c_out, t, f * factor)
        return x
class DenseBlockV2(nn.Module):
    """Dilated dense block for ZipEnhancer.

    Each of ``depth`` stages applies a causally padded, time-dilated conv
    (dilations 1, 2, 4, ...) to the concatenation of the block input and all
    previous stage outputs, DenseNet style. Input and output shapes match.
    """

    def __init__(self, h, kernel_size=(2, 3), depth=4):
        """
        Args:
            h (object): Config providing ``dense_channel``.
            kernel_size (tuple): (time, freq) kernel size.
            depth (int): Number of dense stages.
        """
        super(DenseBlockV2, self).__init__()
        self.h = h
        self.depth = depth
        self.dense_block = nn.ModuleList()
        for idx in range(depth):
            dilation = 2**idx
            # Causal time padding: receptive field of the dilated kernel
            # minus one frame, applied to the past side only. Frequency is
            # padded (1, 1) to keep F unchanged for the width-3 kernel.
            causal_pad = kernel_size[0] + (dilation - 1) * (kernel_size[0]
                                                            - 1) - 1
            self.dense_block.append(
                nn.Sequential(
                    nn.ConstantPad2d((1, 1, causal_pad, 0), value=0.),
                    nn.Conv2d(
                        h.dense_channel * (idx + 1),
                        h.dense_channel,
                        kernel_size,
                        dilation=(dilation, 1)),
                    nn.InstanceNorm2d(h.dense_channel, affine=True),
                    nn.PReLU(h.dense_channel)))

    def forward(self, x):
        """Run the dense stack; both input and output are (B, C, T, F)."""
        out = x
        features = x
        for stage in self.dense_block:
            out = stage(features)
            # Grow the feature stack for the next, wider-input stage.
            features = torch.cat([out, features], dim=1)
        return out
class DenseEncoder(nn.Module):
    """Front-end encoder: 1x1 projection, dense block, width-halving conv."""

    def __init__(self, h, in_channel):
        """
        Args:
            h (object): Config providing ``dense_channel``.
            in_channel (int): Input channels, e.g. 2 for (magnitude, phase).
        """
        super(DenseEncoder, self).__init__()
        self.h = h
        self.dense_conv_1 = nn.Sequential(
            nn.Conv2d(in_channel, h.dense_channel, (1, 1)),
            nn.InstanceNorm2d(h.dense_channel, affine=True),
            nn.PReLU(h.dense_channel))
        self.dense_block = DenseBlockV2(h, depth=4)
        # Frequency padding was originally (0, 0); (0, 1) makes the stride-2
        # conv below produce F_out = (F - 1) // 2 + 1.
        self.dense_conv_2 = nn.Sequential(
            nn.Conv2d(
                h.dense_channel,
                h.dense_channel, (1, 3), (1, 2),
                padding=(0, 1)),
            nn.InstanceNorm2d(h.dense_channel, affine=True),
            nn.PReLU(h.dense_channel))

    def forward(self, x):
        """Map (B, in_channel, T, F) to roughly (B, dense_channel, T, F // 2)."""
        x = self.dense_conv_1(x)  # (b, dense_channel, T, F)
        if self.dense_block is not None:
            x = self.dense_block(x)  # shape preserved
        return self.dense_conv_2(x)  # frequency axis halved
class BaseDecoder(nn.Module):
    """Shared plumbing for the magnitude and phase decoders."""

    def __init__(self, h):
        """
        Args:
            h (object): Config providing ``dense_channel``.
        """
        super(BaseDecoder, self).__init__()
        # Subclasses instantiate this to upsample the frequency axis.
        self.upsample_module_class = SubPixelConvTranspose2d
        # One dense block feeding both the magnitude and phase heads.
        self.dense_block = DenseBlockV2(h, depth=4)
class MappingDecoder(BaseDecoder):
    """Magnitude decoder: restores full frequency resolution and maps to one
    channel per output speaker, clamped non-negative."""

    def __init__(self, h, out_channel=1):
        """
        Args:
            h (object): Config providing ``dense_channel``.
            out_channel (int): Number of output speakers. Default 1.
        """
        super(MappingDecoder, self).__init__(h)
        final_kernel = (1, 2)  # trims the extra frequency bin after upsampling
        self.mask_conv = nn.Sequential(
            self.upsample_module_class(h.dense_channel, h.dense_channel,
                                       (1, 3), (1, 2)),
            nn.InstanceNorm2d(h.dense_channel, affine=True),
            nn.PReLU(h.dense_channel),
            nn.Conv2d(h.dense_channel, out_channel, final_kernel))
        # Magnitudes must be non-negative.
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Map (B, C, T, F) to a non-negative (B, out_channel, T, F') tensor."""
        if self.dense_block is not None:
            x = self.dense_block(x)
        return self.relu(self.mask_conv(x))
class PhaseDecoder(BaseDecoder):
    """Phase decoder: predicts real and imaginary components and recovers the
    phase angle with ``atan2``."""

    def __init__(self, h, out_channel=1):
        """
        Args:
            h (object): Config providing ``dense_channel``.
            out_channel (int): Number of output speakers. Default 1.
        """
        super(PhaseDecoder, self).__init__(h)
        # Final kernel was (1, 1); (1, 2) trims the extra frequency bin
        # produced by the upsampler.
        final_kernel = (1, 2)
        self.phase_conv = nn.Sequential(
            self.upsample_module_class(h.dense_channel, h.dense_channel,
                                       (1, 3), (1, 2)),
            nn.InstanceNorm2d(h.dense_channel, affine=True),
            nn.PReLU(h.dense_channel))
        self.phase_conv_r = nn.Conv2d(h.dense_channel, out_channel,
                                      final_kernel)
        self.phase_conv_i = nn.Conv2d(h.dense_channel, out_channel,
                                      final_kernel)

    def forward(self, x):
        """Map (B, C, T, F) to phase angles in (-pi, pi]."""
        if self.dense_block is not None:
            x = self.dense_block(x)
        feats = self.phase_conv(x)
        return torch.atan2(self.phase_conv_i(feats), self.phase_conv_r(feats))

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,501 @@
#!/usr/bin/env python3
#
# Copyright (c) Alibaba, Inc. and its affiliates.
import copy
from typing import List, Optional, Tuple, Union
import torch
from torch import Tensor, nn
from .scaling import FloatLike, ScheduledFloat, convert_num_channels
from .zipformer import (BypassModule, CompactRelPositionalEncoding,
SimpleDownsample, SimpleUpsample,
Zipformer2EncoderLayer)
class DualPathZipformer2Encoder(nn.Module):
    r"""DualPathZipformer2Encoder is a stack of N dual-path encoder layers.

    Each of the ``num_layers`` iterations runs one frequency-path layer
    (sequence axis = F) followed by one time-path layer (sequence axis = T),
    each wrapped in its own bypass module. The features are reshaped as
    [B, C, T, F] -> [F, T * B, C] -> [B, C, T, F] -> [T, F * B, C] -> [B, C, T, F]

    Args:
        encoder_layer: an instance of the Zipformer2EncoderLayer() class (required).
        num_layers: the number of sub-encoder-layers in the encoder (required).
        pos_dim: the dimension for the relative positional encoding

    Examples::
        >>> encoder_layer = Zipformer2EncoderLayer(embed_dim=512, nhead=8)
        >>> dualpath_zipformer_encoder = DualPathZipformer2Encoder(encoder_layer, num_layers=6)
        >>> src = torch.rand(10, 512, 161, 101)
        >>> out = dualpath_zipformer_encoder(src)
    """

    def __init__(
        self,
        encoder_layer: nn.Module,
        num_layers: int,
        pos_dim: int,
        dropout: float,
        warmup_begin: float,
        warmup_end: float,
        initial_layerdrop_rate: float = 0.5,
        final_layerdrop_rate: float = 0.05,
        bypass_layer=None,
    ) -> None:
        """Build the F- and T-path layer stacks and schedule per-layer
        layerdrop rates over consecutive warmup windows.

        Args:
            encoder_layer: Prototype layer, deep-copied for every slot.
            num_layers: Layers per path (2 * num_layers layers total).
            pos_dim: Dimension of the relative positional encoding.
            dropout: Dropout rate (unused here; layers carry their own).
            warmup_begin / warmup_end: Batch-index window over which the
                per-layer layerdrop rates decay.
            initial_layerdrop_rate / final_layerdrop_rate: Endpoints of the
                per-layer layerdrop schedule.
            bypass_layer: Module combining a layer's input and output.
        """
        super().__init__()
        self.encoder_pos = CompactRelPositionalEncoding(
            pos_dim, dropout_rate=0.15, length_factor=1.0)
        self.f_layers = nn.ModuleList(
            [copy.deepcopy(encoder_layer) for i in range(num_layers)])
        self.t_layers = nn.ModuleList(
            [copy.deepcopy(encoder_layer) for i in range(num_layers)])
        # NOTE(review): unlike the layers above, the same ``bypass_layer``
        # instance is placed in all 2 * num_layers slots (no deepcopy), so
        # its parameters are shared — confirm this tying is intentional.
        self.bypass_layers = nn.ModuleList(
            [bypass_layer for i in range(num_layers * 2)])
        self.num_layers = num_layers
        assert 0 <= warmup_begin <= warmup_end, (warmup_begin, warmup_end)
        delta = (1.0 / num_layers) * (warmup_end - warmup_begin)
        cur_begin = warmup_begin  # interpreted as a training batch index
        for i in range(num_layers):
            cur_end = cur_begin + delta
            # Each layer's skip (layerdrop) rate decays across its own
            # consecutive slice of the warmup window.
            self.f_layers[i].bypass.skip_rate = ScheduledFloat(
                (cur_begin, initial_layerdrop_rate),
                (cur_end, final_layerdrop_rate),
                default=0.0,
            )
            self.t_layers[i].bypass.skip_rate = ScheduledFloat(
                (cur_begin, initial_layerdrop_rate),
                (cur_end, final_layerdrop_rate),
                default=0.0,
            )
            cur_begin = cur_end

    def forward(
        self,
        src: Tensor,
        chunk_size: int = -1,
        feature_mask: Union[Tensor, float] = 1.0,
        attn_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
    ) -> Tensor:
        r"""Pass the input through the encoder layers in a dual-path manner,
        processing both temporal and frequency dimensions.

        Args:
            src: the dual-path sequence to the encoder (required):
                shape (batch_size, embedding_dim, seq_len, frequency_len).
            chunk_size: accepted for interface compatibility but not
                forwarded to the layers (chunking is not used here).
            feature_mask: something that broadcasts with src, that we'll
                multiply `src` by before the stack and after every dual-path
                iteration (skipped under TorchScript scripting/tracing).
            attn_mask: accepted for interface compatibility but not
                forwarded to the layers.
            src_key_padding_mask: the mask for padding, of shape
                (batch_size, seq_len); True means masked position. May be None.

        Returns: a Tensor with the same shape as src.
        """
        # src: (b, c, t, f)
        b, c, t, f = src.size()
        # Flattened views used only to derive the positional embeddings for
        # the F-path (sequence length f) and T-path (sequence length t).
        src_f = src.permute(3, 0, 2, 1).contiguous().view(f, b * t, c)
        src_t = src.permute(2, 0, 3, 1).contiguous().view(t, b * f, c)
        pos_emb_f = self.encoder_pos(src_f)
        pos_emb_t = self.encoder_pos(src_t)
        output = src
        if not torch.jit.is_scripting() and not torch.jit.is_tracing():
            output = output * feature_mask
        for i in range(len(self.f_layers)):
            # ---- frequency path: attend along F for every (t, b) pair ----
            output_f_org = output.permute(3, 2, 0,
                                          1).contiguous()  # (f, t, b, c)
            output_f = output_f_org.view(f, t * b, c)
            # (f, t * b, c)
            output_f = self.f_layers[i](
                output_f,
                pos_emb_f,
                src_key_padding_mask=src_key_padding_mask,
            )
            output_f = output_f.view(f, t, b, c)
            output_f = self.bypass_layers[i * 2](output_f_org, output_f)
            # (f, t, b, c)
            output = output_f.permute(2, 3, 1, 0).contiguous()
            # (b, c, t, f)
            # ---- time path: attend along T for every (f, b) pair ----
            output_t_org = output.permute(2, 3, 0,
                                          1).contiguous()  # (t, f, b, c)
            output_t = output_t_org.view(t, f * b, c)
            output_t = self.t_layers[i](
                output_t,
                pos_emb_t,
                src_key_padding_mask=src_key_padding_mask,
            )
            output_t = output_t.view(t, f, b, c)
            output_t = self.bypass_layers[i * 2 + 1](output_t_org, output_t)
            # (t, f, b, c)
            output = output_t.permute(2, 3, 0, 1).contiguous()
            # (b, c, t, f)
            if not torch.jit.is_scripting() and not torch.jit.is_tracing():
                output = output * feature_mask
        return output
class DualPathDownsampledZipformer2Encoder(nn.Module):
    r"""
    DualPathDownsampledZipformer2Encoder is a dual-path zipformer encoder evaluated at a reduced frame rate,
    after convolutional downsampling, and then upsampled again at the output, and combined
    with the origin input, so that the output has the same shape as the input.

    The features are downsampled-upsampled at the time and frequency domain.
    """

    def __init__(self, encoder: nn.Module, dim: int, t_downsample: int,
                 f_downsample: int, dropout: FloatLike):
        """Initialize the module.

        Args:
            encoder: Inner encoder run at the reduced resolution.
            dim: Channel dimension (C) of the features.
            t_downsample: Downsampling factor along time; 1 disables the
                time down/upsample pair.
            f_downsample: Downsampling factor along frequency; 1 disables the
                frequency down/upsample pair.
            dropout: Dropout passed to the downsampling modules.
        """
        super(DualPathDownsampledZipformer2Encoder, self).__init__()
        # Kept alongside t_downsample_factor for compatibility; both hold the
        # time factor.
        self.downsample_factor = t_downsample
        self.t_downsample_factor = t_downsample
        self.f_downsample_factor = f_downsample
        if self.t_downsample_factor != 1:
            self.downsample_t = SimpleDownsample(dim, t_downsample, dropout)
            self.upsample_t = SimpleUpsample(dim, t_downsample)
        if self.f_downsample_factor != 1:
            self.downsample_f = SimpleDownsample(dim, f_downsample, dropout)
            self.upsample_f = SimpleUpsample(dim, f_downsample)
        self.encoder = encoder
        # Blends the full-resolution input with the processed branch.
        self.out_combiner = BypassModule(dim, straight_through_rate=0)

    def forward(
        self,
        src: Tensor,
        chunk_size: int = -1,
        feature_mask: Union[Tensor, float] = 1.0,
        attn_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
    ) -> Tensor:
        r"""Downsample the input, process through the encoder, and then upsample back to the original shape.

        Args:
            src: the sequence to the encoder (required): shape (batch_size, embedding_dim, seq_len, frequency_len).
            chunk_size: forwarded unchanged to the inner encoder.
            feature_mask: forwarded unchanged to the inner encoder.
            attn_mask: forwarded unchanged to the inner encoder.
            src_key_padding_mask: forwarded unchanged to the inner encoder.

        Returns: a Tensor with the same shape as src. (batch_size, embedding_dim, seq_len, frequency_len)
        """
        # src: (b, c, t, f)
        b, c, t, f = src.size()
        # Full-resolution copy kept for the output combiner, in (t, f, b, c).
        src_orig = src.permute(2, 3, 0, 1)  # (t, f, b, c)
        # Fold batch and frequency together so time is the sequence axis.
        src = src.permute(2, 0, 3, 1).contiguous().view(t, b * f, c)
        # -> (t, b * f, c)
        if self.t_downsample_factor != 1:
            src = self.downsample_t(src)
            # (t//ds + 1, b * f, c)
        downsample_t = src.size(0)
        # Refold so frequency becomes the sequence axis.
        src = src.view(downsample_t, b, f,
                       c).permute(2, 1, 0,
                                  3).contiguous().view(f, b * downsample_t, c)
        if self.f_downsample_factor != 1:
            src = self.downsample_f(src)
            # (f//ds + 1, b * downsample_t, c)
        downsample_f = src.size(0)
        src = src.view(downsample_f, b, downsample_t, c).permute(1, 3, 2, 0)
        # (b, c, downsample_t, downsample_f)
        src = self.encoder(
            src,
            chunk_size=chunk_size,
            feature_mask=feature_mask,
            attn_mask=attn_mask,
            src_key_padding_mask=src_key_padding_mask,
        )
        # (b, c, downsample_t, downsample_f)
        src = src.permute(3, 0, 2,
                          1).contiguous().view(downsample_f, b * downsample_t,
                                               c)
        if self.f_downsample_factor != 1:
            src = self.upsample_f(src)
            # (f, b * downsample_t, c) after the [:f] trim below
        # Trim any frames beyond the original frequency length, then refold
        # so time is the sequence axis again.
        src = src[:f].view(f, b, downsample_t,
                           c).permute(2, 1, 0, 3).contiguous().view(
                               downsample_t, b * f, c)
        # (downsample_t, b * f, c)
        if self.t_downsample_factor != 1:
            src = self.upsample_t(src)
            # (t, b * f, c) after the [:t] trim below
        src = src[:t].view(t, b, f, c).permute(0, 2, 1, 3).contiguous()
        # (t, f, b, c)
        out = self.out_combiner(src_orig, src)
        # (t, f, b, c)
        out = out.permute(2, 3, 0, 1).contiguous()
        # (b, c, t, f)
        return out
class Zipformer2DualPathEncoder(nn.Module):
    """Stack of dual-path Zipformer encoder stages with per-stage
    time/frequency downsampling."""

    def __init__(
        self,
        output_downsampling_factor: int = 2,
        downsampling_factor: Tuple[int] = (2, 4),
        f_downsampling_factor: Tuple[int] = None,
        encoder_dim: Union[int, Tuple[int]] = 384,
        num_encoder_layers: Union[int, Tuple[int]] = 4,
        encoder_unmasked_dim: Union[int, Tuple[int]] = 256,
        query_head_dim: Union[int, Tuple[int]] = 24,
        pos_head_dim: Union[int, Tuple[int]] = 4,
        value_head_dim: Union[int, Tuple[int]] = 12,
        num_heads: Union[int, Tuple[int]] = 8,
        feedforward_dim: Union[int, Tuple[int]] = 1536,
        cnn_module_kernel: Union[int, Tuple[int]] = 31,
        pos_dim: int = 192,
        dropout: FloatLike = None,  # see code below for default
        warmup_batches: float = 4000.0,
        causal: bool = False,
        # NOTE(review): list defaults for chunk_size/left_context_frames are
        # mutable default arguments; they are only stored here, never
        # mutated, but tuples would be safer — confirm before changing.
        chunk_size: Tuple[int] = [-1],
        left_context_frames: Tuple[int] = [-1],
    ):
        """
        Initialize the Zipformer2DualPathEncoder module.

        Zipformer2DualPathEncoder processes the hidden features of the noisy speech using dual-path modeling.
        It has two kinds of blocks: DualPathZipformer2Encoder and DualPathDownsampledZipformer2Encoder.
        DualPathZipformer2Encoder processes the 4D features with the shape of [B, C, T, F].
        DualPathDownsampledZipformer2Encoder first downsamples the hidden features
        and processes features using dual-path modeling like DualPathZipformer2Encoder.

        Per-stage parameters may be given as a single int (broadcast to all
        stages) or as a tuple with one entry per entry of
        ``downsampling_factor``.

        Args:
            Various hyperparameters and settings for the encoder.
        """
        super(Zipformer2DualPathEncoder, self).__init__()
        if dropout is None:
            dropout = ScheduledFloat((0.0, 0.3), (20000.0, 0.1))

        def _to_tuple(x):
            """Converts a single int or a 1-tuple of an int to a tuple with the same length
            as downsampling_factor"""
            if isinstance(x, int):
                x = (x, )
            if len(x) == 1:
                x = x * len(downsampling_factor)
            else:
                assert len(x) == len(downsampling_factor) and isinstance(
                    x[0], int)
            return x

        self.output_downsampling_factor = output_downsampling_factor  # int
        self.downsampling_factor = downsampling_factor  # tuple
        if f_downsampling_factor is None:
            # default: mirror the time-axis factors on the frequency axis
            f_downsampling_factor = downsampling_factor
        self.f_downsampling_factor = _to_tuple(f_downsampling_factor)
        self.encoder_dim = encoder_dim = _to_tuple(encoder_dim)  # tuple
        self.encoder_unmasked_dim = encoder_unmasked_dim = _to_tuple(
            encoder_unmasked_dim)  # tuple
        num_encoder_layers = _to_tuple(num_encoder_layers)
        self.num_encoder_layers = num_encoder_layers
        self.query_head_dim = query_head_dim = _to_tuple(query_head_dim)
        self.value_head_dim = value_head_dim = _to_tuple(value_head_dim)
        pos_head_dim = _to_tuple(pos_head_dim)
        self.num_heads = num_heads = _to_tuple(num_heads)
        feedforward_dim = _to_tuple(feedforward_dim)
        self.cnn_module_kernel = cnn_module_kernel = _to_tuple(
            cnn_module_kernel)
        self.causal = causal
        self.chunk_size = chunk_size
        self.left_context_frames = left_context_frames
        for u, d in zip(encoder_unmasked_dim, encoder_dim):
            assert u <= d
        # each one will be Zipformer2Encoder or DownsampledZipformer2Encoder
        encoders = []
        num_encoders = len(downsampling_factor)
        for i in range(num_encoders):
            encoder_layer = Zipformer2EncoderLayer(
                embed_dim=encoder_dim[i],
                pos_dim=pos_dim,
                num_heads=num_heads[i],
                query_head_dim=query_head_dim[i],
                pos_head_dim=pos_head_dim[i],
                value_head_dim=value_head_dim[i],
                feedforward_dim=feedforward_dim[i],
                dropout=dropout,
                cnn_module_kernel=cnn_module_kernel[i],
                causal=causal,
            )
            # For the segment of the warmup period, we let the Conv2dSubsampling
            # layer learn something. Then we start to warm up the other encoders.
            encoder = DualPathZipformer2Encoder(
                encoder_layer,
                num_encoder_layers[i],
                pos_dim=pos_dim,
                dropout=dropout,
                warmup_begin=warmup_batches * (i + 1) / (num_encoders + 1),
                warmup_end=warmup_batches * (i + 2) / (num_encoders + 1),
                final_layerdrop_rate=0.035 * (downsampling_factor[i]**0.5),
                bypass_layer=BypassModule(
                    encoder_dim[i], straight_through_rate=0),
            )
            # NOTE(review): this indexes the RAW f_downsampling_factor, not
            # the tupled self.f_downsampling_factor — it works for the None /
            # tuple cases but would fail if a plain int were passed; confirm.
            if downsampling_factor[i] != 1 or f_downsampling_factor[i] != 1:
                encoder = DualPathDownsampledZipformer2Encoder(
                    encoder,
                    dim=encoder_dim[i],
                    t_downsample=downsampling_factor[i],
                    f_downsample=f_downsampling_factor[i],
                    dropout=dropout,
                )
            encoders.append(encoder)
        self.encoders = nn.ModuleList(encoders)
        self.downsample_output = SimpleDownsample(
            max(encoder_dim),
            downsample=output_downsampling_factor,
            dropout=dropout)

    def forward(self, x):
        """
        Forward pass of the Zipformer2DualPathEncoder module.

        Args:
            x (Tensor): Input tensor of shape [B, C, T, F].

        Returns:
            Tensor: Output of the LAST stage only (shape [B, C, T, F]);
            per-stage outputs are collected in ``outputs`` but not returned,
            and ``self.downsample_output`` is not applied here.
        """
        outputs = []
        # Feature masking is disabled here (always 1.0 for every stage).
        feature_masks = [1.0] * len(self.encoder_dim)
        attn_mask = None
        chunk_size = -1
        for i, module in enumerate(self.encoders):
            # Pad/trim channels to this stage's width before running it.
            x = convert_num_channels(x, self.encoder_dim[i])
            x = module(
                x,
                chunk_size=chunk_size,
                feature_mask=feature_masks[i],
                src_key_padding_mask=None,
                attn_mask=attn_mask,
            )
            outputs.append(x)
        # (b, c, t, f)
        return x
if __name__ == '__main__':
    # Smoke test: build a small 4-stack dual-path encoder and push one
    # random [B, C, T, F] batch through it.
    downsampling_factor = (1, 2, 4, 3)
    encoder_dim = (16, 32, 64, 64)
    pos_dim = 48  # Zipformer-base setting
    num_heads = (4, 4, 4, 4)  # "4,4,4,8,4,4"
    n_stacks = len(downsampling_factor)
    query_head_dim = (16, ) * n_stacks  # 32
    pos_head_dim = (4, ) * n_stacks  # 4
    value_head_dim = (12, ) * n_stacks  # 12
    feedforward_dim = (32, 64, 128, 128)
    dropout = ScheduledFloat((0.0, 0.3), (20000.0, 0.1))
    cnn_module_kernel = (15, ) * n_stacks  # 31,31,15,15,15,31
    causal = False
    encoder_unmasked_dim = (16, ) * n_stacks
    num_encoder_layers = (1, 1, 1, 1)
    warmup_batches = 4000.0
    net = Zipformer2DualPathEncoder(
        output_downsampling_factor=1,
        downsampling_factor=downsampling_factor,
        num_encoder_layers=num_encoder_layers,
        encoder_dim=encoder_dim,
        encoder_unmasked_dim=encoder_unmasked_dim,
        query_head_dim=query_head_dim,
        pos_head_dim=pos_head_dim,
        value_head_dim=value_head_dim,
        pos_dim=pos_dim,
        num_heads=num_heads,
        feedforward_dim=feedforward_dim,
        cnn_module_kernel=cnn_module_kernel,
        dropout=ScheduledFloat((0.0, 0.3), (20000.0, 0.1)),
        warmup_batches=warmup_batches,
        causal=causal,
    )
    # Dummy input: batch=4, channels=64, time=321 frames, freq=101 bins.
    batch, channels, frames, bins = 4, 64, 321, 101
    x = torch.randn((batch, channels, frames, bins))
    x = net(x)
    print(x.size())

File diff suppressed because it is too large Load Diff

View File

@@ -122,3 +122,127 @@ class ANSPipeline(Pipeline):
np.frombuffer(inputs[OutputKeys.OUTPUT_PCM], dtype=np.int16),
self.SAMPLE_RATE)
return inputs
@PIPELINES.register_module(
    Tasks.acoustic_noise_suppression,
    module_name=Pipelines.speech_zipenhancer_ans_multiloss_16k_base)
class ANSZipEnhancerPipeline(Pipeline):
    r"""ANS (Acoustic Noise Suppression) Inference Pipeline based on ZipEnhancer.

    When invoked with pipeline.__call__(), it accepts only one parameter:
        inputs (str or bytes): the path of a wav file, or raw wav bytes.
    """

    # All processing is done at 16 kHz; other rates are resampled in preprocess().
    SAMPLE_RATE = 16000

    def __init__(self, model, **kwargs):
        """
        use `model` and `preprocessor` to create a kws pipeline for prediction
        Args:
            model: model id on modelscope hub.
        """
        super().__init__(model=model, **kwargs)
        self.model.eval()
        self.stream_mode = kwargs.get('stream_mode', False)

    def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]:
        """Decode, downmix, resample and normalize the input audio.

        Returns:
            dict with 'ndarray': float32 array of shape [1, num_samples],
            and 'nsamples': the original sample count (used to trim padding
            after enhancement).
        """
        if self.stream_mode:
            raise TypeError('This model does not support stream mode!')
        if isinstance(inputs, bytes):
            data1, fs = sf.read(io.BytesIO(inputs))
        elif isinstance(inputs, str):
            data1, fs = sf.read(inputs)
        else:
            raise TypeError(f'Unsupported type {type(inputs)}.')
        if len(data1.shape) > 1:
            # Multi-channel input: keep only the first channel.
            data1 = data1[:, 0]
        if fs != self.SAMPLE_RATE:
            data1 = librosa.resample(
                data1, orig_sr=fs, target_sr=self.SAMPLE_RATE)
        data1 = audio_norm(data1)
        data = data1.astype(np.float32)
        inputs = np.reshape(data, [1, data.shape[0]])
        return {'ndarray': inputs, 'nsamples': data.shape[0]}

    def forward(self, inputs: Dict[str, Any],
                **forward_params) -> Dict[str, Any]:
        """Enhance the audio, segmenting long inputs with overlapped windows.

        Inputs longer than 10 s are processed in 2 s windows advanced by a
        1.5 s stride; at each seam the overlapped edges of a window's output
        are discarded to hide boundary artifacts.

        Returns:
            dict with OutputKeys.OUTPUT_PCM: enhanced 16-bit PCM bytes.
        """
        ndarray = inputs['ndarray']
        if isinstance(ndarray, torch.Tensor):
            ndarray = ndarray.cpu().numpy()
        nsamples = inputs['nsamples']
        decode_do_segement = False
        # Fix: was hard-coded `16000 * 2`; derive from the class constant so
        # the window stays consistent with SAMPLE_RATE.
        window = self.SAMPLE_RATE * 2  # 2s
        stride = int(window * 0.75)
        print('inputs:{}'.format(ndarray.shape))
        b, t = ndarray.shape  # size()
        if t > window * 5:  # 10s
            decode_do_segement = True
            print('decode_do_segement')
        if t < window:
            # Shorter than one window: zero-pad up to a full window.
            ndarray = np.concatenate(
                [ndarray, np.zeros((ndarray.shape[0], window - t))], 1)
        elif decode_do_segement:
            if t < window + stride:
                padding = window + stride - t
                print('padding: {}'.format(padding))
                ndarray = np.concatenate(
                    [ndarray, np.zeros((ndarray.shape[0], padding))], 1)
            else:
                if (t - window) % stride != 0:
                    # Pad so (t - window) is a multiple of stride and the
                    # sliding window exactly covers the whole signal.
                    padding = (
                        (t - window) // stride + 1) * stride + window - t
                    print('padding: {}'.format(padding))
                    ndarray = np.concatenate(
                        [ndarray,
                         np.zeros((ndarray.shape[0], padding))], 1)
        print('inputs after padding:{}'.format(ndarray.shape))
        with torch.no_grad():
            ndarray = torch.from_numpy(np.float32(ndarray)).to(self.device)
            b, t = ndarray.shape
            if decode_do_segement:
                outputs = np.zeros(t)
                # Half of the window/stride overlap is dropped on each side.
                give_up_length = (window - stride) // 2
                current_idx = 0
                while current_idx + window <= t:
                    print(
                        '\rcurrent_idx: {} {:.2f}%'.format(
                            current_idx, current_idx * 100 / t),
                        end='')
                    tmp_input = dict(noisy=ndarray[:, current_idx:current_idx
                                                   + window])
                    tmp_output = self.model(
                        tmp_input, )['wav_l2'][0].cpu().numpy()
                    end_index = current_idx + window - give_up_length
                    if current_idx == 0:
                        # First window: keep the head, drop only the tail edge.
                        outputs[current_idx:
                                end_index] = tmp_output[:-give_up_length]
                    else:
                        # Interior windows: drop both overlapped edges.
                        # NOTE(review): the final give_up_length samples of the
                        # padded signal are never written and stay zero; they
                        # normally fall inside the padding, but could clip real
                        # audio when the required padding is short — confirm.
                        outputs[current_idx
                                + give_up_length:end_index] = tmp_output[
                                    give_up_length:-give_up_length]
                    current_idx += stride
                print('\rcurrent_idx: {} {:.2f}%'.format(current_idx, 100))
            else:
                outputs = self.model(
                    dict(noisy=ndarray))['wav_l2'][0].cpu().numpy()
        # Trim the padding back to the original length and quantize to int16.
        outputs = (outputs[:nsamples] * 32768).astype(np.int16).tobytes()
        return {OutputKeys.OUTPUT_PCM: outputs}

    def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
        """Optionally write the enhanced PCM to 'output_path' as a wav file."""
        if 'output_path' in kwargs.keys():
            sf.write(
                kwargs['output_path'],
                np.frombuffer(inputs[OutputKeys.OUTPUT_PCM], dtype=np.int16),
                self.SAMPLE_RATE)
        return inputs

View File

@@ -1,6 +1,6 @@
addict
attrs
datasets>=3.0.0
datasets>=3.0.0,<=3.0.1
einops
oss2
Pillow

View File

@@ -1,6 +1,6 @@
addict
attrs
datasets>=3.0.0
datasets>=3.0.0,<=3.0.1
einops
oss2
Pillow

View File

@@ -150,6 +150,36 @@ class SpeechSignalProcessTest(unittest.TestCase):
w.write(pcm)
audio = f.read(block_size)
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_zipenhancer_ans(self):
model_id = 'damo/speech_zipenhancer_ans_multiloss_16k_base'
ans = pipeline(Tasks.acoustic_noise_suppression, model=model_id)
output_path = os.path.abspath('output.wav')
ans(os.path.join(os.getcwd(), NOISE_SPEECH_FILE),
output_path=output_path)
print(f'Processed audio saved to {output_path}')
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_zipenhancer_ans_url(self):
model_id = 'damo/speech_zipenhancer_ans_multiloss_16k_base'
ans = pipeline(Tasks.acoustic_noise_suppression, model=model_id)
output_path = os.path.abspath('output.wav')
ans(NOISE_SPEECH_URL, output_path=output_path)
print(f'Processed audio saved to {output_path}')
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_zipenhancer_ans_bytes(self):
model_id = 'damo/speech_zipenhancer_ans_multiloss_16k_base'
ans = pipeline(
Tasks.acoustic_noise_suppression,
model=model_id,
pipeline_name=Pipelines.speech_zipenhancer_ans_multiloss_16k_base)
output_path = os.path.abspath('output.wav')
with open(os.path.join(os.getcwd(), NOISE_SPEECH_FILE), 'rb') as f:
data = f.read()
ans(data, output_path=output_path)
print(f'Processed audio saved to {output_path}')
# Allow running this test module directly (outside the test runner).
if __name__ == '__main__':
    unittest.main()