Merge commit 'e71057cacf6b9f977e20f41e6d767b4ad1557171'

* commit 'e71057cacf6b9f977e20f41e6d767b4ad1557171':
  feat(audio/ans): Add ZipEnhancer and related layers for acoustic noise suppression (#1019)
  Fix timestamp in docker build (#1049)
  Fix pypi mirror (#1048)
  Fix build error (#1047)
  hotfix for datasets 3.0.2 (#1046)
  Update docker scripts (#1044)
  Add docker workflow name (#1043)
This commit is contained in:
yuze.zyz
2024-10-24 20:42:11 +08:00
13 changed files with 3291 additions and 20 deletions

View File

@@ -1,24 +1,30 @@
name: Build Docker Images
name: Build Docker Image
on:
workflow_dispatch:
inputs:
workflow_name:
description: 'The specific name of this build'
required: true
default: 'build'
modelscope_branch:
description: 'ModelScope branch to build from'
description: 'ModelScope branch to build from(release/x.xx)'
required: true
image_type:
description: 'The image type to build'
description: 'The image type to build(cpu/gpu/llm)'
required: true
modelscope_version:
description: 'ModelScope version to use'
description: 'ModelScope version to use(x.xx.x)'
required: true
swift_branch:
description: 'SWIFT branch to use'
description: 'SWIFT branch to use(release/x.xx)'
required: true
other_params:
description: 'Other params in --xxx xxx'
required: false
run-name: Docker-${{ inputs.modelscope_branch }}-${{ inputs.image_type }}-${{ inputs.workflow_name }}-by-@${{ github.actor }}
jobs:
build:
runs-on: [modelscope-self-hosted-us]

View File

@@ -12,10 +12,6 @@ RUN apt-get update && \
{extra_content}
RUN pip config set global.index-url https://mirrors.aliyun.com/pypi/simple && \
pip config set install.trusted-host mirrors.aliyun.com && \
cp /tmp/resources/ubuntu2204.aliyun /etc/apt/sources.list
COPY {meta_file} /tmp/install.sh
RUN sh /tmp/install.sh {version_args}
@@ -28,6 +24,10 @@ RUN cd /tmp && GIT_LFS_SKIP_SMUDGE=1 git clone -b {modelscope_branch} --single
RUN cd /tmp && GIT_LFS_SKIP_SMUDGE=1 git clone -b {swift_branch} --single-branch https://github.com/modelscope/ms-swift.git && cd ms-swift && pip install .[all] && cd / && rm -fr /tmp/ms-swift && pip cache purge;
RUN pip config set global.index-url https://mirrors.aliyun.com/pypi/simple && \
pip config set install.trusted-host mirrors.aliyun.com && \
cp /tmp/resources/ubuntu2204.aliyun /etc/apt/sources.list
ENV SETUPTOOLS_USE_DISTUTILS=stdlib
ENV VLLM_USE_MODELSCOPE=True
ENV LMDEPLOY_USE_MODELSCOPE=True

View File

@@ -1,9 +1,12 @@
import argparse
import os
from datetime import datetime
from typing import Any
docker_registry = os.environ['DOCKER_REGISTRY']
assert docker_registry, 'You must pass a valid DOCKER_REGISTRY'
timestamp = datetime.now()
formatted_time = timestamp.strftime('%Y%m%d%H%M%S')
class Builder:
@@ -85,12 +88,16 @@ class BaseCPUImageBuilder(Builder):
return content
def build(self):
image_tag = f'{docker_registry}:ubuntu{self.args.ubuntu_version}-torch{self.args.torch_version}-base'
image_tag = (
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-{self.args.python_tag}-'
f'torch{self.args.torch_version}-base')
return os.system(
f'DOCKER_BUILDKIT=0 docker build -t {image_tag} -f Dockerfile .')
def push(self):
image_tag = f'{docker_registry}:ubuntu{self.args.ubuntu_version}-torch{self.args.torch_version}-base'
image_tag = (
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-{self.args.python_tag}-'
f'torch{self.args.torch_version}-base')
return os.system(f'docker push {image_tag}')
@@ -110,14 +117,14 @@ class BaseGPUImageBuilder(Builder):
def build(self) -> int:
image_tag = (
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-'
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-{self.args.python_tag}-'
f'torch{self.args.torch_version}-tf{self.args.tf_version}-base')
return os.system(
f'DOCKER_BUILDKIT=0 docker build -t {image_tag} -f Dockerfile .')
def push(self):
image_tag = (
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-'
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-{self.args.python_tag}-'
f'torch{self.args.torch_version}-tf{self.args.tf_version}-base')
return os.system(f'docker push {image_tag}')
@@ -129,7 +136,9 @@ class CPUImageBuilder(Builder):
version_args = (
f'{self.args.torch_version} {self.args.torchvision_version} '
f'{self.args.torchaudio_version}')
base_image = f'{docker_registry}:ubuntu{self.args.ubuntu_version}-torch{self.args.torch_version}-base'
base_image = (
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-{self.args.python_tag}'
f'-torch{self.args.torch_version}-base')
extra_content = """\nRUN pip install adaseq\nRUN pip install pai-easycv"""
with open('docker/Dockerfile.ubuntu', 'r') as f:
@@ -157,7 +166,17 @@ class CPUImageBuilder(Builder):
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-{self.args.python_tag}-'
f'torch{self.args.torch_version}-{self.args.modelscope_version}-test'
)
return os.system(f'docker push {image_tag}')
ret = os.system(f'docker push {image_tag}')
if ret != 0:
return ret
image_tag2 = (
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-{self.args.python_tag}-'
f'torch{self.args.torch_version}-{self.args.modelscope_version}-{formatted_time}-test'
)
ret = os.system(f'docker tag {image_tag} {image_tag2}')
if ret != 0:
return ret
return os.system(f'docker push {image_tag2}')
class GPUImageBuilder(Builder):
@@ -170,7 +189,7 @@ class GPUImageBuilder(Builder):
f'{self.args.vllm_version} {self.args.lmdeploy_version} {self.args.autogptq_version}'
)
base_image = (
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-'
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-{self.args.python_tag}-'
f'torch{self.args.torch_version}-tf{self.args.tf_version}-base')
with open('docker/Dockerfile.ubuntu', 'r') as f:
content = f.read()
@@ -196,7 +215,17 @@ class GPUImageBuilder(Builder):
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-'
f'{self.args.python_tag}-torch{self.args.torch_version}-tf{self.args.tf_version}-'
f'{self.args.modelscope_version}-test')
return os.system(f'docker push {image_tag}')
ret = os.system(f'docker push {image_tag}')
if ret != 0:
return ret
image_tag2 = (
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-'
f'{self.args.python_tag}-torch{self.args.torch_version}-tf{self.args.tf_version}-'
f'{self.args.modelscope_version}-{formatted_time}-test')
ret = os.system(f'docker tag {image_tag} {image_tag2}')
if ret != 0:
return ret
return os.system(f'docker push {image_tag2}')
class LLMImageBuilder(Builder):
@@ -253,7 +282,17 @@ class LLMImageBuilder(Builder):
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-'
f'{self.args.python_tag}-torch{self.args.torch_version}-{self.args.modelscope_version}-LLM-test'
)
return os.system(f'docker push {image_tag}')
ret = os.system(f'docker push {image_tag}')
if ret != 0:
return ret
image_tag2 = (
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-'
f'{self.args.python_tag}-torch{self.args.torch_version}-'
f'{self.args.modelscope_version}-LLM-{formatted_time}-test')
ret = os.system(f'docker tag {image_tag} {image_tag2}')
if ret != 0:
return ret
return os.system(f'docker push {image_tag2}')
parser = argparse.ArgumentParser()

View File

@@ -193,6 +193,7 @@ class Models(object):
# audio models
sambert_hifigan = 'sambert-hifigan'
speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
speech_zipenhancer_ans_multiloss_16k_base = 'speech_zipenhancer_ans_multiloss_16k_base'
speech_dfsmn_ans = 'speech_dfsmn_ans'
speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield'
speech_dfsmn_kws_char_farfield_iot = 'speech_dfsmn_kws_char_farfield_iot'
@@ -551,6 +552,7 @@ class Pipelines(object):
sambert_hifigan_tts = 'sambert-hifigan-tts'
speech_dfsmn_aec_psm_16k = 'speech-dfsmn-aec-psm-16k'
speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
speech_zipenhancer_ans_multiloss_16k_base = 'speech_zipenhancer_ans_multiloss_16k_base'
speech_dfsmn_ans_psm_48k_causal = 'speech_dfsmn_ans_psm_48k_causal'
speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield'
speech_separation = 'speech-separation'

View File

@@ -0,0 +1,210 @@
#!/usr/bin/env python3
#
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import random
from typing import Dict
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from modelscope.metainfo import Models
from modelscope.models import TorchModel
from modelscope.models.base import Tensor
from modelscope.models.builder import MODELS
from modelscope.utils.constant import ModelFile, Tasks
from .zipenhancer_layers.generator import (DenseEncoder, MappingDecoder,
PhaseDecoder)
from .zipenhancer_layers.scaling import ScheduledFloat
from .zipenhancer_layers.zipenhancer_layer import Zipformer2DualPathEncoder
@MODELS.register_module(
    Tasks.acoustic_noise_suppression,
    module_name=Models.speech_zipenhancer_ans_multiloss_16k_base)
class ZipenhancerDecorator(TorchModel):
    """ModelScope wrapper around the ZipEnhancer denoising generator.

    Builds the underlying :class:`ZipEnhancer` network from hyperparameters
    passed via ``kwargs`` and, when a torch checkpoint exists in
    ``model_dir``, restores its weights.
    """

    def __init__(self, model_dir: str, *args, **kwargs):
        """Initialize the wrapper and optionally load pretrained weights.

        Args:
            model_dir (str): Directory containing the model files; weights
                are read from ``ModelFile.TORCH_MODEL_BIN_FILE`` if present.
            **kwargs: Must contain ``num_tsconformers``, ``dense_channel``,
                ``former_conf``, ``batch_first`` and ``model_num_spks``.
        """
        super().__init__(model_dir, *args, **kwargs)
        h = dict(
            num_tsconformers=kwargs['num_tsconformers'],
            dense_channel=kwargs['dense_channel'],
            former_conf=kwargs['former_conf'],
            batch_first=kwargs['batch_first'],
            model_num_spks=kwargs['model_num_spks'],
        )
        # AttrDict exposes the config keys as attributes (h.dense_channel, ...).
        h = AttrDict(h)
        self.model = ZipEnhancer(h)
        model_bin_file = os.path.join(model_dir,
                                      ModelFile.TORCH_MODEL_BIN_FILE)
        if os.path.exists(model_bin_file):
            # Always load on CPU; the caller moves the model to a device later.
            checkpoint = torch.load(
                model_bin_file, map_location=torch.device('cpu'))
            if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
                # A model re-trained by a user is saved at the decorator level,
                # so keys are prefixed with 'model.'.
                self.load_state_dict(checkpoint['state_dict'])
            else:
                # The released ModelScope checkpoint stores the raw generator
                # weights under the 'generator' key.
                self.model.load_state_dict(checkpoint['generator'])

    def forward(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]:
        """Denoise a waveform.

        Args:
            inputs: Dict with key ``'noisy'`` holding the noisy waveform.
                Assumes shape (batch, num_samples) — the normalization below
                divides by ``shape[1]``; TODO confirm against the pipeline.

        Returns:
            Dict with key ``'wav_l2'`` holding the enhanced waveform at the
            original scale.
        """
        # Fixed 16 kHz analysis setup: 25 ms window, 6.25 ms hop.
        n_fft = 400
        hop_size = 100
        win_size = 400
        noisy_wav = inputs['noisy']
        # RMS-style normalization so the network sees unit-energy input;
        # the inverse factor is applied to the output below.
        norm_factor = torch.sqrt(noisy_wav.shape[1]
                                 / torch.sum(noisy_wav**2.0))
        noisy_audio = (noisy_wav * norm_factor)
        mag, pha, com = mag_pha_stft(
            noisy_audio,
            n_fft,
            hop_size,
            win_size,
            compress_factor=0.3,
            center=True)
        amp_g, pha_g, com_g, _, others = self.model.forward(mag, pha)
        wav = mag_pha_istft(
            amp_g,
            pha_g,
            n_fft,
            hop_size,
            win_size,
            compress_factor=0.3,
            center=True)
        wav = wav / norm_factor
        output = {
            'wav_l2': wav,
        }
        return output
class ZipEnhancer(nn.Module):
    """ZipEnhancer generator: dense encoder, dual-path Zipformer trunk and
    separate magnitude/phase decoders.

    Args:
        h (object): Configuration object providing ``num_tsconformers``,
            ``dense_channel``, ``former_conf``, ``model_num_spks``, ...
    """

    def __init__(self, h):
        super(ZipEnhancer, self).__init__()
        self.h = h
        self.num_tscblocks = h.num_tsconformers
        # Two input channels: stacked magnitude and phase planes.
        self.dense_encoder = DenseEncoder(h, in_channel=2)
        self.TSConformer = Zipformer2DualPathEncoder(
            output_downsampling_factor=1,
            dropout=ScheduledFloat((0.0, 0.3), (20000.0, 0.1)),
            **h.former_conf)
        self.mask_decoder = MappingDecoder(h, out_channel=h.model_num_spks)
        self.phase_decoder = PhaseDecoder(h, out_channel=h.model_num_spks)

    def forward(self, noisy_mag, noisy_pha):  # [B, F, T]
        """Denoise a (magnitude, phase) spectrogram pair.

        Args:
            noisy_mag (Tensor): Noisy magnitude, shape [B, F, T].
            noisy_pha (Tensor): Noisy phase, shape [B, F, T].

        Returns:
            Tuple: denoised magnitude [B, F, T], denoised phase [B, F, T],
            denoised complex spectrum [B, F, T, 2], a ``None`` placeholder,
            and a dict of auxiliary outputs (empty here).
        """
        aux = dict()
        # [B, F, T] -> [B, 1, T, F], then stack the two planes on channels.
        mag_in = noisy_mag.unsqueeze(-1).permute(0, 3, 2, 1)
        pha_in = noisy_pha.unsqueeze(-1).permute(0, 3, 2, 1)
        feats = torch.cat((mag_in, pha_in), dim=1)  # [B, 2, T, F]
        feats = self.dense_encoder(feats)  # [B, C, T, F]
        feats = self.TSConformer(feats)
        est_mag = self.mask_decoder(feats)  # [B, num_spks, T, F]
        est_pha = self.phase_decoder(feats)
        # Keep only the first speaker channel and return to [B, F, T].
        denoised_mag = est_mag[:, 0].transpose(1, 2)
        denoised_pha = est_pha[:, 0].transpose(1, 2)
        denoised_com = torch.stack((denoised_mag * torch.cos(denoised_pha),
                                    denoised_mag * torch.sin(denoised_pha)),
                                   dim=-1)
        return denoised_mag, denoised_pha, denoised_com, None, aux
class AttrDict(dict):
    """A ``dict`` whose entries are also reachable as attributes.

    Pointing ``__dict__`` at the mapping itself makes ``obj.key`` and
    ``obj['key']`` read and write the very same storage.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.__dict__ = self
def mag_pha_stft(y,
                 n_fft,
                 hop_size,
                 win_size,
                 compress_factor=1.0,
                 center=True):
    """Compute a magnitude/phase STFT with power-law magnitude compression.

    Args:
        y (Tensor): Waveform batch, shape (B, num_samples).
        n_fft (int): FFT size.
        hop_size (int): Hop length in samples.
        win_size (int): Hann window length in samples.
        compress_factor (float): Exponent applied to the magnitude.
        center (bool): Whether frames are centered (reflect padding).

    Returns:
        Tuple[Tensor, Tensor, Tensor]: compressed magnitude (B, F, T),
        phase (B, F, T), and the compressed complex spectrum stacked as
        real/imag in the last dim (B, F, T, 2).
    """
    window = torch.hann_window(win_size, device=y.device)
    spec = torch.stft(
        y,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=window,
        center=center,
        pad_mode='reflect',
        normalized=False,
        return_complex=True)
    real, imag = spec.real, spec.imag
    # Small epsilons keep sqrt/atan2 well-behaved in silent bins.
    mag = torch.sqrt(real * real + imag * imag + (1e-9))
    pha = torch.atan2(imag, real + (1e-5))
    # Magnitude compression.
    mag = mag.pow(compress_factor)
    com = torch.stack((mag * torch.cos(pha), mag * torch.sin(pha)), dim=-1)
    return mag, pha, com
def mag_pha_istft(mag,
                  pha,
                  n_fft,
                  hop_size,
                  win_size,
                  compress_factor=1.0,
                  center=True):
    """Inverse of :func:`mag_pha_stft`: rebuild a waveform from a compressed
    magnitude/phase spectrum.

    Args:
        mag (Tensor): Compressed magnitude, shape (B, F, T).
        pha (Tensor): Phase, shape (B, F, T).
        n_fft (int): FFT size.
        hop_size (int): Hop length in samples.
        win_size (int): Hann window length in samples.
        compress_factor (float): Compression exponent to undo.
        center (bool): Must match the forward transform.

    Returns:
        Tensor: Reconstructed waveform, shape (B, num_samples).
    """
    # Undo the power-law magnitude compression.
    mag = mag.pow(1.0 / compress_factor)
    # torch.polar(abs, angle) == abs*cos(angle) + i*abs*sin(angle).
    com = torch.polar(mag, pha)
    window = torch.hann_window(win_size, device=com.device)
    return torch.istft(
        com,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=window,
        center=center)

View File

@@ -0,0 +1,220 @@
#!/usr/bin/env python3
#
# Copyright (c) Alibaba, Inc. and its affiliates.
# Part of the implementation is borrowed and modified from MP-SENet,
# public available at https://github.com/yxlu-0102/MP-SENet
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
class SubPixelConvTranspose2d(nn.Module):
    """Width-only sub-pixel upsampling layer.

    A stride-1 conv produces ``stride[1]`` interleaved copies of each output
    channel; the copies are then folded into the width (frequency) dimension.
    ``nn.PixelShuffle`` is not used because it rearranges height AND width,
    while here only the width must grow.

    Args:
        in_channels (int): Input channel count.
        out_channels (int): Output channel count.
        kernel_size (tuple): Conv kernel, default (1, 3).
        stride (tuple): ``stride[1]`` is the width upscale factor; the conv
            itself always runs with stride 1.
        padding (tuple): Conv padding; the default keeps H and W unchanged.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=(1, 3),
                 stride=(1, 2),
                 padding=(0, 1)):
        super(SubPixelConvTranspose2d, self).__init__()
        self.upscale_width_factor = stride[1]
        self.conv1 = nn.Conv2d(
            in_channels,
            out_channels * self.upscale_width_factor,
            kernel_size=kernel_size,
            padding=padding)  # only the channel count changes, not H/W

    def forward(self, x):
        """Upsample (B, in_channels, T, F) to (B, out_channels, T, F * factor)."""
        b = x.size(0)
        t = x.size(2)
        x = self.conv1(x)  # (b, out_channels * factor, t, f)
        # Bug fix: derive the channel count from the conv OUTPUT; using the
        # input channel count (as before) breaks whenever
        # in_channels != out_channels.
        factor = self.upscale_width_factor
        c_out = x.size(1) // factor
        f = x.size(3)
        # Split channels into (c_out, factor), then interleave the factor
        # axis into the width axis.
        x = x.view(b, c_out, factor, t, f).permute(0, 1, 3, 4, 2).contiguous()
        x = x.view(b, c_out, t, f * factor)
        return x
class DenseBlockV2(nn.Module):
    """Dilated dense block for ZipEnhancer.

    Each of ``depth`` stages applies a causally padded, time-dilated conv
    (dilations 1, 2, 4, ...) to the concatenation of the block input and all
    previous stage outputs, DenseNet style. Input and output shapes match.
    """

    def __init__(self, h, kernel_size=(2, 3), depth=4):
        """
        Args:
            h (object): Config providing ``dense_channel``.
            kernel_size (tuple): (time, freq) kernel size.
            depth (int): Number of dense stages.
        """
        super(DenseBlockV2, self).__init__()
        self.h = h
        self.depth = depth
        self.dense_block = nn.ModuleList()
        for idx in range(depth):
            dilation = 2**idx
            # Causal time padding: receptive field of the dilated kernel
            # minus one frame, applied to the past side only. Frequency is
            # padded (1, 1) to keep F unchanged for the width-3 kernel.
            causal_pad = kernel_size[0] + (dilation - 1) * (kernel_size[0]
                                                            - 1) - 1
            self.dense_block.append(
                nn.Sequential(
                    nn.ConstantPad2d((1, 1, causal_pad, 0), value=0.),
                    nn.Conv2d(
                        h.dense_channel * (idx + 1),
                        h.dense_channel,
                        kernel_size,
                        dilation=(dilation, 1)),
                    nn.InstanceNorm2d(h.dense_channel, affine=True),
                    nn.PReLU(h.dense_channel)))

    def forward(self, x):
        """Run the dense stack; both input and output are (B, C, T, F)."""
        out = x
        features = x
        for stage in self.dense_block:
            out = stage(features)
            # Grow the feature stack for the next, wider-input stage.
            features = torch.cat([out, features], dim=1)
        return out
class DenseEncoder(nn.Module):
    """Front-end encoder: 1x1 projection, dense block, width-halving conv."""

    def __init__(self, h, in_channel):
        """
        Args:
            h (object): Config providing ``dense_channel``.
            in_channel (int): Input channels, e.g. 2 for (magnitude, phase).
        """
        super(DenseEncoder, self).__init__()
        self.h = h
        self.dense_conv_1 = nn.Sequential(
            nn.Conv2d(in_channel, h.dense_channel, (1, 1)),
            nn.InstanceNorm2d(h.dense_channel, affine=True),
            nn.PReLU(h.dense_channel))
        self.dense_block = DenseBlockV2(h, depth=4)
        # Frequency padding was originally (0, 0); (0, 1) makes the stride-2
        # conv below produce F_out = (F - 1) // 2 + 1.
        self.dense_conv_2 = nn.Sequential(
            nn.Conv2d(
                h.dense_channel,
                h.dense_channel, (1, 3), (1, 2),
                padding=(0, 1)),
            nn.InstanceNorm2d(h.dense_channel, affine=True),
            nn.PReLU(h.dense_channel))

    def forward(self, x):
        """Map (B, in_channel, T, F) to roughly (B, dense_channel, T, F // 2)."""
        x = self.dense_conv_1(x)  # (b, dense_channel, T, F)
        if self.dense_block is not None:
            x = self.dense_block(x)  # shape preserved
        return self.dense_conv_2(x)  # frequency axis halved
class BaseDecoder(nn.Module):
    """Shared plumbing for the magnitude and phase decoders."""

    def __init__(self, h):
        """
        Args:
            h (object): Config providing ``dense_channel``.
        """
        super(BaseDecoder, self).__init__()
        # Subclasses instantiate this to upsample the frequency axis.
        self.upsample_module_class = SubPixelConvTranspose2d
        # One dense block feeding both the magnitude and phase heads.
        self.dense_block = DenseBlockV2(h, depth=4)
class MappingDecoder(BaseDecoder):
    """Magnitude decoder: restores full frequency resolution and maps to one
    channel per output speaker, clamped non-negative."""

    def __init__(self, h, out_channel=1):
        """
        Args:
            h (object): Config providing ``dense_channel``.
            out_channel (int): Number of output speakers. Default 1.
        """
        super(MappingDecoder, self).__init__(h)
        final_kernel = (1, 2)  # trims the extra frequency bin after upsampling
        self.mask_conv = nn.Sequential(
            self.upsample_module_class(h.dense_channel, h.dense_channel,
                                       (1, 3), (1, 2)),
            nn.InstanceNorm2d(h.dense_channel, affine=True),
            nn.PReLU(h.dense_channel),
            nn.Conv2d(h.dense_channel, out_channel, final_kernel))
        # Magnitudes must be non-negative.
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Map (B, C, T, F) to a non-negative (B, out_channel, T, F') tensor."""
        if self.dense_block is not None:
            x = self.dense_block(x)
        return self.relu(self.mask_conv(x))
class PhaseDecoder(BaseDecoder):
    """Phase decoder: predicts real and imaginary components and recovers the
    phase angle with ``atan2``."""

    def __init__(self, h, out_channel=1):
        """
        Args:
            h (object): Config providing ``dense_channel``.
            out_channel (int): Number of output speakers. Default 1.
        """
        super(PhaseDecoder, self).__init__(h)
        # Final kernel was (1, 1); (1, 2) trims the extra frequency bin
        # produced by the upsampler.
        final_kernel = (1, 2)
        self.phase_conv = nn.Sequential(
            self.upsample_module_class(h.dense_channel, h.dense_channel,
                                       (1, 3), (1, 2)),
            nn.InstanceNorm2d(h.dense_channel, affine=True),
            nn.PReLU(h.dense_channel))
        self.phase_conv_r = nn.Conv2d(h.dense_channel, out_channel,
                                      final_kernel)
        self.phase_conv_i = nn.Conv2d(h.dense_channel, out_channel,
                                      final_kernel)

    def forward(self, x):
        """Map (B, C, T, F) to phase angles in (-pi, pi]."""
        if self.dense_block is not None:
            x = self.dense_block(x)
        feats = self.phase_conv(x)
        return torch.atan2(self.phase_conv_i(feats), self.phase_conv_r(feats))

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,501 @@
#!/usr/bin/env python3
#
# Copyright (c) Alibaba, Inc. and its affiliates.
import copy
from typing import List, Optional, Tuple, Union
import torch
from torch import Tensor, nn
from .scaling import FloatLike, ScheduledFloat, convert_num_channels
from .zipformer import (BypassModule, CompactRelPositionalEncoding,
SimpleDownsample, SimpleUpsample,
Zipformer2EncoderLayer)
class DualPathZipformer2Encoder(nn.Module):
    r"""DualPathZipformer2Encoder is a stack of N dual-path encoder layers.

    Each of the ``num_layers`` iterations runs one frequency-path layer
    (sequence axis = F) followed by one time-path layer (sequence axis = T),
    each wrapped in its own bypass module. The features are reshaped as
    [B, C, T, F] -> [F, T * B, C] -> [B, C, T, F] -> [T, F * B, C] -> [B, C, T, F]

    Args:
        encoder_layer: an instance of the Zipformer2EncoderLayer() class (required).
        num_layers: the number of sub-encoder-layers in the encoder (required).
        pos_dim: the dimension for the relative positional encoding

    Examples::
        >>> encoder_layer = Zipformer2EncoderLayer(embed_dim=512, nhead=8)
        >>> dualpath_zipformer_encoder = DualPathZipformer2Encoder(encoder_layer, num_layers=6)
        >>> src = torch.rand(10, 512, 161, 101)
        >>> out = dualpath_zipformer_encoder(src)
    """

    def __init__(
        self,
        encoder_layer: nn.Module,
        num_layers: int,
        pos_dim: int,
        dropout: float,
        warmup_begin: float,
        warmup_end: float,
        initial_layerdrop_rate: float = 0.5,
        final_layerdrop_rate: float = 0.05,
        bypass_layer=None,
    ) -> None:
        """Build the F- and T-path layer stacks and schedule per-layer
        layerdrop rates over consecutive warmup windows.

        Args:
            encoder_layer: Prototype layer, deep-copied for every slot.
            num_layers: Layers per path (2 * num_layers layers total).
            pos_dim: Dimension of the relative positional encoding.
            dropout: Dropout rate (unused here; layers carry their own).
            warmup_begin / warmup_end: Batch-index window over which the
                per-layer layerdrop rates decay.
            initial_layerdrop_rate / final_layerdrop_rate: Endpoints of the
                per-layer layerdrop schedule.
            bypass_layer: Module combining a layer's input and output.
        """
        super().__init__()
        self.encoder_pos = CompactRelPositionalEncoding(
            pos_dim, dropout_rate=0.15, length_factor=1.0)
        self.f_layers = nn.ModuleList(
            [copy.deepcopy(encoder_layer) for i in range(num_layers)])
        self.t_layers = nn.ModuleList(
            [copy.deepcopy(encoder_layer) for i in range(num_layers)])
        # NOTE(review): unlike the layers above, the same ``bypass_layer``
        # instance is placed in all 2 * num_layers slots (no deepcopy), so
        # its parameters are shared — confirm this tying is intentional.
        self.bypass_layers = nn.ModuleList(
            [bypass_layer for i in range(num_layers * 2)])
        self.num_layers = num_layers
        assert 0 <= warmup_begin <= warmup_end, (warmup_begin, warmup_end)
        delta = (1.0 / num_layers) * (warmup_end - warmup_begin)
        cur_begin = warmup_begin  # interpreted as a training batch index
        for i in range(num_layers):
            cur_end = cur_begin + delta
            # Each layer's skip (layerdrop) rate decays across its own
            # consecutive slice of the warmup window.
            self.f_layers[i].bypass.skip_rate = ScheduledFloat(
                (cur_begin, initial_layerdrop_rate),
                (cur_end, final_layerdrop_rate),
                default=0.0,
            )
            self.t_layers[i].bypass.skip_rate = ScheduledFloat(
                (cur_begin, initial_layerdrop_rate),
                (cur_end, final_layerdrop_rate),
                default=0.0,
            )
            cur_begin = cur_end

    def forward(
        self,
        src: Tensor,
        chunk_size: int = -1,
        feature_mask: Union[Tensor, float] = 1.0,
        attn_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
    ) -> Tensor:
        r"""Pass the input through the encoder layers in a dual-path manner,
        processing both temporal and frequency dimensions.

        Args:
            src: the dual-path sequence to the encoder (required):
                shape (batch_size, embedding_dim, seq_len, frequency_len).
            chunk_size: accepted for interface compatibility but not
                forwarded to the layers (chunking is not used here).
            feature_mask: something that broadcasts with src, that we'll
                multiply `src` by before the stack and after every dual-path
                iteration (skipped under TorchScript scripting/tracing).
            attn_mask: accepted for interface compatibility but not
                forwarded to the layers.
            src_key_padding_mask: the mask for padding, of shape
                (batch_size, seq_len); True means masked position. May be None.

        Returns: a Tensor with the same shape as src.
        """
        # src: (b, c, t, f)
        b, c, t, f = src.size()
        # Flattened views used only to derive the positional embeddings for
        # the F-path (sequence length f) and T-path (sequence length t).
        src_f = src.permute(3, 0, 2, 1).contiguous().view(f, b * t, c)
        src_t = src.permute(2, 0, 3, 1).contiguous().view(t, b * f, c)
        pos_emb_f = self.encoder_pos(src_f)
        pos_emb_t = self.encoder_pos(src_t)
        output = src
        if not torch.jit.is_scripting() and not torch.jit.is_tracing():
            output = output * feature_mask
        for i in range(len(self.f_layers)):
            # ---- frequency path: attend along F for every (t, b) pair ----
            output_f_org = output.permute(3, 2, 0,
                                          1).contiguous()  # (f, t, b, c)
            output_f = output_f_org.view(f, t * b, c)
            # (f, t * b, c)
            output_f = self.f_layers[i](
                output_f,
                pos_emb_f,
                src_key_padding_mask=src_key_padding_mask,
            )
            output_f = output_f.view(f, t, b, c)
            output_f = self.bypass_layers[i * 2](output_f_org, output_f)
            # (f, t, b, c)
            output = output_f.permute(2, 3, 1, 0).contiguous()
            # (b, c, t, f)
            # ---- time path: attend along T for every (f, b) pair ----
            output_t_org = output.permute(2, 3, 0,
                                          1).contiguous()  # (t, f, b, c)
            output_t = output_t_org.view(t, f * b, c)
            output_t = self.t_layers[i](
                output_t,
                pos_emb_t,
                src_key_padding_mask=src_key_padding_mask,
            )
            output_t = output_t.view(t, f, b, c)
            output_t = self.bypass_layers[i * 2 + 1](output_t_org, output_t)
            # (t, f, b, c)
            output = output_t.permute(2, 3, 0, 1).contiguous()
            # (b, c, t, f)
            if not torch.jit.is_scripting() and not torch.jit.is_tracing():
                output = output * feature_mask
        return output
class DualPathDownsampledZipformer2Encoder(nn.Module):
    r"""
    DualPathDownsampledZipformer2Encoder is a dual-path zipformer encoder evaluated at a reduced frame rate,
    after convolutional downsampling, and then upsampled again at the output, and combined
    with the origin input, so that the output has the same shape as the input.

    The features are downsampled-upsampled at the time and frequency domain.
    """

    def __init__(self, encoder: nn.Module, dim: int, t_downsample: int,
                 f_downsample: int, dropout: FloatLike):
        """Initialize the module.

        Args:
            encoder: Inner encoder run at the reduced resolution.
            dim: Channel dimension (C) of the features.
            t_downsample: Downsampling factor along time; 1 disables the
                time down/upsample pair.
            f_downsample: Downsampling factor along frequency; 1 disables the
                frequency down/upsample pair.
            dropout: Dropout passed to the downsampling modules.
        """
        super(DualPathDownsampledZipformer2Encoder, self).__init__()
        # Kept alongside t_downsample_factor for compatibility; both hold the
        # time factor.
        self.downsample_factor = t_downsample
        self.t_downsample_factor = t_downsample
        self.f_downsample_factor = f_downsample
        if self.t_downsample_factor != 1:
            self.downsample_t = SimpleDownsample(dim, t_downsample, dropout)
            self.upsample_t = SimpleUpsample(dim, t_downsample)
        if self.f_downsample_factor != 1:
            self.downsample_f = SimpleDownsample(dim, f_downsample, dropout)
            self.upsample_f = SimpleUpsample(dim, f_downsample)
        self.encoder = encoder
        # Blends the full-resolution input with the processed branch.
        self.out_combiner = BypassModule(dim, straight_through_rate=0)

    def forward(
        self,
        src: Tensor,
        chunk_size: int = -1,
        feature_mask: Union[Tensor, float] = 1.0,
        attn_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
    ) -> Tensor:
        r"""Downsample the input, process through the encoder, and then upsample back to the original shape.

        Args:
            src: the sequence to the encoder (required): shape (batch_size, embedding_dim, seq_len, frequency_len).
            chunk_size: forwarded unchanged to the inner encoder.
            feature_mask: forwarded unchanged to the inner encoder.
            attn_mask: forwarded unchanged to the inner encoder.
            src_key_padding_mask: forwarded unchanged to the inner encoder.

        Returns: a Tensor with the same shape as src. (batch_size, embedding_dim, seq_len, frequency_len)
        """
        # src: (b, c, t, f)
        b, c, t, f = src.size()
        # Full-resolution copy kept for the output combiner, in (t, f, b, c).
        src_orig = src.permute(2, 3, 0, 1)  # (t, f, b, c)
        # Fold batch and frequency together so time is the sequence axis.
        src = src.permute(2, 0, 3, 1).contiguous().view(t, b * f, c)
        # -> (t, b * f, c)
        if self.t_downsample_factor != 1:
            src = self.downsample_t(src)
            # (t//ds + 1, b * f, c)
        downsample_t = src.size(0)
        # Refold so frequency becomes the sequence axis.
        src = src.view(downsample_t, b, f,
                       c).permute(2, 1, 0,
                                  3).contiguous().view(f, b * downsample_t, c)
        if self.f_downsample_factor != 1:
            src = self.downsample_f(src)
            # (f//ds + 1, b * downsample_t, c)
        downsample_f = src.size(0)
        src = src.view(downsample_f, b, downsample_t, c).permute(1, 3, 2, 0)
        # (b, c, downsample_t, downsample_f)
        src = self.encoder(
            src,
            chunk_size=chunk_size,
            feature_mask=feature_mask,
            attn_mask=attn_mask,
            src_key_padding_mask=src_key_padding_mask,
        )
        # (b, c, downsample_t, downsample_f)
        src = src.permute(3, 0, 2,
                          1).contiguous().view(downsample_f, b * downsample_t,
                                               c)
        if self.f_downsample_factor != 1:
            src = self.upsample_f(src)
            # (f, b * downsample_t, c) after the [:f] trim below
        # Trim any frames beyond the original frequency length, then refold
        # so time is the sequence axis again.
        src = src[:f].view(f, b, downsample_t,
                           c).permute(2, 1, 0, 3).contiguous().view(
                               downsample_t, b * f, c)
        # (downsample_t, b * f, c)
        if self.t_downsample_factor != 1:
            src = self.upsample_t(src)
            # (t, b * f, c) after the [:t] trim below
        src = src[:t].view(t, b, f, c).permute(0, 2, 1, 3).contiguous()
        # (t, f, b, c)
        out = self.out_combiner(src_orig, src)
        # (t, f, b, c)
        out = out.permute(2, 3, 0, 1).contiguous()
        # (b, c, t, f)
        return out
class Zipformer2DualPathEncoder(nn.Module):
    """Stack of dual-path Zipformer encoder stages with per-stage
    time/frequency downsampling."""

    def __init__(
        self,
        output_downsampling_factor: int = 2,
        downsampling_factor: Tuple[int] = (2, 4),
        f_downsampling_factor: Tuple[int] = None,
        encoder_dim: Union[int, Tuple[int]] = 384,
        num_encoder_layers: Union[int, Tuple[int]] = 4,
        encoder_unmasked_dim: Union[int, Tuple[int]] = 256,
        query_head_dim: Union[int, Tuple[int]] = 24,
        pos_head_dim: Union[int, Tuple[int]] = 4,
        value_head_dim: Union[int, Tuple[int]] = 12,
        num_heads: Union[int, Tuple[int]] = 8,
        feedforward_dim: Union[int, Tuple[int]] = 1536,
        cnn_module_kernel: Union[int, Tuple[int]] = 31,
        pos_dim: int = 192,
        dropout: FloatLike = None,  # see code below for default
        warmup_batches: float = 4000.0,
        causal: bool = False,
        # NOTE(review): list defaults for chunk_size/left_context_frames are
        # mutable default arguments; they are only stored here, never
        # mutated, but tuples would be safer — confirm before changing.
        chunk_size: Tuple[int] = [-1],
        left_context_frames: Tuple[int] = [-1],
    ):
        """
        Initialize the Zipformer2DualPathEncoder module.

        Zipformer2DualPathEncoder processes the hidden features of the noisy speech using dual-path modeling.
        It has two kinds of blocks: DualPathZipformer2Encoder and DualPathDownsampledZipformer2Encoder.
        DualPathZipformer2Encoder processes the 4D features with the shape of [B, C, T, F].
        DualPathDownsampledZipformer2Encoder first downsamples the hidden features
        and processes features using dual-path modeling like DualPathZipformer2Encoder.

        Per-stage parameters may be given as a single int (broadcast to all
        stages) or as a tuple with one entry per entry of
        ``downsampling_factor``.

        Args:
            Various hyperparameters and settings for the encoder.
        """
        super(Zipformer2DualPathEncoder, self).__init__()
        if dropout is None:
            dropout = ScheduledFloat((0.0, 0.3), (20000.0, 0.1))

        def _to_tuple(x):
            """Converts a single int or a 1-tuple of an int to a tuple with the same length
            as downsampling_factor"""
            if isinstance(x, int):
                x = (x, )
            if len(x) == 1:
                x = x * len(downsampling_factor)
            else:
                assert len(x) == len(downsampling_factor) and isinstance(
                    x[0], int)
            return x

        self.output_downsampling_factor = output_downsampling_factor  # int
        self.downsampling_factor = downsampling_factor  # tuple
        if f_downsampling_factor is None:
            # default: mirror the time-axis factors on the frequency axis
            f_downsampling_factor = downsampling_factor
        self.f_downsampling_factor = _to_tuple(f_downsampling_factor)
        self.encoder_dim = encoder_dim = _to_tuple(encoder_dim)  # tuple
        self.encoder_unmasked_dim = encoder_unmasked_dim = _to_tuple(
            encoder_unmasked_dim)  # tuple
        num_encoder_layers = _to_tuple(num_encoder_layers)
        self.num_encoder_layers = num_encoder_layers
        self.query_head_dim = query_head_dim = _to_tuple(query_head_dim)
        self.value_head_dim = value_head_dim = _to_tuple(value_head_dim)
        pos_head_dim = _to_tuple(pos_head_dim)
        self.num_heads = num_heads = _to_tuple(num_heads)
        feedforward_dim = _to_tuple(feedforward_dim)
        self.cnn_module_kernel = cnn_module_kernel = _to_tuple(
            cnn_module_kernel)
        self.causal = causal
        self.chunk_size = chunk_size
        self.left_context_frames = left_context_frames
        for u, d in zip(encoder_unmasked_dim, encoder_dim):
            assert u <= d
        # each one will be Zipformer2Encoder or DownsampledZipformer2Encoder
        encoders = []
        num_encoders = len(downsampling_factor)
        for i in range(num_encoders):
            encoder_layer = Zipformer2EncoderLayer(
                embed_dim=encoder_dim[i],
                pos_dim=pos_dim,
                num_heads=num_heads[i],
                query_head_dim=query_head_dim[i],
                pos_head_dim=pos_head_dim[i],
                value_head_dim=value_head_dim[i],
                feedforward_dim=feedforward_dim[i],
                dropout=dropout,
                cnn_module_kernel=cnn_module_kernel[i],
                causal=causal,
            )
            # For the segment of the warmup period, we let the Conv2dSubsampling
            # layer learn something. Then we start to warm up the other encoders.
            encoder = DualPathZipformer2Encoder(
                encoder_layer,
                num_encoder_layers[i],
                pos_dim=pos_dim,
                dropout=dropout,
                warmup_begin=warmup_batches * (i + 1) / (num_encoders + 1),
                warmup_end=warmup_batches * (i + 2) / (num_encoders + 1),
                final_layerdrop_rate=0.035 * (downsampling_factor[i]**0.5),
                bypass_layer=BypassModule(
                    encoder_dim[i], straight_through_rate=0),
            )
            # NOTE(review): this indexes the RAW f_downsampling_factor, not
            # the tupled self.f_downsampling_factor — it works for the None /
            # tuple cases but would fail if a plain int were passed; confirm.
            if downsampling_factor[i] != 1 or f_downsampling_factor[i] != 1:
                encoder = DualPathDownsampledZipformer2Encoder(
                    encoder,
                    dim=encoder_dim[i],
                    t_downsample=downsampling_factor[i],
                    f_downsample=f_downsampling_factor[i],
                    dropout=dropout,
                )
            encoders.append(encoder)
        self.encoders = nn.ModuleList(encoders)
        self.downsample_output = SimpleDownsample(
            max(encoder_dim),
            downsample=output_downsampling_factor,
            dropout=dropout)

    def forward(self, x):
        """
        Forward pass of the Zipformer2DualPathEncoder module.

        Args:
            x (Tensor): Input tensor of shape [B, C, T, F].

        Returns:
            Tensor: Output of the LAST stage only (shape [B, C, T, F]);
            per-stage outputs are collected in ``outputs`` but not returned,
            and ``self.downsample_output`` is not applied here.
        """
        outputs = []
        # Feature masking is disabled here (always 1.0 for every stage).
        feature_masks = [1.0] * len(self.encoder_dim)
        attn_mask = None
        chunk_size = -1
        for i, module in enumerate(self.encoders):
            # Pad/trim channels to this stage's width before running it.
            x = convert_num_channels(x, self.encoder_dim[i])
            x = module(
                x,
                chunk_size=chunk_size,
                feature_mask=feature_masks[i],
                src_key_padding_mask=None,
                attn_mask=attn_mask,
            )
            outputs.append(x)
        # (b, c, t, f)
        return x
if __name__ == '__main__':
    # Smoke test: build a small 4-stack dual-path encoder and push one
    # random [B, C, T, F] batch through it.
    downsampling_factor = (1, 2, 4, 3)
    encoder_dim = (16, 32, 64, 64)
    pos_dim = 48  # Zipformer-base setting
    num_heads = (4, 4, 4, 4)  # "4,4,4,8,4,4"
    n_stacks = len(downsampling_factor)
    query_head_dim = (16, ) * n_stacks  # 32
    pos_head_dim = (4, ) * n_stacks  # 4
    value_head_dim = (12, ) * n_stacks  # 12
    feedforward_dim = (32, 64, 128, 128)
    dropout = ScheduledFloat((0.0, 0.3), (20000.0, 0.1))
    cnn_module_kernel = (15, ) * n_stacks  # 31,31,15,15,15,31
    causal = False
    encoder_unmasked_dim = (16, ) * n_stacks
    num_encoder_layers = (1, 1, 1, 1)
    warmup_batches = 4000.0
    net = Zipformer2DualPathEncoder(
        output_downsampling_factor=1,
        downsampling_factor=downsampling_factor,
        num_encoder_layers=num_encoder_layers,
        encoder_dim=encoder_dim,
        encoder_unmasked_dim=encoder_unmasked_dim,
        query_head_dim=query_head_dim,
        pos_head_dim=pos_head_dim,
        value_head_dim=value_head_dim,
        pos_dim=pos_dim,
        num_heads=num_heads,
        feedforward_dim=feedforward_dim,
        cnn_module_kernel=cnn_module_kernel,
        dropout=ScheduledFloat((0.0, 0.3), (20000.0, 0.1)),
        warmup_batches=warmup_batches,
        causal=causal,
    )
    # Dummy input: batch=4, channels=64, time=321 frames, freq=101 bins.
    batch, channels, frames, bins = 4, 64, 321, 101
    x = torch.randn((batch, channels, frames, bins))
    x = net(x)
    print(x.size())

File diff suppressed because it is too large Load Diff

View File

@@ -122,3 +122,127 @@ class ANSPipeline(Pipeline):
np.frombuffer(inputs[OutputKeys.OUTPUT_PCM], dtype=np.int16),
self.SAMPLE_RATE)
return inputs
@PIPELINES.register_module(
    Tasks.acoustic_noise_suppression,
    module_name=Pipelines.speech_zipenhancer_ans_multiloss_16k_base)
class ANSZipEnhancerPipeline(Pipeline):
    r"""ANS (Acoustic Noise Suppression) Inference Pipeline based on ZipEnhancer.

    When invoked with pipeline.__call__(), it accepts only one parameter:
        inputs (str or bytes): the path of a wav file, or raw wav bytes.
    """

    # All processing is done at 16 kHz; other rates are resampled in preprocess().
    SAMPLE_RATE = 16000

    def __init__(self, model, **kwargs):
        """
        use `model` and `preprocessor` to create a kws pipeline for prediction
        Args:
            model: model id on modelscope hub.
        """
        super().__init__(model=model, **kwargs)
        self.model.eval()
        self.stream_mode = kwargs.get('stream_mode', False)

    def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]:
        """Decode, downmix, resample and normalize the input audio.

        Returns:
            dict with 'ndarray': float32 array of shape [1, num_samples],
            and 'nsamples': the original sample count (used to trim padding
            after enhancement).
        """
        if self.stream_mode:
            raise TypeError('This model does not support stream mode!')
        if isinstance(inputs, bytes):
            data1, fs = sf.read(io.BytesIO(inputs))
        elif isinstance(inputs, str):
            data1, fs = sf.read(inputs)
        else:
            raise TypeError(f'Unsupported type {type(inputs)}.')
        if len(data1.shape) > 1:
            # Multi-channel input: keep only the first channel.
            data1 = data1[:, 0]
        if fs != self.SAMPLE_RATE:
            data1 = librosa.resample(
                data1, orig_sr=fs, target_sr=self.SAMPLE_RATE)
        data1 = audio_norm(data1)
        data = data1.astype(np.float32)
        inputs = np.reshape(data, [1, data.shape[0]])
        return {'ndarray': inputs, 'nsamples': data.shape[0]}

    def forward(self, inputs: Dict[str, Any],
                **forward_params) -> Dict[str, Any]:
        """Enhance the audio, segmenting long inputs with overlapped windows.

        Inputs longer than 10 s are processed in 2 s windows advanced by a
        1.5 s stride; at each seam the overlapped edges of a window's output
        are discarded to hide boundary artifacts.

        Returns:
            dict with OutputKeys.OUTPUT_PCM: enhanced 16-bit PCM bytes.
        """
        ndarray = inputs['ndarray']
        if isinstance(ndarray, torch.Tensor):
            ndarray = ndarray.cpu().numpy()
        nsamples = inputs['nsamples']
        decode_do_segement = False
        # Fix: was hard-coded `16000 * 2`; derive from the class constant so
        # the window stays consistent with SAMPLE_RATE.
        window = self.SAMPLE_RATE * 2  # 2s
        stride = int(window * 0.75)
        print('inputs:{}'.format(ndarray.shape))
        b, t = ndarray.shape  # size()
        if t > window * 5:  # 10s
            decode_do_segement = True
            print('decode_do_segement')
        if t < window:
            # Shorter than one window: zero-pad up to a full window.
            ndarray = np.concatenate(
                [ndarray, np.zeros((ndarray.shape[0], window - t))], 1)
        elif decode_do_segement:
            if t < window + stride:
                padding = window + stride - t
                print('padding: {}'.format(padding))
                ndarray = np.concatenate(
                    [ndarray, np.zeros((ndarray.shape[0], padding))], 1)
            else:
                if (t - window) % stride != 0:
                    # Pad so (t - window) is a multiple of stride and the
                    # sliding window exactly covers the whole signal.
                    padding = (
                        (t - window) // stride + 1) * stride + window - t
                    print('padding: {}'.format(padding))
                    ndarray = np.concatenate(
                        [ndarray,
                         np.zeros((ndarray.shape[0], padding))], 1)
        print('inputs after padding:{}'.format(ndarray.shape))
        with torch.no_grad():
            ndarray = torch.from_numpy(np.float32(ndarray)).to(self.device)
            b, t = ndarray.shape
            if decode_do_segement:
                outputs = np.zeros(t)
                # Half of the window/stride overlap is dropped on each side.
                give_up_length = (window - stride) // 2
                current_idx = 0
                while current_idx + window <= t:
                    print(
                        '\rcurrent_idx: {} {:.2f}%'.format(
                            current_idx, current_idx * 100 / t),
                        end='')
                    tmp_input = dict(noisy=ndarray[:, current_idx:current_idx
                                                   + window])
                    tmp_output = self.model(
                        tmp_input, )['wav_l2'][0].cpu().numpy()
                    end_index = current_idx + window - give_up_length
                    if current_idx == 0:
                        # First window: keep the head, drop only the tail edge.
                        outputs[current_idx:
                                end_index] = tmp_output[:-give_up_length]
                    else:
                        # Interior windows: drop both overlapped edges.
                        # NOTE(review): the final give_up_length samples of the
                        # padded signal are never written and stay zero; they
                        # normally fall inside the padding, but could clip real
                        # audio when the required padding is short — confirm.
                        outputs[current_idx
                                + give_up_length:end_index] = tmp_output[
                                    give_up_length:-give_up_length]
                    current_idx += stride
                print('\rcurrent_idx: {} {:.2f}%'.format(current_idx, 100))
            else:
                outputs = self.model(
                    dict(noisy=ndarray))['wav_l2'][0].cpu().numpy()
        # Trim the padding back to the original length and quantize to int16.
        outputs = (outputs[:nsamples] * 32768).astype(np.int16).tobytes()
        return {OutputKeys.OUTPUT_PCM: outputs}

    def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
        """Optionally write the enhanced PCM to 'output_path' as a wav file."""
        if 'output_path' in kwargs.keys():
            sf.write(
                kwargs['output_path'],
                np.frombuffer(inputs[OutputKeys.OUTPUT_PCM], dtype=np.int16),
                self.SAMPLE_RATE)
        return inputs

View File

@@ -1,6 +1,6 @@
addict
attrs
datasets>=3.0.0
datasets>=3.0.0,<=3.0.1
einops
oss2
Pillow

View File

@@ -1,6 +1,6 @@
addict
attrs
datasets>=3.0.0
datasets>=3.0.0,<=3.0.1
einops
oss2
Pillow

View File

@@ -150,6 +150,36 @@ class SpeechSignalProcessTest(unittest.TestCase):
w.write(pcm)
audio = f.read(block_size)
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_zipenhancer_ans(self):
model_id = 'damo/speech_zipenhancer_ans_multiloss_16k_base'
ans = pipeline(Tasks.acoustic_noise_suppression, model=model_id)
output_path = os.path.abspath('output.wav')
ans(os.path.join(os.getcwd(), NOISE_SPEECH_FILE),
output_path=output_path)
print(f'Processed audio saved to {output_path}')
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_zipenhancer_ans_url(self):
model_id = 'damo/speech_zipenhancer_ans_multiloss_16k_base'
ans = pipeline(Tasks.acoustic_noise_suppression, model=model_id)
output_path = os.path.abspath('output.wav')
ans(NOISE_SPEECH_URL, output_path=output_path)
print(f'Processed audio saved to {output_path}')
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_zipenhancer_ans_bytes(self):
model_id = 'damo/speech_zipenhancer_ans_multiloss_16k_base'
ans = pipeline(
Tasks.acoustic_noise_suppression,
model=model_id,
pipeline_name=Pipelines.speech_zipenhancer_ans_multiloss_16k_base)
output_path = os.path.abspath('output.wav')
with open(os.path.join(os.getcwd(), NOISE_SPEECH_FILE), 'rb') as f:
data = f.read()
ans(data, output_path=output_path)
print(f'Processed audio saved to {output_path}')
# Allow running this test module directly (outside the test runner).
if __name__ == '__main__':
    unittest.main()