mirror of
https://github.com/modelscope/modelscope.git
synced 2026-02-24 20:19:51 +01:00
Merge commit 'e71057cacf6b9f977e20f41e6d767b4ad1557171'
* commit 'e71057cacf6b9f977e20f41e6d767b4ad1557171': feat(audio/ans): Add ZipEnhancer and related layers for acoustic nois… (#1019) Fix timestamp in docker build (#1049) Fix pypi mirror (#1048) Fix build error (#1047) hotfix for datasets 3.0.2 (#1046) Update docker scripts (#1044) Add docker workflow name (#1043)
This commit is contained in:
16
.github/workflows/docker-image.yml
vendored
16
.github/workflows/docker-image.yml
vendored
@@ -1,24 +1,30 @@
|
||||
name: Build Docker Images
|
||||
name: Build Docker Image
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
workflow_name:
|
||||
description: 'The specific name of this build'
|
||||
required: true
|
||||
default: 'build'
|
||||
modelscope_branch:
|
||||
description: 'ModelScope branch to build from'
|
||||
description: 'ModelScope branch to build from(release/x.xx)'
|
||||
required: true
|
||||
image_type:
|
||||
description: 'The image type to build'
|
||||
description: 'The image type to build(cpu/gpu/llm)'
|
||||
required: true
|
||||
modelscope_version:
|
||||
description: 'ModelScope version to use'
|
||||
description: 'ModelScope version to use(x.xx.x)'
|
||||
required: true
|
||||
swift_branch:
|
||||
description: 'SWIFT branch to use'
|
||||
description: 'SWIFT branch to use(release/x.xx)'
|
||||
required: true
|
||||
other_params:
|
||||
description: 'Other params in --xxx xxx'
|
||||
required: false
|
||||
|
||||
run-name: Docker-${{ inputs.modelscope_branch }}-${{ inputs.image_type }}-${{ inputs.workflow_name }}-by-@${{ github.actor }}
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: [modelscope-self-hosted-us]
|
||||
|
||||
@@ -12,10 +12,6 @@ RUN apt-get update && \
|
||||
|
||||
{extra_content}
|
||||
|
||||
RUN pip config set global.index-url https://mirrors.aliyun.com/pypi/simple && \
|
||||
pip config set install.trusted-host mirrors.aliyun.com && \
|
||||
cp /tmp/resources/ubuntu2204.aliyun /etc/apt/sources.list
|
||||
|
||||
COPY {meta_file} /tmp/install.sh
|
||||
|
||||
RUN sh /tmp/install.sh {version_args}
|
||||
@@ -28,6 +24,10 @@ RUN cd /tmp && GIT_LFS_SKIP_SMUDGE=1 git clone -b {modelscope_branch} --single
|
||||
|
||||
RUN cd /tmp && GIT_LFS_SKIP_SMUDGE=1 git clone -b {swift_branch} --single-branch https://github.com/modelscope/ms-swift.git && cd ms-swift && pip install .[all] && cd / && rm -fr /tmp/ms-swift && pip cache purge;
|
||||
|
||||
RUN pip config set global.index-url https://mirrors.aliyun.com/pypi/simple && \
|
||||
pip config set install.trusted-host mirrors.aliyun.com && \
|
||||
cp /tmp/resources/ubuntu2204.aliyun /etc/apt/sources.list
|
||||
|
||||
ENV SETUPTOOLS_USE_DISTUTILS=stdlib
|
||||
ENV VLLM_USE_MODELSCOPE=True
|
||||
ENV LMDEPLOY_USE_MODELSCOPE=True
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
import argparse
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
docker_registry = os.environ['DOCKER_REGISTRY']
|
||||
assert docker_registry, 'You must pass a valid DOCKER_REGISTRY'
|
||||
timestamp = datetime.now()
|
||||
formatted_time = timestamp.strftime('%Y%m%d%H%M%S')
|
||||
|
||||
|
||||
class Builder:
|
||||
@@ -85,12 +88,16 @@ class BaseCPUImageBuilder(Builder):
|
||||
return content
|
||||
|
||||
def build(self):
|
||||
image_tag = f'{docker_registry}:ubuntu{self.args.ubuntu_version}-torch{self.args.torch_version}-base'
|
||||
image_tag = (
|
||||
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-{self.args.python_tag}-'
|
||||
f'torch{self.args.torch_version}-base')
|
||||
return os.system(
|
||||
f'DOCKER_BUILDKIT=0 docker build -t {image_tag} -f Dockerfile .')
|
||||
|
||||
def push(self):
|
||||
image_tag = f'{docker_registry}:ubuntu{self.args.ubuntu_version}-torch{self.args.torch_version}-base'
|
||||
image_tag = (
|
||||
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-{self.args.python_tag}-'
|
||||
f'torch{self.args.torch_version}-base')
|
||||
return os.system(f'docker push {image_tag}')
|
||||
|
||||
|
||||
@@ -110,14 +117,14 @@ class BaseGPUImageBuilder(Builder):
|
||||
|
||||
def build(self) -> int:
|
||||
image_tag = (
|
||||
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-'
|
||||
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-{self.args.python_tag}-'
|
||||
f'torch{self.args.torch_version}-tf{self.args.tf_version}-base')
|
||||
return os.system(
|
||||
f'DOCKER_BUILDKIT=0 docker build -t {image_tag} -f Dockerfile .')
|
||||
|
||||
def push(self):
|
||||
image_tag = (
|
||||
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-'
|
||||
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-{self.args.python_tag}-'
|
||||
f'torch{self.args.torch_version}-tf{self.args.tf_version}-base')
|
||||
return os.system(f'docker push {image_tag}')
|
||||
|
||||
@@ -129,7 +136,9 @@ class CPUImageBuilder(Builder):
|
||||
version_args = (
|
||||
f'{self.args.torch_version} {self.args.torchvision_version} '
|
||||
f'{self.args.torchaudio_version}')
|
||||
base_image = f'{docker_registry}:ubuntu{self.args.ubuntu_version}-torch{self.args.torch_version}-base'
|
||||
base_image = (
|
||||
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-{self.args.python_tag}'
|
||||
f'-torch{self.args.torch_version}-base')
|
||||
extra_content = """\nRUN pip install adaseq\nRUN pip install pai-easycv"""
|
||||
|
||||
with open('docker/Dockerfile.ubuntu', 'r') as f:
|
||||
@@ -157,7 +166,17 @@ class CPUImageBuilder(Builder):
|
||||
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-{self.args.python_tag}-'
|
||||
f'torch{self.args.torch_version}-{self.args.modelscope_version}-test'
|
||||
)
|
||||
return os.system(f'docker push {image_tag}')
|
||||
ret = os.system(f'docker push {image_tag}')
|
||||
if ret != 0:
|
||||
return ret
|
||||
image_tag2 = (
|
||||
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-{self.args.python_tag}-'
|
||||
f'torch{self.args.torch_version}-{self.args.modelscope_version}-{formatted_time}-test'
|
||||
)
|
||||
ret = os.system(f'docker tag {image_tag} {image_tag2}')
|
||||
if ret != 0:
|
||||
return ret
|
||||
return os.system(f'docker push {image_tag2}')
|
||||
|
||||
|
||||
class GPUImageBuilder(Builder):
|
||||
@@ -170,7 +189,7 @@ class GPUImageBuilder(Builder):
|
||||
f'{self.args.vllm_version} {self.args.lmdeploy_version} {self.args.autogptq_version}'
|
||||
)
|
||||
base_image = (
|
||||
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-'
|
||||
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-{self.args.python_tag}-'
|
||||
f'torch{self.args.torch_version}-tf{self.args.tf_version}-base')
|
||||
with open('docker/Dockerfile.ubuntu', 'r') as f:
|
||||
content = f.read()
|
||||
@@ -196,7 +215,17 @@ class GPUImageBuilder(Builder):
|
||||
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-'
|
||||
f'{self.args.python_tag}-torch{self.args.torch_version}-tf{self.args.tf_version}-'
|
||||
f'{self.args.modelscope_version}-test')
|
||||
return os.system(f'docker push {image_tag}')
|
||||
ret = os.system(f'docker push {image_tag}')
|
||||
if ret != 0:
|
||||
return ret
|
||||
image_tag2 = (
|
||||
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-'
|
||||
f'{self.args.python_tag}-torch{self.args.torch_version}-tf{self.args.tf_version}-'
|
||||
f'{self.args.modelscope_version}-{formatted_time}-test')
|
||||
ret = os.system(f'docker tag {image_tag} {image_tag2}')
|
||||
if ret != 0:
|
||||
return ret
|
||||
return os.system(f'docker push {image_tag2}')
|
||||
|
||||
|
||||
class LLMImageBuilder(Builder):
|
||||
@@ -253,7 +282,17 @@ class LLMImageBuilder(Builder):
|
||||
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-'
|
||||
f'{self.args.python_tag}-torch{self.args.torch_version}-{self.args.modelscope_version}-LLM-test'
|
||||
)
|
||||
return os.system(f'docker push {image_tag}')
|
||||
ret = os.system(f'docker push {image_tag}')
|
||||
if ret != 0:
|
||||
return ret
|
||||
image_tag2 = (
|
||||
f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-'
|
||||
f'{self.args.python_tag}-torch{self.args.torch_version}-'
|
||||
f'{self.args.modelscope_version}-LLM-{formatted_time}-test')
|
||||
ret = os.system(f'docker tag {image_tag} {image_tag2}')
|
||||
if ret != 0:
|
||||
return ret
|
||||
return os.system(f'docker push {image_tag2}')
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
@@ -193,6 +193,7 @@ class Models(object):
|
||||
# audio models
|
||||
sambert_hifigan = 'sambert-hifigan'
|
||||
speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
|
||||
speech_zipenhancer_ans_multiloss_16k_base = 'speech_zipenhancer_ans_multiloss_16k_base'
|
||||
speech_dfsmn_ans = 'speech_dfsmn_ans'
|
||||
speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield'
|
||||
speech_dfsmn_kws_char_farfield_iot = 'speech_dfsmn_kws_char_farfield_iot'
|
||||
@@ -551,6 +552,7 @@ class Pipelines(object):
|
||||
sambert_hifigan_tts = 'sambert-hifigan-tts'
|
||||
speech_dfsmn_aec_psm_16k = 'speech-dfsmn-aec-psm-16k'
|
||||
speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
|
||||
speech_zipenhancer_ans_multiloss_16k_base = 'speech_zipenhancer_ans_multiloss_16k_base'
|
||||
speech_dfsmn_ans_psm_48k_causal = 'speech_dfsmn_ans_psm_48k_causal'
|
||||
speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield'
|
||||
speech_separation = 'speech-separation'
|
||||
|
||||
210
modelscope/models/audio/ans/zipenhancer.py
Normal file
210
modelscope/models/audio/ans/zipenhancer.py
Normal file
@@ -0,0 +1,210 @@
|
||||
#!/usr/bin/env python3
|
||||
#
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
import os
|
||||
import random
|
||||
from typing import Dict
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models import TorchModel
|
||||
from modelscope.models.base import Tensor
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.utils.constant import ModelFile, Tasks
|
||||
from .zipenhancer_layers.generator import (DenseEncoder, MappingDecoder,
|
||||
PhaseDecoder)
|
||||
from .zipenhancer_layers.scaling import ScheduledFloat
|
||||
from .zipenhancer_layers.zipenhancer_layer import Zipformer2DualPathEncoder
|
||||
|
||||
|
||||
@MODELS.register_module(
    Tasks.acoustic_noise_suppression,
    module_name=Models.speech_zipenhancer_ans_multiloss_16k_base)
class ZipenhancerDecorator(TorchModel):
    """ModelScope model wrapper around the ZipEnhancer denoising network.

    Builds a :class:`ZipEnhancer` from the configuration passed in ``kwargs``
    and, if a torch checkpoint exists under ``model_dir``, loads its weights.
    """

    def __init__(self, model_dir: str, *args, **kwargs):
        """
        Args:
            model_dir (str): directory that may contain the torch model
                binary (``ModelFile.TORCH_MODEL_BIN_FILE``).
            **kwargs: must provide ``num_tsconformers``, ``dense_channel``,
                ``former_conf``, ``batch_first`` and ``model_num_spks``.
        """
        super().__init__(model_dir, *args, **kwargs)

        # Collect the network hyperparameters expected by ZipEnhancer.
        h = dict(
            num_tsconformers=kwargs['num_tsconformers'],
            dense_channel=kwargs['dense_channel'],
            former_conf=kwargs['former_conf'],
            batch_first=kwargs['batch_first'],
            model_num_spks=kwargs['model_num_spks'],
        )
        # num_tsconformers, dense_channel, former_name, former_conf, batch_first, model_num_spks

        # AttrDict (defined below) lets the config be read as attributes,
        # e.g. h.dense_channel.
        h = AttrDict(h)
        self.model = ZipEnhancer(h)
        model_bin_file = os.path.join(model_dir,
                                      ModelFile.TORCH_MODEL_BIN_FILE)
        if os.path.exists(model_bin_file):
            checkpoint = torch.load(
                model_bin_file, map_location=torch.device('cpu'))
            if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
                # the new trained model by user is based on ZipenhancerDecorator
                self.load_state_dict(checkpoint['state_dict'])
            else:
                # The released model on Modelscope is based on Zipenhancer;
                # its weights live under the 'generator' key.
                # self.model.load_state_dict(checkpoint, strict=False)
                self.model.load_state_dict(checkpoint['generator'])
                # print(checkpoint['generator'].keys())

    def forward(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]:
        """Denoise a waveform.

        Args:
            inputs: dict with key ``'noisy'`` holding the noisy waveform.
                The normalization below uses ``shape[1]``, so presumably the
                tensor is (batch, num_samples) — TODO confirm with callers.

        Returns:
            dict with key ``'wav_l2'`` holding the denoised waveform.
        """
        # Fixed STFT configuration; at the model's 16 kHz rate this is a
        # 25 ms window with a 6.25 ms hop (assumption from the model name).
        n_fft = 400
        hop_size = 100
        win_size = 400
        noisy_wav = inputs['noisy']
        # Scale the input to (roughly) unit average power; undone after istft.
        norm_factor = torch.sqrt(noisy_wav.shape[1]
                                 / torch.sum(noisy_wav**2.0))
        noisy_audio = (noisy_wav * norm_factor)

        mag, pha, com = mag_pha_stft(
            noisy_audio,
            n_fft,
            hop_size,
            win_size,
            compress_factor=0.3,
            center=True)
        # The model returns (mag, phase, complex, None, aux); only the
        # enhanced magnitude and phase are used here.
        amp_g, pha_g, com_g, _, others = self.model.forward(mag, pha)
        wav = mag_pha_istft(
            amp_g,
            pha_g,
            n_fft,
            hop_size,
            win_size,
            compress_factor=0.3,
            center=True)

        # Undo the input power normalization.
        wav = wav / norm_factor

        output = {
            'wav_l2': wav,
        }

        return output
|
||||
|
||||
|
||||
class ZipEnhancer(nn.Module):
    """Magnitude/phase speech-enhancement network.

    Pipeline: a DenseEncoder compresses the stacked (magnitude, phase)
    input, a dual-path Zipformer2 encoder models the time and frequency
    axes, and two decoders map back to per-speaker magnitude and phase.
    """

    def __init__(self, h):
        """
        Args:
            h (object): configuration with ``num_tsconformers``,
                ``dense_channel``, ``former_conf``, ``model_num_spks``, ...
        """
        super(ZipEnhancer, self).__init__()
        self.h = h

        self.num_tscblocks = h.num_tsconformers
        self.dense_encoder = DenseEncoder(h, in_channel=2)

        self.TSConformer = Zipformer2DualPathEncoder(
            output_downsampling_factor=1,
            dropout=ScheduledFloat((0.0, 0.3), (20000.0, 0.1)),
            **h.former_conf)

        self.mask_decoder = MappingDecoder(h, out_channel=h.model_num_spks)
        self.phase_decoder = PhaseDecoder(h, out_channel=h.model_num_spks)

    def forward(self, noisy_mag, noisy_pha):
        """Enhance a noisy magnitude/phase pair.

        Args:
            noisy_mag (Tensor): noisy magnitudes, [B, F, T].
            noisy_pha (Tensor): noisy phases, [B, F, T].

        Returns:
            Tuple: (denoised magnitude [B, F, T], denoised phase [B, F, T],
            denoised complex spectrum [B, F, T, 2], None, aux dict).
        """
        aux = dict()

        # [B, F, T] -> [B, 1, T, F], then stack mag and phase on channels.
        mag_in = noisy_mag.unsqueeze(-1).permute(0, 3, 2, 1)
        pha_in = noisy_pha.unsqueeze(-1).permute(0, 3, 2, 1)
        feats = self.dense_encoder(torch.cat((mag_in, pha_in), dim=1))

        # Dual-path modeling over time and frequency, stays [B, C, T, F].
        feats = self.TSConformer(feats)

        mag_out = self.mask_decoder(feats)
        pha_out = self.phase_decoder(feats)

        # Keep speaker channel 0 and go back to [B, F, T].
        denoised_mag = mag_out[:, 0].permute(0, 2, 1)
        denoised_pha = pha_out[:, 0].permute(0, 2, 1)

        denoised_com = torch.stack((denoised_mag * torch.cos(denoised_pha),
                                    denoised_mag * torch.sin(denoised_pha)),
                                   dim=-1)

        return denoised_mag, denoised_pha, denoised_com, None, aux
|
||||
|
||||
|
||||
class AttrDict(dict):
    """A dict whose items are also readable and writable as attributes."""

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        # Alias the attribute namespace to the mapping itself so that
        # ``d.key`` and ``d['key']`` share one storage.
        self.__dict__ = self
|
||||
|
||||
|
||||
def mag_pha_stft(y,
                 n_fft,
                 hop_size,
                 win_size,
                 compress_factor=1.0,
                 center=True):
    """Compute compressed magnitude, phase and complex STFT of a waveform.

    Args:
        y (Tensor): input waveform batch.
        n_fft (int): FFT size.
        hop_size (int): hop length in samples.
        win_size (int): window length in samples (Hann window).
        compress_factor (float): exponent applied to the magnitude
            (magnitude compression); 1.0 means no compression.
        center (bool): passed through to ``torch.stft``.

    Returns:
        Tuple[Tensor, Tensor, Tensor]: magnitude [B, F, T], phase [B, F, T]
        and the compressed complex spectrum stacked as [B, F, T, 2].
    """
    window = torch.hann_window(win_size, device=y.device)
    spec = torch.stft(
        y,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=window,
        center=center,
        pad_mode='reflect',
        normalized=False,
        return_complex=True)
    spec = torch.view_as_real(spec)
    real, imag = spec[..., 0], spec[..., 1]
    # Small epsilons keep sqrt/atan2 numerically stable on silent frames.
    mag = torch.sqrt(real.pow(2) + imag.pow(2) + (1e-9))
    pha = torch.atan2(imag, real + (1e-5))
    # Magnitude compression.
    mag = torch.pow(mag, compress_factor)
    com = torch.stack((mag * torch.cos(pha), mag * torch.sin(pha)), dim=-1)

    return mag, pha, com
|
||||
|
||||
|
||||
def mag_pha_istft(mag,
                  pha,
                  n_fft,
                  hop_size,
                  win_size,
                  compress_factor=1.0,
                  center=True):
    """Reconstruct a waveform from compressed magnitude and phase.

    Inverse of ``mag_pha_stft``: undoes the magnitude compression, rebuilds
    the complex spectrum and applies the inverse STFT with a Hann window.

    Args:
        mag (Tensor): compressed magnitudes [B, F, T].
        pha (Tensor): phases [B, F, T].
        n_fft (int): FFT size.
        hop_size (int): hop length in samples.
        win_size (int): window length in samples.
        compress_factor (float): compression exponent used at analysis time.
        center (bool): passed through to ``torch.istft``.

    Returns:
        Tensor: reconstructed waveform batch.
    """
    # Undo the magnitude compression applied in mag_pha_stft.
    amp = torch.pow(mag, (1.0 / compress_factor))
    spec = torch.complex(amp * torch.cos(pha), amp * torch.sin(pha))
    window = torch.hann_window(win_size, device=spec.device)

    return torch.istft(
        spec,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=window,
        center=center)
|
||||
220
modelscope/models/audio/ans/zipenhancer_layers/generator.py
Normal file
220
modelscope/models/audio/ans/zipenhancer_layers/generator.py
Normal file
@@ -0,0 +1,220 @@
|
||||
#!/usr/bin/env python3
|
||||
#
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
# Part of the implementation is borrowed and modified from MP-SENet,
|
||||
# public available at https://github.com/yxlu-0102/MP-SENet
|
||||
|
||||
import random
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
class SubPixelConvTranspose2d(nn.Module):
    """Upsample the width (frequency) axis via sub-pixel convolution.

    A 1xK convolution emits ``out_channels * stride[1]`` channels which are
    then rearranged into the width dimension.  ``nn.PixelShuffle`` is not
    used because it upscales height and width simultaneously, while here
    only the width must grow; the rearrangement is done manually.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=(1, 3),
                 stride=(1, 2),
                 padding=(0, 1)):
        """
        Args:
            in_channels (int): number of input channels.
            out_channels (int): number of output channels.
            kernel_size (tuple): convolution kernel size, default (1, 3).
            stride (tuple): only ``stride[1]`` is used, as the width
                upscale factor.
            padding (tuple): convolution padding, default (0, 1).
        """
        super(SubPixelConvTranspose2d, self).__init__()
        self.upscale_width_factor = stride[1]
        # The conv produces upscale_width_factor sub-pixel copies of each
        # output channel; forward() interleaves them into the width axis.
        self.conv1 = nn.Conv2d(
            in_channels,
            out_channels * self.upscale_width_factor,
            kernel_size=kernel_size,
            padding=padding)  # only change the width

    def forward(self, x):
        """
        Args:
            x (Tensor): input of shape [B, C_in, T, F].

        Returns:
            Tensor: output of shape [B, C_out, T, F * upscale_width_factor].
        """
        b, _, t, f = x.size()
        # Upsample with conv1, then expand only the width dimension.
        x = self.conv1(x)  # [B, C_out * r, T, F]
        r = self.upscale_width_factor
        # BUGFIX: derive the channel count from the conv *output* rather
        # than the input channel count, so in_channels != out_channels
        # works too (previously this view crashed unless they were equal).
        c_out = x.size(1) // r
        # [B, C_out, r, T, F] -> [B, C_out, T, F, r] -> [B, C_out, T, F*r]
        x = x.view(b, c_out, r, t, f).permute(0, 1, 3, 4, 2).contiguous()
        x = x.view(b, c_out, t, f * r)

        return x
|
||||
|
||||
|
||||
class DenseBlockV2(nn.Module):
    """Dilated dense block for ZipEnhancer.

    Each of ``depth`` stages applies a causally padded, time-dilated
    convolution to the channel-wise concatenation of the block input and
    every previous stage output; the last stage's output is returned.
    """

    def __init__(self, h, kernel_size=(2, 3), depth=4):
        """
        Args:
            h (object): configuration with at least ``dense_channel``.
            kernel_size (tuple): (time, freq) kernel of each stage.
            depth (int): number of dense stages; dilation doubles per stage.
        """
        super(DenseBlockV2, self).__init__()
        self.h = h
        self.depth = depth
        self.dense_block = nn.ModuleList([])
        for stage in range(depth):
            dilation = 2**stage
            # Pad only the top of the time axis (causal) by the effective
            # kernel extent minus one; width is padded by one on each side.
            top_pad = kernel_size[0] + (dilation - 1) * (kernel_size[0]
                                                         - 1) - 1
            self.dense_block.append(
                nn.Sequential(
                    nn.ConstantPad2d((1, 1, top_pad, 0), value=0.),
                    nn.Conv2d(
                        h.dense_channel * (stage + 1),
                        h.dense_channel,
                        kernel_size,
                        dilation=(dilation, 1)),
                    nn.InstanceNorm2d(h.dense_channel, affine=True),
                    nn.PReLU(h.dense_channel)))

    def forward(self, x):
        """Run the dense stages; output has the same shape as ``x``."""
        stacked = x
        out = x
        # b, c, t, f
        for stage in range(self.depth):
            out = self.dense_block[stage](stacked)
            stacked = torch.cat([out, stacked], dim=1)
        return out
|
||||
|
||||
|
||||
class DenseEncoder(nn.Module):
    """Input encoder: 1x1 conv, dilated dense block, width-halving conv."""

    def __init__(self, h, in_channel):
        """
        Args:
            h (object): configuration with at least ``dense_channel``.
            in_channel (int): input channels (e.g. 2 for magnitude + phase).
        """
        super(DenseEncoder, self).__init__()
        self.h = h
        self.dense_conv_1 = nn.Sequential(
            nn.Conv2d(in_channel, h.dense_channel, (1, 1)),
            nn.InstanceNorm2d(h.dense_channel, affine=True),
            nn.PReLU(h.dense_channel))

        self.dense_block = DenseBlockV2(h, depth=4)

        # Padding was originally (0, 0); (0, 1) lets the strided conv halve
        # the frequency axis cleanly.
        encoder_pad_kersize = (0, 1)
        self.dense_conv_2 = nn.Sequential(
            nn.Conv2d(
                h.dense_channel,
                h.dense_channel, (1, 3), (1, 2),
                padding=encoder_pad_kersize),
            nn.InstanceNorm2d(h.dense_channel, affine=True),
            nn.PReLU(h.dense_channel))

    def forward(self, x):
        """Encode the input.

        Args:
            x (Tensor): input of shape [B, in_channel, T, F].

        Returns:
            Tensor: roughly [B, dense_channel, T, F // 2] after the
            width-striding conv.
        """
        out = self.dense_conv_1(x)  # [b, dense_channel, T, F]
        if self.dense_block is not None:
            out = self.dense_block(out)  # [b, dense_channel, T, F]
        return self.dense_conv_2(out)  # [b, dense_channel, T, F//2]
|
||||
|
||||
|
||||
class BaseDecoder(nn.Module):
    """Shared decoder stem for the magnitude and phase decoders.

    Holds the width-upsampler class handle and the dense block that both
    subclasses run before their own heads.
    """

    def __init__(self, h):
        """
        Args:
            h (object): configuration with at least ``dense_channel``.
        """
        super(BaseDecoder, self).__init__()

        # Class (not instance) of the upsampler; subclasses instantiate it.
        self.upsample_module_class = SubPixelConvTranspose2d

        # Shared pre-head dense block for both mag and phase decoders.
        self.dense_block = DenseBlockV2(h, depth=4)
|
||||
|
||||
|
||||
class MappingDecoder(BaseDecoder):
    """Magnitude decoder: upsample the width axis, map to speaker channels."""

    def __init__(self, h, out_channel=1):
        """
        Args:
            h (object): configuration with at least ``dense_channel``.
            out_channel (int): number of output speakers, default 1.
        """
        super(MappingDecoder, self).__init__(h)
        # The final (1, 2) kernel trims the extra frequency bin left over
        # from the width upsampling.
        decoder_final_kersize = (1, 2)

        self.mask_conv = nn.Sequential(
            self.upsample_module_class(h.dense_channel, h.dense_channel,
                                       (1, 3), (1, 2)),
            # nn.Conv2d(h.dense_channel, out_channel, (1, 1)),
            nn.InstanceNorm2d(h.dense_channel, affine=True),
            nn.PReLU(h.dense_channel),
            nn.Conv2d(h.dense_channel, out_channel, decoder_final_kersize))

        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Decode magnitudes.

        Args:
            x (Tensor): input of shape [B, C, T, F].

        Returns:
            Tensor: non-negative magnitudes, [B, out_channel, T, F'].
        """
        if self.dense_block is not None:
            x = self.dense_block(x)
        # ReLU keeps the predicted magnitudes non-negative.
        return self.relu(self.mask_conv(x))
|
||||
|
||||
|
||||
class PhaseDecoder(BaseDecoder):
    """Phase decoder: predicts real/imaginary parts and takes their angle."""

    def __init__(self, h, out_channel=1):
        """
        Args:
            h (object): configuration with at least ``dense_channel``.
            out_channel (int): number of output speakers, default 1.
        """
        super(PhaseDecoder, self).__init__(h)

        # Final kernel is now (1, 2), previously (1, 1): trims the extra
        # frequency bin left over from the width upsampling.
        decoder_final_kersize = (1, 2)

        self.phase_conv = nn.Sequential(
            self.upsample_module_class(h.dense_channel, h.dense_channel,
                                       (1, 3), (1, 2)),
            nn.InstanceNorm2d(h.dense_channel, affine=True),
            nn.PReLU(h.dense_channel))
        self.phase_conv_r = nn.Conv2d(h.dense_channel, out_channel,
                                      decoder_final_kersize)
        self.phase_conv_i = nn.Conv2d(h.dense_channel, out_channel,
                                      decoder_final_kersize)

    def forward(self, x):
        """Decode phases.

        Args:
            x (Tensor): input of shape [B, C, T, F].

        Returns:
            Tensor: phases in (-pi, pi], [B, out_channel, T, F'].
        """
        if self.dense_block is not None:
            x = self.dense_block(x)
        x = self.phase_conv(x)
        x_r = self.phase_conv_r(x)
        x_i = self.phase_conv_i(x)
        return torch.atan2(x_i, x_r)
|
||||
1055
modelscope/models/audio/ans/zipenhancer_layers/scaling.py
Normal file
1055
modelscope/models/audio/ans/zipenhancer_layers/scaling.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,501 @@
|
||||
#!/usr/bin/env python3
|
||||
#
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
import copy
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
from torch import Tensor, nn
|
||||
|
||||
from .scaling import FloatLike, ScheduledFloat, convert_num_channels
|
||||
from .zipformer import (BypassModule, CompactRelPositionalEncoding,
|
||||
SimpleDownsample, SimpleUpsample,
|
||||
Zipformer2EncoderLayer)
|
||||
|
||||
|
||||
class DualPathZipformer2Encoder(nn.Module):
    r"""DualPathZipformer2Encoder is a stack of N encoder layers.

    It has two kinds of encoder layers (frequency-path ``f_layers`` and
    time-path ``t_layers``); the features are modeled with the shapes
    [B, C, T, F] -> [F, T * B, C] -> [B, C, T, F] -> [T, F * B, C] -> [B, C, T, F]

    Args:
        encoder_layer: an instance of the Zipformer2EncoderLayer() class (required).
        num_layers: the number of sub-encoder-layers in the encoder (required).
        pos_dim: the dimension for the relative positional encoding.

    Examples::
        >>> encoder_layer = Zipformer2EncoderLayer(embed_dim=512, nhead=8)
        >>> dualpath_zipformer_encoder = DualPathZipformer2Encoder(encoder_layer, num_layers=6)
        >>> src = torch.rand(10, 512, 161, 101)
        >>> out = dualpath_zipformer_encoder(src)
    """

    def __init__(
        self,
        encoder_layer: nn.Module,
        num_layers: int,
        pos_dim: int,
        dropout: float,
        warmup_begin: float,
        warmup_end: float,
        initial_layerdrop_rate: float = 0.5,
        final_layerdrop_rate: float = 0.05,
        bypass_layer=None,
    ) -> None:
        """
        Initialize the DualPathZipformer2Encoder module with the specified
        encoder layer, number of layers, positional dimension, dropout rate,
        warmup period, and layer drop rates.  Each path gets its own deep
        copy of ``encoder_layer``; a per-layer layerdrop schedule is staggered
        across layers over the [warmup_begin, warmup_end] batch range.
        """
        super().__init__()
        self.encoder_pos = CompactRelPositionalEncoding(
            pos_dim, dropout_rate=0.15, length_factor=1.0)

        self.f_layers = nn.ModuleList(
            [copy.deepcopy(encoder_layer) for i in range(num_layers)])
        self.t_layers = nn.ModuleList(
            [copy.deepcopy(encoder_layer) for i in range(num_layers)])
        # NOTE(review): unlike the deep-copied encoder layers, the SAME
        # bypass_layer instance is repeated here, so all 2 * num_layers
        # entries share parameters — confirm this sharing is intentional.
        self.bypass_layers = nn.ModuleList(
            [bypass_layer for i in range(num_layers * 2)])
        self.num_layers = num_layers

        assert 0 <= warmup_begin <= warmup_end, (warmup_begin, warmup_end)

        # Stagger the layerdrop warmup window evenly across the layers;
        # both the f- and t-path layer of index i share the same window.
        delta = (1.0 / num_layers) * (warmup_end - warmup_begin)
        cur_begin = warmup_begin  # interpreted as a training batch index
        for i in range(num_layers):
            cur_end = cur_begin + delta
            self.f_layers[i].bypass.skip_rate = ScheduledFloat(
                (cur_begin, initial_layerdrop_rate),
                (cur_end, final_layerdrop_rate),
                default=0.0,
            )
            self.t_layers[i].bypass.skip_rate = ScheduledFloat(
                (cur_begin, initial_layerdrop_rate),
                (cur_end, final_layerdrop_rate),
                default=0.0,
            )
            cur_begin = cur_end

    def forward(
        self,
        src: Tensor,
        chunk_size: int = -1,
        feature_mask: Union[Tensor, float] = 1.0,
        attn_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
    ) -> Tensor:
        r"""Pass the input through the encoder layers in a dual-path manner,
        processing both temporal and frequency dimensions.

        Args:
            src: the dual-path sequence to the encoder (required):
                shape (batch_size, embedding_dim, seq_len, frequency_len).
            chunk_size: the number of frames per chunk, of >= 0; if -1, no
                chunking. Not used (the per-layer chunk argument is
                commented out below).
            feature_mask: something that broadcasts with src, that we'll
                multiply `src` by at every layer: if a Tensor, likely of
                shape (seq_len, batch_size, embedding_dim).
            attn_mask: the attention mask, of shape (batch_size, seq_len,
                seq_len) or (seq_len, seq_len), interpreted as (batch_size,
                tgt_seq_len, src_seq_len) or (tgt_seq_len, src_seq_len).
                True means masked position. May be None. Not forwarded to
                the layers here (commented out below).
            src_key_padding_mask: the mask for padding, of shape
                (batch_size, seq_len); True means masked position. May be
                None.

        Returns: a Tensor with the same shape as src.
        """

        # src: (b, c, t, f)
        b, c, t, f = src.size()
        # Positional encodings are derived once from each path's layout.
        # NOTE(review): src_f is flattened as (f, b * t, c) here while the
        # loop below feeds the layers (f, t * b, c); shapes match, so the
        # positional encoding only depends on the sequence length.
        src_f = src.permute(3, 0, 2, 1).contiguous().view(f, b * t, c)
        src_t = src.permute(2, 0, 3, 1).contiguous().view(t, b * f, c)
        pos_emb_f = self.encoder_pos(src_f)
        pos_emb_t = self.encoder_pos(src_t)

        output = src

        if not torch.jit.is_scripting() and not torch.jit.is_tracing():
            output = output * feature_mask

        for i in range(len(self.f_layers)):
            # --- frequency path: attend across F for every (t, b) pair ---
            # output_org = output
            # (b, c, t, f)
            output_f_org = output.permute(3, 2, 0,
                                          1).contiguous()  # (f, t, b, c)
            output_f = output_f_org.view(f, t * b, c)
            # (f, t * b, c)
            output_f = self.f_layers[i](
                output_f,
                pos_emb_f,
                # chunk_size=chunk_size,
                # attn_mask=attn_mask,
                src_key_padding_mask=src_key_padding_mask,
            )
            output_f = output_f.view(f, t, b, c)
            # Residual-style combination of pre- and post-layer features.
            output_f = self.bypass_layers[i * 2](output_f_org, output_f)

            # (f, t, b, c)
            output = output_f.permute(2, 3, 1, 0).contiguous()
            # (b, c, t, f)
            # output = self.bypass_layers[i * 2](output_org, output)

            # output_org = output

            # --- time path: attend across T for every (f, b) pair ---
            output_t_org = output.permute(2, 3, 0,
                                          1).contiguous()  # (t, f, b, c)
            output_t = output_t_org.view(t, f * b, c)
            output_t = self.t_layers[i](
                output_t,
                pos_emb_t,
                # chunk_size=chunk_size,
                # attn_mask=attn_mask,
                src_key_padding_mask=src_key_padding_mask,
            )
            output_t = output_t.view(t, f, b, c)
            output_t = self.bypass_layers[i * 2 + 1](output_t_org, output_t)
            # (t, f, b, c)

            output = output_t.permute(2, 3, 0, 1).contiguous()
            # (b, c, t, f)
            # output = self.bypass_layers[i * 2 + 1](output_org, output)

            if not torch.jit.is_scripting() and not torch.jit.is_tracing():
                output = output * feature_mask

        return output
|
||||
|
||||
|
||||
class DualPathDownsampledZipformer2Encoder(nn.Module):
    r"""
    DualPathDownsampledZipformer2Encoder is a dual-path zipformer encoder evaluated at a reduced frame rate,
    after convolutional downsampling, and then upsampled again at the output, and combined
    with the origin input, so that the output has the same shape as the input.
    The features are downsampled-upsampled at the time and frequency domain.
    """

    def __init__(self, encoder: nn.Module, dim: int, t_downsample: int,
                 f_downsample: int, dropout: FloatLike):
        """
        Initialize the DualPathDownsampledZipformer2Encoder.

        Args:
            encoder: the wrapped encoder that runs at the reduced
                time/frequency resolution.
            dim: embedding (channel) dimension of the features.
            t_downsample: downsampling factor along the time axis; a value of
                1 disables time downsampling (no downsample_t/upsample_t
                modules are created).
            f_downsample: downsampling factor along the frequency axis; a
                value of 1 disables frequency downsampling.
            dropout: dropout rate passed to the SimpleDownsample modules.
        """
        super(DualPathDownsampledZipformer2Encoder, self).__init__()
        # `downsample_factor` is kept as an alias of the time factor for
        # compatibility with code that expects this attribute name.
        self.downsample_factor = t_downsample
        self.t_downsample_factor = t_downsample
        self.f_downsample_factor = f_downsample

        if self.t_downsample_factor != 1:
            self.downsample_t = SimpleDownsample(dim, t_downsample, dropout)
            self.upsample_t = SimpleUpsample(dim, t_downsample)
        if self.f_downsample_factor != 1:
            self.downsample_f = SimpleDownsample(dim, f_downsample, dropout)
            self.upsample_f = SimpleUpsample(dim, f_downsample)

        # self.num_layers = encoder.num_layers
        self.encoder = encoder

        # Learned bypass that combines the original input with the
        # downsampled-processed-upsampled output.
        self.out_combiner = BypassModule(dim, straight_through_rate=0)

    def forward(
        self,
        src: Tensor,
        chunk_size: int = -1,
        feature_mask: Union[Tensor, float] = 1.0,
        attn_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
    ) -> Tensor:
        r"""Downsample the input, process through the encoder, and then upsample back to the original shape.

        Args:
            src: the sequence to the encoder (required): shape (batch_size, embedding_dim, seq_len, frequency_len).
            chunk_size: forwarded to the wrapped encoder (-1 means no chunking).
            feature_mask: 1.0
            attn_mask: None
            src_key_padding_mask: None.

        Returns: a Tensor with the same shape as src. (batch_size, embedding_dim, seq_len, frequency_len)
        """
        # src: (b, c, t, f)
        b, c, t, f = src.size()
        # print(src.size())

        # Keep the original input in (t, f, b, c) layout for the final
        # bypass combination.
        src_orig = src.permute(2, 3, 0, 1)  # (t, f, b, c)

        # (b, c, t, f)
        src = src.permute(2, 0, 3, 1).contiguous().view(t, b * f, c)
        # -> (t, b * f, c)
        if self.t_downsample_factor != 1:
            src = self.downsample_t(src)
        # (t//ds + 1, b * f, c)
        # The downsampled time length may include a padded frame, so read the
        # actual length back from the tensor.
        downsample_t = src.size(0)
        # Fold time into the batch dimension and expose frequency as the
        # sequence dimension so the frequency axis can be downsampled next.
        src = src.view(downsample_t, b, f,
                       c).permute(2, 1, 0,
                                  3).contiguous().view(f, b * downsample_t, c)
        # src = self.upsample_f(src)
        if self.f_downsample_factor != 1:
            src = self.downsample_f(src)
        # (f//ds + 1, b * downsample_t, c)
        downsample_f = src.size(0)
        src = src.view(downsample_f, b, downsample_t, c).permute(1, 3, 2, 0)
        # (b, c, downsample_t, downsample_f)
        # print(src.size())

        # ds = self.downsample_factor
        # if attn_mask is not None:
        #     attn_mask = attn_mask[::ds, ::ds]

        # Run the wrapped dual-path encoder at the reduced resolution.
        src = self.encoder(
            src,
            chunk_size=chunk_size,
            feature_mask=feature_mask,
            attn_mask=attn_mask,
            src_key_padding_mask=src_key_padding_mask,
        )

        # (b, c, downsample_t, downsample_f)
        # Upsample frequency first (mirroring the downsampling order), then
        # truncate any padded frames back to the original length f.
        src = src.permute(3, 0, 2,
                          1).contiguous().view(downsample_f, b * downsample_t,
                                               c)
        if self.f_downsample_factor != 1:
            src = self.upsample_f(src)
        # (f, b * downsample_t, c)
        src = src[:f].view(f, b, downsample_t,
                           c).permute(2, 1, 0, 3).contiguous().view(
                               downsample_t, b * f, c)
        # (downsample_t, b * f, c)
        if self.t_downsample_factor != 1:
            src = self.upsample_t(src)
        # (t, b * f, c)
        # Truncate padded time frames back to the original length t.
        src = src[:t].view(t, b, f, c).permute(0, 2, 1, 3).contiguous()
        # (t, f, b, c)
        out = self.out_combiner(src_orig, src)
        # (t, f, b, c)

        out = out.permute(2, 3, 0, 1).contiguous()
        # (b, c, t, f)
        # print(out.size())

        # remove any extra frames that are not a multiple of downsample_factor
        # src = src[: src_orig.shape[0]] # slice here

        return out
||||
class Zipformer2DualPathEncoder(nn.Module):
    """
    Dual-path Zipformer2 encoder stack that processes hidden features of the
    noisy speech with shape [B, C, T, F].

    It is built from two kinds of blocks: DualPathZipformer2Encoder, which
    processes the 4D features directly, and DualPathDownsampledZipformer2Encoder,
    which first downsamples the features in time/frequency, processes them the
    same way, and upsamples back.
    """

    def __init__(
        self,
        output_downsampling_factor: int = 2,
        downsampling_factor: Tuple[int] = (2, 4),
        f_downsampling_factor: Optional[Tuple[int]] = None,
        encoder_dim: Union[int, Tuple[int]] = 384,
        num_encoder_layers: Union[int, Tuple[int]] = 4,
        encoder_unmasked_dim: Union[int, Tuple[int]] = 256,
        query_head_dim: Union[int, Tuple[int]] = 24,
        pos_head_dim: Union[int, Tuple[int]] = 4,
        value_head_dim: Union[int, Tuple[int]] = 12,
        num_heads: Union[int, Tuple[int]] = 8,
        feedforward_dim: Union[int, Tuple[int]] = 1536,
        cnn_module_kernel: Union[int, Tuple[int]] = 31,
        pos_dim: int = 192,
        dropout: FloatLike = None,  # see code below for default
        warmup_batches: float = 4000.0,
        causal: bool = False,
        # NOTE: defaults changed from mutable lists ([-1]) to tuples to avoid
        # the shared-mutable-default pitfall; the values are unchanged.
        chunk_size: Tuple[int] = (-1, ),
        left_context_frames: Tuple[int] = (-1, ),
    ):
        """
        Initialize the Zipformer2DualPathEncoder module.

        Args:
            output_downsampling_factor: downsample factor of the final
                SimpleDownsample applied to the output.
            downsampling_factor: per-stack time downsampling factors; its
                length determines the number of encoder stacks.
            f_downsampling_factor: per-stack frequency downsampling factors;
                defaults to ``downsampling_factor`` when None.
            encoder_dim: per-stack embedding dims (int is broadcast to all).
            num_encoder_layers: per-stack layer counts.
            encoder_unmasked_dim: per-stack unmasked dims; each must be
                <= the corresponding encoder_dim.
            query_head_dim / pos_head_dim / value_head_dim / num_heads /
            feedforward_dim / cnn_module_kernel: per-stack attention and
                convolution hyperparameters.
            pos_dim: dimension of the positional encoding.
            dropout: dropout schedule; defaults to
                ScheduledFloat((0.0, 0.3), (20000.0, 0.1)) when None.
            warmup_batches: number of batches over which the stacks warm up.
            causal: whether the encoder layers are causal.
            chunk_size / left_context_frames: streaming-related settings,
                stored on the module for use by the encoder layers.
        """
        super(Zipformer2DualPathEncoder, self).__init__()

        if dropout is None:
            dropout = ScheduledFloat((0.0, 0.3), (20000.0, 0.1))

        def _to_tuple(x):
            """Converts a single int or a 1-tuple of an int to a tuple with the same length
            as downsampling_factor"""
            if isinstance(x, int):
                x = (x, )
            if len(x) == 1:
                x = x * len(downsampling_factor)
            else:
                assert len(x) == len(downsampling_factor) and isinstance(
                    x[0], int)
            return x

        self.output_downsampling_factor = output_downsampling_factor  # int
        self.downsampling_factor = downsampling_factor  # tuple

        # Frequency factors default to the time factors when not given.
        if f_downsampling_factor is None:
            f_downsampling_factor = downsampling_factor
        self.f_downsampling_factor = _to_tuple(f_downsampling_factor)

        self.encoder_dim = encoder_dim = _to_tuple(encoder_dim)  # tuple
        self.encoder_unmasked_dim = encoder_unmasked_dim = _to_tuple(
            encoder_unmasked_dim)  # tuple
        num_encoder_layers = _to_tuple(num_encoder_layers)
        self.num_encoder_layers = num_encoder_layers
        self.query_head_dim = query_head_dim = _to_tuple(query_head_dim)
        self.value_head_dim = value_head_dim = _to_tuple(value_head_dim)
        pos_head_dim = _to_tuple(pos_head_dim)
        self.num_heads = num_heads = _to_tuple(num_heads)
        feedforward_dim = _to_tuple(feedforward_dim)
        self.cnn_module_kernel = cnn_module_kernel = _to_tuple(
            cnn_module_kernel)

        self.causal = causal
        self.chunk_size = chunk_size
        self.left_context_frames = left_context_frames

        for u, d in zip(encoder_unmasked_dim, encoder_dim):
            assert u <= d

        # each one will be Zipformer2Encoder or DownsampledZipformer2Encoder
        encoders = []

        num_encoders = len(downsampling_factor)
        # "1,2,4,8,4,2",

        for i in range(num_encoders):
            encoder_layer = Zipformer2EncoderLayer(
                embed_dim=encoder_dim[i],
                pos_dim=pos_dim,
                num_heads=num_heads[i],
                query_head_dim=query_head_dim[i],
                pos_head_dim=pos_head_dim[i],
                value_head_dim=value_head_dim[i],
                feedforward_dim=feedforward_dim[i],
                dropout=dropout,
                cnn_module_kernel=cnn_module_kernel[i],
                causal=causal,
            )

            # For the segment of the warmup period, we let the Conv2dSubsampling
            # layer learn something. Then we start to warm up the other encoders.
            encoder = DualPathZipformer2Encoder(
                encoder_layer,
                num_encoder_layers[i],
                pos_dim=pos_dim,
                dropout=dropout,
                warmup_begin=warmup_batches * (i + 1) / (num_encoders + 1),
                warmup_end=warmup_batches * (i + 2) / (num_encoders + 1),
                final_layerdrop_rate=0.035 * (downsampling_factor[i]**0.5),
                bypass_layer=BypassModule(
                    encoder_dim[i], straight_through_rate=0),
            )

            # Wrap stacks that downsample in either domain.
            if downsampling_factor[i] != 1 or f_downsampling_factor[i] != 1:
                encoder = DualPathDownsampledZipformer2Encoder(
                    encoder,
                    dim=encoder_dim[i],
                    t_downsample=downsampling_factor[i],
                    f_downsample=f_downsampling_factor[i],
                    dropout=dropout,
                )

            encoders.append(encoder)

        self.encoders = nn.ModuleList(encoders)

        self.downsample_output = SimpleDownsample(
            max(encoder_dim),
            downsample=output_downsampling_factor,
            dropout=dropout)

    def forward(self, x):
        """
        Forward pass of the Zipformer2DualPathEncoder module.

        Args:
            x (Tensor): Input tensor of shape [B, C, T, F].

        Returns:
            Tensor: Output tensor of the last encoder stack, shape [B, C, T, F].
        """
        # Feature masking is disabled here; kept as a list so each stack can
        # be given its own mask if re-enabled later.
        # if torch.jit.is_scripting() or torch.jit.is_tracing():
        #     feature_masks = [1.0] * len(self.encoder_dim)
        # else:
        #     feature_masks = self.get_feature_masks(x)
        feature_masks = [1.0] * len(self.encoder_dim)
        attn_mask = None

        chunk_size = -1
        # left_context_chunks = -1

        for i, module in enumerate(self.encoders):
            # Adapt the channel count to this stack's embedding dim.
            x = convert_num_channels(x, self.encoder_dim[i])

            x = module(
                x,
                chunk_size=chunk_size,
                feature_mask=feature_masks[i],
                src_key_padding_mask=None,
                attn_mask=attn_mask,
            )

        # (b, c, t, f)
        return x
if __name__ == '__main__':
    # Smoke test: build a small 4-stack dual-path encoder and push one
    # random (batch, channel, time, freq) tensor through it.
    # {2,2,2,2,2,2} {192,256,256,256,256,256} {512,768,768,768,768,768}
    downsampling_factor = (1, 2, 4, 3)
    num_stacks = len(downsampling_factor)

    encoder_dim = (16, 32, 64, 64)
    feedforward_dim = (32, 64, 128, 128)
    num_encoder_layers = (1, 1, 1, 1)
    num_heads = (4, 4, 4, 4)  # "4,4,4,8,4,4"
    pos_dim = 48  # zipformer base setting
    query_head_dim = (16, ) * num_stacks  # 32
    pos_head_dim = (4, ) * num_stacks  # 4
    value_head_dim = (12, ) * num_stacks  # 12
    cnn_module_kernel = (15, ) * num_stacks  # 31,31,15,15,15,31
    encoder_unmasked_dim = (16, ) * num_stacks
    dropout = ScheduledFloat((0.0, 0.3), (20000.0, 0.1))
    causal = False
    warmup_batches = 4000.0

    net = Zipformer2DualPathEncoder(
        output_downsampling_factor=1,
        downsampling_factor=downsampling_factor,
        num_encoder_layers=num_encoder_layers,
        encoder_dim=encoder_dim,
        encoder_unmasked_dim=encoder_unmasked_dim,
        query_head_dim=query_head_dim,
        pos_head_dim=pos_head_dim,
        value_head_dim=value_head_dim,
        pos_dim=pos_dim,
        num_heads=num_heads,
        feedforward_dim=feedforward_dim,
        cnn_module_kernel=cnn_module_kernel,
        dropout=dropout,
        warmup_batches=warmup_batches,
        causal=causal,
    )

    # One random input: batch=4, channels=64, time=321, freq=101.
    batch, channels, frames, freqs = 4, 64, 321, 101
    x = torch.randn((batch, channels, frames, freqs))
    x = net(x)
    print(x.size())
1084
modelscope/models/audio/ans/zipenhancer_layers/zipformer.py
Normal file
1084
modelscope/models/audio/ans/zipenhancer_layers/zipformer.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -122,3 +122,127 @@ class ANSPipeline(Pipeline):
|
||||
np.frombuffer(inputs[OutputKeys.OUTPUT_PCM], dtype=np.int16),
|
||||
self.SAMPLE_RATE)
|
||||
return inputs
|
||||
|
||||
|
||||
@PIPELINES.register_module(
    Tasks.acoustic_noise_suppression,
    module_name=Pipelines.speech_zipenhancer_ans_multiloss_16k_base)
class ANSZipEnhancerPipeline(Pipeline):
    r"""ANS (Acoustic Noise Suppression) inference pipeline for ZipEnhancer.

    When invoking the class with pipeline.__call__(), it accepts only one
    parameter:
        inputs(str or bytes): the path/URL of a wav file, or raw wav bytes.
    """
    # Model operating sample rate; inputs at other rates are resampled.
    SAMPLE_RATE = 16000

    def __init__(self, model, **kwargs):
        """
        Use `model` and `preprocessor` to create an ANS pipeline for prediction.

        Args:
            model: model id on modelscope hub.
        """
        super().__init__(model=model, **kwargs)
        self.model.eval()
        # Streaming is not supported by this model; preprocess() raises if set.
        self.stream_mode = kwargs.get('stream_mode', False)

    def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]:
        """Decode, mono-ize, resample and normalize the input audio.

        Returns a dict with:
            ndarray: float32 array of shape (1, nsamples).
            nsamples: original sample count (used to trim padding later).
        """
        if self.stream_mode:
            raise TypeError('This model does not support stream mode!')
        if isinstance(inputs, bytes):
            data1, fs = sf.read(io.BytesIO(inputs))
        elif isinstance(inputs, str):
            # file_bytes = File.read(inputs)
            # data1, fs = sf.read(io.BytesIO(file_bytes))
            data1, fs = sf.read(inputs)
        else:
            raise TypeError(f'Unsupported type {type(inputs)}.')
        # Multi-channel audio: keep only the first channel.
        if len(data1.shape) > 1:
            data1 = data1[:, 0]
        if fs != self.SAMPLE_RATE:
            data1 = librosa.resample(
                data1, orig_sr=fs, target_sr=self.SAMPLE_RATE)
        data1 = audio_norm(data1)
        data = data1.astype(np.float32)
        inputs = np.reshape(data, [1, data.shape[0]])
        return {'ndarray': inputs, 'nsamples': data.shape[0]}

    def forward(self, inputs: Dict[str, Any],
                **forward_params) -> Dict[str, Any]:
        """Run enhancement, with overlapped segmented decoding for long audio.

        Audio longer than 10 s is processed in 2 s windows with 75% stride;
        window edges of length (window - stride) // 2 are discarded on each
        side to avoid boundary artifacts. Output is trimmed to the original
        length and returned as 16-bit PCM bytes.
        """
        ndarray = inputs['ndarray']
        if isinstance(ndarray, torch.Tensor):
            ndarray = ndarray.cpu().numpy()
        nsamples = inputs['nsamples']
        decode_do_segement = False
        window = 16000 * 2  # 2s
        stride = int(window * 0.75)
        print('inputs:{}'.format(ndarray.shape))
        b, t = ndarray.shape  # size()
        if t > window * 5:  # 10s
            decode_do_segement = True
            print('decode_do_segement')

        # Zero-pad so the signal is at least one window long, and (for
        # segmented decoding) so the strided windows tile it exactly.
        if t < window:
            ndarray = np.concatenate(
                [ndarray, np.zeros((ndarray.shape[0], window - t))], 1)
        elif decode_do_segement:
            if t < window + stride:
                padding = window + stride - t
                print('padding: {}'.format(padding))
                ndarray = np.concatenate(
                    [ndarray, np.zeros((ndarray.shape[0], padding))], 1)
            else:
                if (t - window) % stride != 0:
                    # padding = t - (t - window) // stride * stride
                    padding = (
                        (t - window) // stride + 1) * stride + window - t
                    print('padding: {}'.format(padding))
                    ndarray = np.concatenate(
                        [ndarray,
                         np.zeros((ndarray.shape[0], padding))], 1)
        # else:
        #     if (t - window) % stride != 0:
        #         padding = t - (t - window) // stride * stride
        #         print('padding: {}'.format(padding))
        #         ndarray = np.concatenate(
        #             [ndarray, np.zeros((ndarray.shape[0], padding))], 1)
        print('inputs after padding:{}'.format(ndarray.shape))
        with torch.no_grad():
            ndarray = torch.from_numpy(np.float32(ndarray)).to(self.device)
            b, t = ndarray.shape
            if decode_do_segement:
                outputs = np.zeros(t)
                # Samples discarded at each segment edge to suppress
                # boundary artifacts.
                give_up_length = (window - stride) // 2
                current_idx = 0
                while current_idx + window <= t:
                    # print('current_idx: {}'.format(current_idx))
                    print(
                        '\rcurrent_idx: {} {:.2f}%'.format(
                            current_idx, current_idx * 100 / t),
                        end='')
                    tmp_input = dict(noisy=ndarray[:, current_idx:current_idx
                                                   + window])
                    # 'wav_l2' is the enhanced waveform key in the model
                    # output dict.
                    tmp_output = self.model(
                        tmp_input, )['wav_l2'][0].cpu().numpy()
                    end_index = current_idx + window - give_up_length
                    if current_idx == 0:
                        # First segment: keep its left edge intact.
                        outputs[current_idx:
                                end_index] = tmp_output[:-give_up_length]
                    else:
                        outputs[current_idx
                                + give_up_length:end_index] = tmp_output[
                                    give_up_length:-give_up_length]
                    current_idx += stride
                print('\rcurrent_idx: {} {:.2f}%'.format(current_idx, 100))
            else:
                # Short audio: one pass over the whole (padded) signal.
                outputs = self.model(
                    dict(noisy=ndarray))['wav_l2'][0].cpu().numpy()
        # Trim padding and convert float [-1, 1] to 16-bit PCM bytes.
        outputs = (outputs[:nsamples] * 32768).astype(np.int16).tobytes()
        return {OutputKeys.OUTPUT_PCM: outputs}

    def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
        """Optionally write the PCM result to `output_path` as a wav file."""
        if 'output_path' in kwargs.keys():
            sf.write(
                kwargs['output_path'],
                np.frombuffer(inputs[OutputKeys.OUTPUT_PCM], dtype=np.int16),
                self.SAMPLE_RATE)
        return inputs
||||
@@ -1,6 +1,6 @@
|
||||
addict
|
||||
attrs
|
||||
datasets>=3.0.0
|
||||
datasets>=3.0.0,<=3.0.1
|
||||
einops
|
||||
oss2
|
||||
Pillow
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
addict
|
||||
attrs
|
||||
datasets>=3.0.0
|
||||
datasets>=3.0.0,<=3.0.1
|
||||
einops
|
||||
oss2
|
||||
Pillow
|
||||
|
||||
@@ -150,6 +150,36 @@ class SpeechSignalProcessTest(unittest.TestCase):
|
||||
w.write(pcm)
|
||||
audio = f.read(block_size)
|
||||
|
||||
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_zipenhancer_ans(self):
    # Denoise a local wav file with the ZipEnhancer ANS model.
    wav_file = os.path.join(os.getcwd(), NOISE_SPEECH_FILE)
    output_path = os.path.abspath('output.wav')
    denoiser = pipeline(
        Tasks.acoustic_noise_suppression,
        model='damo/speech_zipenhancer_ans_multiloss_16k_base')
    denoiser(wav_file, output_path=output_path)
    print(f'Processed audio saved to {output_path}')
|
||||
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_zipenhancer_ans_url(self):
    # Denoise audio fetched from a URL input.
    output_path = os.path.abspath('output.wav')
    denoiser = pipeline(
        Tasks.acoustic_noise_suppression,
        model='damo/speech_zipenhancer_ans_multiloss_16k_base')
    denoiser(NOISE_SPEECH_URL, output_path=output_path)
    print(f'Processed audio saved to {output_path}')
||||
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_zipenhancer_ans_bytes(self):
    # Denoise raw wav bytes, selecting the pipeline implementation by name.
    denoiser = pipeline(
        Tasks.acoustic_noise_suppression,
        model='damo/speech_zipenhancer_ans_multiloss_16k_base',
        pipeline_name=Pipelines.speech_zipenhancer_ans_multiloss_16k_base)
    output_path = os.path.abspath('output.wav')
    with open(os.path.join(os.getcwd(), NOISE_SPEECH_FILE), 'rb') as fh:
        wav_bytes = fh.read()
    denoiser(wav_bytes, output_path=output_path)
    print(f'Processed audio saved to {output_path}')
|
||||
|
||||
if __name__ == '__main__':
    # Allow running this test module directly with `python <file>`.
    unittest.main()
||||
|
||||
Reference in New Issue
Block a user