diff --git a/data/test/audios/speech_with_noise_48k.pcm b/data/test/audios/speech_with_noise_48k.pcm new file mode 100644 index 00000000..3e4af18f Binary files /dev/null and b/data/test/audios/speech_with_noise_48k.pcm differ diff --git a/data/test/audios/speech_with_noise_48k.wav b/data/test/audios/speech_with_noise_48k.wav new file mode 100644 index 00000000..ccee3da3 --- /dev/null +++ b/data/test/audios/speech_with_noise_48k.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9e76c8448e93934ed9c8827b76f702d07fccc3e586900903617971471235800 +size 475278 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index c1af4119..ba01b2e8 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -161,6 +161,7 @@ class Models(object): # audio models sambert_hifigan = 'sambert-hifigan' speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' + speech_dfsmn_ans = 'speech_dfsmn_ans' speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield' speech_kws_fsmn_char_ctc_nearfield = 'speech_kws_fsmn_char_ctc_nearfield' speech_mossformer_separation_temporal_8k = 'speech_mossformer_separation_temporal_8k' @@ -441,6 +442,7 @@ class Pipelines(object): sambert_hifigan_tts = 'sambert-hifigan-tts' speech_dfsmn_aec_psm_16k = 'speech-dfsmn-aec-psm-16k' speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' + speech_dfsmn_ans_psm_48k_causal = 'speech_dfsmn_ans_psm_48k_causal' speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield' speech_separation = 'speech-separation' kws_kwsbp = 'kws-kwsbp' diff --git a/modelscope/models/audio/ans/__init__.py b/modelscope/models/audio/ans/__init__.py index afcdf314..b88a787a 100644 --- a/modelscope/models/audio/ans/__init__.py +++ b/modelscope/models/audio/ans/__init__.py @@ -9,6 +9,7 @@ if TYPE_CHECKING: else: _import_structure = { 'frcrn': ['FRCRNDecorator'], + 'denoise_net': ['DfsmnAns'], } import sys diff --git a/modelscope/models/audio/ans/complex_nn.py
b/modelscope/models/audio/ans/complex_nn.py index beaa3187..98bfd8b5 100644 --- a/modelscope/models/audio/ans/complex_nn.py +++ b/modelscope/models/audio/ans/complex_nn.py @@ -7,57 +7,8 @@ import torch import torch.nn as nn -import torch.nn.functional as F - -class UniDeepFsmn(nn.Module): - - def __init__(self, input_dim, output_dim, lorder=None, hidden_size=None): - super(UniDeepFsmn, self).__init__() - - self.input_dim = input_dim - self.output_dim = output_dim - - if lorder is None: - return - - self.lorder = lorder - self.hidden_size = hidden_size - - self.linear = nn.Linear(input_dim, hidden_size) - - self.project = nn.Linear(hidden_size, output_dim, bias=False) - - self.conv1 = nn.Conv2d( - output_dim, - output_dim, [lorder, 1], [1, 1], - groups=output_dim, - bias=False) - - def forward(self, input): - r""" - - Args: - input: torch with shape: batch (b) x sequence(T) x feature (h) - - Returns: - batch (b) x channel (c) x sequence(T) x feature (h) - """ - f1 = F.relu(self.linear(input)) - - p1 = self.project(f1) - - x = torch.unsqueeze(p1, 1) - # x: batch (b) x channel (c) x sequence(T) x feature (h) - x_per = x.permute(0, 3, 2, 1) - # x_per: batch (b) x feature (h) x sequence(T) x channel (c) - y = F.pad(x_per, [0, 0, self.lorder - 1, 0]) - - out = x_per + self.conv1(y) - - out1 = out.permute(0, 3, 2, 1) - # out1: batch (b) x channel (c) x sequence(T) x feature (h) - return input + out1.squeeze() +from modelscope.models.audio.ans.layers.uni_deep_fsmn import UniDeepFsmn class ComplexUniDeepFsmn(nn.Module): diff --git a/modelscope/models/audio/ans/denoise_net.py b/modelscope/models/audio/ans/denoise_net.py new file mode 100644 index 00000000..9d20074b --- /dev/null +++ b/modelscope/models/audio/ans/denoise_net.py @@ -0,0 +1,73 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+# Related papers: +# Shengkui Zhao, Trung Hieu Nguyen, Bin Ma, “Monaural Speech Enhancement with Complex Convolutional +# Block Attention Module and Joint Time Frequency Losses”, ICASSP 2021. +# Shiliang Zhang, Ming Lei, Zhijie Yan, Lirong Dai, “Deep-FSMN for Large Vocabulary Continuous Speech +# Recognition “, arXiv:1803.05030, 2018. + +from torch import nn + +from modelscope.metainfo import Models +from modelscope.models import MODELS, TorchModel +from modelscope.models.audio.ans.layers.activations import (RectifiedLinear, + Sigmoid) +from modelscope.models.audio.ans.layers.affine_transform import AffineTransform +from modelscope.models.audio.ans.layers.uni_deep_fsmn import UniDeepFsmn +from modelscope.utils.constant import Tasks + + +@MODELS.register_module( + Tasks.acoustic_noise_suppression, module_name=Models.speech_dfsmn_ans) +class DfsmnAns(TorchModel): + """Denoise model with DFSMN. + + Args: + model_dir (str): the model path. + fsmn_depth (int): the depth of deepfsmn + lorder (int): + """ + + def __init__(self, + model_dir: str, + fsmn_depth=9, + lorder=20, + *args, + **kwargs): + super().__init__(model_dir, *args, **kwargs) + self.lorder = lorder + self.linear1 = AffineTransform(120, 256) + self.relu = RectifiedLinear(256, 256) + repeats = [ + UniDeepFsmn(256, 256, lorder, 256) for i in range(fsmn_depth) + ] + self.deepfsmn = nn.Sequential(*repeats) + self.linear2 = AffineTransform(256, 961) + self.sig = Sigmoid(961, 961) + + def forward(self, input): + """ + Args: + input: fbank feature [batch_size,number_of_frame,feature_dimension] + + Returns: + mask value [batch_size, number_of_frame, FFT_size/2+1] + """ + x1 = self.linear1(input) + x2 = self.relu(x1) + x3 = self.deepfsmn(x2) + x4 = self.linear2(x3) + x5 = self.sig(x4) + return x5 + + def to_kaldi_nnet(self): + re_str = '' + re_str += '\n' + re_str += self.linear1.to_kaldi_nnet() + re_str += self.relu.to_kaldi_nnet() + for dfsmn in self.deepfsmn: + re_str += dfsmn.to_kaldi_nnet() + re_str += 
self.linear2.to_kaldi_nnet() + re_str += self.sig.to_kaldi_nnet() + re_str += '\n' + + return re_str diff --git a/modelscope/models/audio/ans/frcrn.py b/modelscope/models/audio/ans/frcrn.py index 220a14aa..0a83dfae 100644 --- a/modelscope/models/audio/ans/frcrn.py +++ b/modelscope/models/audio/ans/frcrn.py @@ -78,7 +78,7 @@ class FRCRN(nn.Module): win_len=400, win_inc=100, fft_len=512, - win_type='hanning', + win_type='hann', **kwargs): r""" Args: diff --git a/modelscope/models/audio/ans/layers/__init__.py b/modelscope/models/audio/ans/layers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/audio/ans/layers/activations.py b/modelscope/models/audio/ans/layers/activations.py new file mode 100644 index 00000000..406de736 --- /dev/null +++ b/modelscope/models/audio/ans/layers/activations.py @@ -0,0 +1,62 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import torch.nn as nn + +from modelscope.models.audio.ans.layers.layer_base import LayerBase + + +class RectifiedLinear(LayerBase): + + def __init__(self, input_dim, output_dim): + super(RectifiedLinear, self).__init__() + self.dim = input_dim + self.relu = nn.ReLU() + + def forward(self, input): + return self.relu(input) + + def to_kaldi_nnet(self): + re_str = '' + re_str += ' %d %d\n' % (self.dim, self.dim) + return re_str + + def load_kaldi_nnet(self, instr): + return instr + + +class LogSoftmax(LayerBase): + + def __init__(self, input_dim, output_dim): + super(LogSoftmax, self).__init__() + self.dim = input_dim + self.ls = nn.LogSoftmax() + + def forward(self, input): + return self.ls(input) + + def to_kaldi_nnet(self): + re_str = '' + re_str += ' %d %d\n' % (self.dim, self.dim) + return re_str + + def load_kaldi_nnet(self, instr): + return instr + + +class Sigmoid(LayerBase): + + def __init__(self, input_dim, output_dim): + super(Sigmoid, self).__init__() + self.dim = input_dim + self.sig = nn.Sigmoid() + + def forward(self, input): + return self.sig(input) + + def 
to_kaldi_nnet(self): + re_str = '' + re_str += ' %d %d\n' % (self.dim, self.dim) + return re_str + + def load_kaldi_nnet(self, instr): + return instr diff --git a/modelscope/models/audio/ans/layers/affine_transform.py b/modelscope/models/audio/ans/layers/affine_transform.py new file mode 100644 index 00000000..d3cad181 --- /dev/null +++ b/modelscope/models/audio/ans/layers/affine_transform.py @@ -0,0 +1,86 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import torch as th +import torch.nn as nn + +from modelscope.models.audio.ans.layers.layer_base import (LayerBase, + to_kaldi_matrix) +from modelscope.utils.audio.audio_utils import (expect_kaldi_matrix, + expect_token_number) + + +class AffineTransform(LayerBase): + + def __init__(self, input_dim, output_dim): + super(AffineTransform, self).__init__() + self.input_dim = input_dim + self.output_dim = output_dim + + self.linear = nn.Linear(input_dim, output_dim) + + def forward(self, input): + return self.linear(input) + + def to_kaldi_nnet(self): + re_str = '' + + re_str += ' %d %d\n' % (self.output_dim, + self.input_dim) + + re_str += ' 1 1 0\n' + + linear_weights = self.state_dict()['linear.weight'] + + x = linear_weights.squeeze().numpy() + + re_str += to_kaldi_matrix(x) + + linear_bias = self.state_dict()['linear.bias'] + + x = linear_bias.squeeze().numpy() + + re_str += to_kaldi_matrix(x) + + return re_str + + def load_kaldi_nnet(self, instr): + + output = expect_token_number( + instr, + '', + ) + if output is None: + raise Exception('AffineTransform format error') + + instr, lr = output + + output = expect_token_number(instr, '') + if output is None: + raise Exception('AffineTransform format error') + + instr, lr = output + + output = expect_token_number(instr, '') + if output is None: + raise Exception('AffineTransform format error') + + instr, lr = output + + output = expect_kaldi_matrix(instr) + + if output is None: + raise Exception('AffineTransform format error') + + instr, mat = output + + 
self.linear.weight = th.nn.Parameter( + th.from_numpy(mat).type(th.FloatTensor)) + + output = expect_kaldi_matrix(instr) + if output is None: + raise Exception('AffineTransform format error') + + instr, mat = output + self.linear.bias = th.nn.Parameter( + th.from_numpy(mat).type(th.FloatTensor)) + return instr diff --git a/modelscope/models/audio/ans/layers/layer_base.py b/modelscope/models/audio/ans/layers/layer_base.py new file mode 100644 index 00000000..ca713d2f --- /dev/null +++ b/modelscope/models/audio/ans/layers/layer_base.py @@ -0,0 +1,31 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import abc + +import numpy as np +import six +import torch.nn as nn + + +def to_kaldi_matrix(np_mat): + """ function that transform as str numpy mat to standard kaldi str matrix + + Args: + np_mat: numpy mat + """ + np.set_printoptions(threshold=np.inf, linewidth=np.nan) + out_str = str(np_mat) + out_str = out_str.replace('[', '') + out_str = out_str.replace(']', '') + return '[ %s ]\n' % out_str + + +@six.add_metaclass(abc.ABCMeta) +class LayerBase(nn.Module): + + def __init__(self): + super(LayerBase, self).__init__() + + @abc.abstractmethod + def to_kaldi_nnet(self): + pass diff --git a/modelscope/models/audio/ans/layers/uni_deep_fsmn.py b/modelscope/models/audio/ans/layers/uni_deep_fsmn.py new file mode 100644 index 00000000..772e6048 --- /dev/null +++ b/modelscope/models/audio/ans/layers/uni_deep_fsmn.py @@ -0,0 +1,156 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.models.audio.ans.layers.layer_base import (LayerBase, + to_kaldi_matrix) +from modelscope.utils.audio.audio_utils import (expect_kaldi_matrix, + expect_token_number) + + +class UniDeepFsmn(LayerBase): + + def __init__(self, input_dim, output_dim, lorder=1, hidden_size=None): + super(UniDeepFsmn, self).__init__() + + self.input_dim = input_dim + self.output_dim = output_dim + self.lorder = lorder + self.hidden_size = hidden_size + + self.linear = nn.Linear(input_dim, hidden_size) + self.project = nn.Linear(hidden_size, output_dim, bias=False) + self.conv1 = nn.Conv2d( + output_dim, + output_dim, (lorder, 1), (1, 1), + groups=output_dim, + bias=False) + + def forward(self, input): + """ + + Args: + input: torch with shape: batch (b) x sequence(T) x feature (h) + + Returns: + batch (b) x channel (c) x sequence(T) x feature (h) + """ + f1 = F.relu(self.linear(input)) + p1 = self.project(f1) + x = torch.unsqueeze(p1, 1) + # x: batch (b) x channel (c) x sequence(T) x feature (h) + x_per = x.permute(0, 3, 2, 1) + # x_per: batch (b) x feature (h) x sequence(T) x channel (c) + y = F.pad(x_per, [0, 0, self.lorder - 1, 0]) + + out = x_per + self.conv1(y) + out1 = out.permute(0, 3, 2, 1) + # out1: batch (b) x channel (c) x sequence(T) x feature (h) + return input + out1.squeeze() + + def to_kaldi_nnet(self): + re_str = '' + re_str += ' %d %d\n'\ + % (self.output_dim, self.input_dim) + re_str += ' %d %d %d %d 0\n'\ + % (1, self.hidden_size, self.lorder, 1) + + lfiters = self.state_dict()['conv1.weight'] + x = np.flipud(lfiters.squeeze().numpy().T) + re_str += to_kaldi_matrix(x) + proj_weights = self.state_dict()['project.weight'] + x = proj_weights.squeeze().numpy() + re_str += to_kaldi_matrix(x) + linear_weights = self.state_dict()['linear.weight'] + x = linear_weights.squeeze().numpy() + re_str += to_kaldi_matrix(x) + linear_bias = self.state_dict()['linear.bias'] + 
x = linear_bias.squeeze().numpy() + re_str += to_kaldi_matrix(x) + return re_str + + def load_kaldi_nnet(self, instr): + output = expect_token_number( + instr, + '', + ) + if output is None: + raise Exception('UniDeepFsmn format error') + instr, lr = output + + output = expect_token_number( + instr, + '', + ) + if output is None: + raise Exception('UniDeepFsmn format error') + instr, hiddensize = output + self.hidden_size = int(hiddensize) + + output = expect_token_number( + instr, + '', + ) + if output is None: + raise Exception('UniDeepFsmn format error') + instr, lorder = output + self.lorder = int(lorder) + + output = expect_token_number( + instr, + '', + ) + if output is None: + raise Exception('UniDeepFsmn format error') + instr, lstride = output + self.lstride = lstride + + output = expect_token_number( + instr, + '', + ) + if output is None: + raise Exception('UniDeepFsmn format error') + + output = expect_kaldi_matrix(instr) + if output is None: + raise Exception('Fsmn format error') + instr, mat = output + mat1 = np.fliplr(mat.T).copy() + self.conv1 = nn.Conv2d( + self.output_dim, + self.output_dim, (self.lorder, 1), (1, 1), + groups=self.output_dim, + bias=False) + mat_th = torch.from_numpy(mat1).type(torch.FloatTensor) + mat_th = mat_th.unsqueeze(1) + mat_th = mat_th.unsqueeze(3) + self.conv1.weight = torch.nn.Parameter(mat_th) + + output = expect_kaldi_matrix(instr) + if output is None: + raise Exception('UniDeepFsmn format error') + instr, mat = output + + self.project = nn.Linear(self.hidden_size, self.output_dim, bias=False) + self.linear = nn.Linear(self.input_dim, self.hidden_size) + self.project.weight = torch.nn.Parameter( + torch.from_numpy(mat).type(torch.FloatTensor)) + + output = expect_kaldi_matrix(instr) + if output is None: + raise Exception('UniDeepFsmn format error') + instr, mat = output + self.linear.weight = torch.nn.Parameter( + torch.from_numpy(mat).type(torch.FloatTensor)) + + output = expect_kaldi_matrix(instr) + if output is 
None: + raise Exception('UniDeepFsmn format error') + instr, mat = output + self.linear.bias = torch.nn.Parameter( + torch.from_numpy(mat).type(torch.FloatTensor)) + return instr diff --git a/modelscope/pipelines/audio/__init__.py b/modelscope/pipelines/audio/__init__.py index c38c9762..18e8b8b3 100644 --- a/modelscope/pipelines/audio/__init__.py +++ b/modelscope/pipelines/audio/__init__.py @@ -14,6 +14,7 @@ if TYPE_CHECKING: from .speaker_verification_pipeline import SpeakerVerificationPipeline else: _import_structure = { + 'ans_dfsmn_pipeline': ['ANSDFSMNPipeline'], 'ans_pipeline': ['ANSPipeline'], 'asr_inference_pipeline': ['AutomaticSpeechRecognitionPipeline'], 'kws_farfield_pipeline': ['KWSFarfieldPipeline'], diff --git a/modelscope/pipelines/audio/ans_dfsmn_pipeline.py b/modelscope/pipelines/audio/ans_dfsmn_pipeline.py new file mode 100644 index 00000000..fad77091 --- /dev/null +++ b/modelscope/pipelines/audio/ans_dfsmn_pipeline.py @@ -0,0 +1,187 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import collections +import io +import os +import sys +from typing import Any, Dict + +import librosa +import numpy as np +import soundfile as sf +import torch + +from modelscope.fileio import File +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import ModelFile, Tasks + +HOP_LENGTH = 960 +N_FFT = 1920 +WINDOW_NAME_HAM = 'hamming' +STFT_WIN_LEN = 1920 +WINLEN = 3840 +STRIDE = 1920 + + +@PIPELINES.register_module( + Tasks.acoustic_noise_suppression, + module_name=Pipelines.speech_dfsmn_ans_psm_48k_causal) +class ANSDFSMNPipeline(Pipeline): + """ANS (Acoustic Noise Suppression) inference pipeline based on DFSMN model. + + Args: + stream_mode: set its work mode, default False + In stream model, it accepts bytes as pipeline input that should be the audio data in PCM format. 
+ In normal model, it accepts str and treat it as the path of local wav file or the http link of remote wav file. + """ + SAMPLE_RATE = 48000 + + def __init__(self, model, **kwargs): + super().__init__(model=model, **kwargs) + model_bin_file = os.path.join(self.model.model_dir, + ModelFile.TORCH_MODEL_BIN_FILE) + if os.path.exists(model_bin_file): + checkpoint = torch.load(model_bin_file, map_location=self.device) + self.model.load_state_dict(checkpoint) + self.model.eval() + self.stream_mode = kwargs.get('stream_mode', False) + if self.stream_mode: + # the unit of WINLEN and STRIDE is frame, 1 frame of 16bit = 2 bytes + byte_buffer_length = \ + (WINLEN + STRIDE * (self.model.lorder - 1)) * 2 + self.buffer = collections.deque(maxlen=byte_buffer_length) + # padding head + for i in range(STRIDE * 2): + self.buffer.append(b'\0') + # it processes WINLEN frames at the first time, then STRIDE frames + self.byte_length_remain = (STRIDE * 2 - WINLEN) * 2 + self.first_forward = True + self.tensor_give_up_length = (WINLEN - STRIDE) // 2 + + window = torch.hamming_window( + STFT_WIN_LEN, periodic=False, device=self.device) + + def stft(x): + return torch.stft( + x, + N_FFT, + HOP_LENGTH, + STFT_WIN_LEN, + center=False, + window=window) + + def istft(x, slen): + return librosa.istft( + x, + hop_length=HOP_LENGTH, + win_length=STFT_WIN_LEN, + window=WINDOW_NAME_HAM, + center=False, + length=slen) + + self.stft = stft + self.istft = istft + + def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]: + if self.stream_mode: + if not isinstance(inputs, bytes): + raise TypeError('Only support bytes in stream mode.') + if len(inputs) > self.buffer.maxlen: + raise ValueError( + f'inputs length too large: {len(inputs)} > {self.buffer.maxlen}' + ) + tensor_list = [] + current_index = 0 + while self.byte_length_remain + len( + inputs) - current_index >= STRIDE * 2: + byte_length_to_add = STRIDE * 2 - self.byte_length_remain + for i in range(current_index, + 
current_index + byte_length_to_add): + self.buffer.append(inputs[i].to_bytes( + 1, byteorder=sys.byteorder, signed=False)) + bytes_io = io.BytesIO() + for b in self.buffer: + bytes_io.write(b) + data = np.frombuffer(bytes_io.getbuffer(), dtype=np.int16) + data_tensor = torch.from_numpy(data).type(torch.FloatTensor) + tensor_list.append(data_tensor) + self.byte_length_remain = 0 + current_index += byte_length_to_add + for i in range(current_index, len(inputs)): + self.buffer.append(inputs[i].to_bytes( + 1, byteorder=sys.byteorder, signed=False)) + self.byte_length_remain += 1 + return {'audio': tensor_list} + else: + if isinstance(inputs, str): + data_bytes = File.read(inputs) + elif isinstance(inputs, bytes): + data_bytes = inputs + else: + raise TypeError(f'Unsupported type {type(inputs)}.') + data_tensor = self.bytes2tensor(data_bytes) + return {'audio': data_tensor} + + def bytes2tensor(self, file_bytes): + data1, fs = sf.read(io.BytesIO(file_bytes)) + data1 = data1.astype(np.float32) + if len(data1.shape) > 1: + data1 = data1[:, 0] + if fs != self.SAMPLE_RATE: + data1 = librosa.resample(data1, fs, self.SAMPLE_RATE) + data = data1 * 32768 + data_tensor = torch.from_numpy(data).type(torch.FloatTensor) + return data_tensor + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + if self.stream_mode: + bytes_io = io.BytesIO() + for origin_audio in inputs['audio']: + masked_sig = self._forward(origin_audio) + if self.first_forward: + masked_sig = masked_sig[:-self.tensor_give_up_length] + self.first_forward = False + else: + masked_sig = masked_sig[-WINLEN:] + masked_sig = masked_sig[self.tensor_give_up_length:-self. 
+ tensor_give_up_length] + bytes_io.write(masked_sig.astype(np.int16).tobytes()) + outputs = bytes_io.getvalue() + else: + origin_audio = inputs['audio'] + masked_sig = self._forward(origin_audio) + outputs = masked_sig.astype(np.int16).tobytes() + return {OutputKeys.OUTPUT_PCM: outputs} + + def _forward(self, origin_audio): + with torch.no_grad(): + audio_in = origin_audio.unsqueeze(0) + import torchaudio + fbanks = torchaudio.compliance.kaldi.fbank( + audio_in, + dither=1.0, + frame_length=40.0, + frame_shift=20.0, + num_mel_bins=120, + sample_frequency=self.SAMPLE_RATE, + window_type=WINDOW_NAME_HAM) + fbanks = fbanks.unsqueeze(0) + masks = self.model(fbanks) + spectrum = self.stft(origin_audio) + masks = masks.permute(2, 1, 0) + masked_spec = (spectrum * masks).cpu() + masked_spec = masked_spec.detach().numpy() + masked_spec_complex = masked_spec[:, :, 0] + 1j * masked_spec[:, :, 1] + masked_sig = self.istft(masked_spec_complex, len(origin_audio)) + return masked_sig + + def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: + if not self.stream_mode and 'output_path' in kwargs.keys(): + sf.write( + kwargs['output_path'], + np.frombuffer(inputs[OutputKeys.OUTPUT_PCM], dtype=np.int16), + self.SAMPLE_RATE) + return inputs diff --git a/modelscope/pipelines/audio/ans_pipeline.py b/modelscope/pipelines/audio/ans_pipeline.py index c12c9817..3719689c 100644 --- a/modelscope/pipelines/audio/ans_pipeline.py +++ b/modelscope/pipelines/audio/ans_pipeline.py @@ -36,8 +36,11 @@ class ANSPipeline(Pipeline): """ super().__init__(model=model, **kwargs) self.model.eval() + self.stream_mode = kwargs.get('stream_mode', False) def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]: + if self.stream_mode: + raise TypeError('This model does not support stream mode!') if isinstance(inputs, bytes): data1, fs = sf.read(io.BytesIO(inputs)) elif isinstance(inputs, str): diff --git a/modelscope/utils/audio/audio_utils.py 
b/modelscope/utils/audio/audio_utils.py index 9be97016..1e440882 100644 --- a/modelscope/utils/audio/audio_utils.py +++ b/modelscope/utils/audio/audio_utils.py @@ -105,6 +105,28 @@ def extract_pcm_from_wav(wav: bytes) -> bytes: return data, sample_rate +def expect_token_number(instr, token): + first_token = re.match(r'^\s*' + token, instr) + if first_token is None: + return None + instr = instr[first_token.end():] + lr = re.match(r'^\s*(-?\d+\.?\d*e?-?\d*?)', instr) + if lr is None: + return None + return instr[lr.end():], lr.groups()[0] + + +def expect_kaldi_matrix(instr): + pos2 = instr.find('[', 0) + pos3 = instr.find(']', pos2) + mat = [] + for stt in instr[pos2 + 1:pos3].split('\n'): + tmp_mat = np.fromstring(stt, dtype=np.float32, sep=' ') + if tmp_mat.size > 0: + mat.append(tmp_mat) + return instr[pos3 + 1:], np.array(mat) + + # This implementation is adopted from scipy.io.wavfile.write, # made publicly available under the BSD-3-Clause license at # https://github.com/scipy/scipy/blob/v1.9.3/scipy/io/wavfile.py diff --git a/tests/pipelines/test_speech_signal_process.py b/tests/pipelines/test_speech_signal_process.py index 2916d31a..2c26cee6 100644 --- a/tests/pipelines/test_speech_signal_process.py +++ b/tests/pipelines/test_speech_signal_process.py @@ -4,6 +4,7 @@ import os.path import unittest from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck @@ -17,6 +18,8 @@ FAREND_SPEECH_URL = 'https://modelscope.oss-cn-beijing.aliyuncs.com/' \ 'test/audios/farend_speech.wav' NOISE_SPEECH_FILE = 'data/test/audios/speech_with_noise.wav' +NOISE_SPEECH_FILE_48K = 'data/test/audios/speech_with_noise_48k.wav' +NOISE_SPEECH_FILE_48K_PCM = 'data/test/audios/speech_with_noise_48k.pcm' NOISE_SPEECH_URL = 'https://modelscope.oss-cn-beijing.aliyuncs.com/' \ 'test/audios/speech_with_noise.wav' @@ -83,7
+86,7 @@ class SpeechSignalProcessTest(unittest.TestCase, DemoCompatibilityCheck): print(f'Processed audio saved to {output_path}') @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') - def test_ans(self): + def test_frcrn_ans(self): model_id = 'damo/speech_frcrn_ans_cirm_16k' ans = pipeline(Tasks.acoustic_noise_suppression, model=model_id) output_path = os.path.abspath('output.wav') @@ -112,6 +115,41 @@ class SpeechSignalProcessTest(unittest.TestCase, DemoCompatibilityCheck): ans(data, output_path=output_path) print(f'Processed audio saved to {output_path}') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_dfsmn_ans(self): + model_id = 'damo/speech_dfsmn_ans_psm_48k_causal' + ans = pipeline(Tasks.acoustic_noise_suppression, model=model_id) + output_path = os.path.abspath('output.wav') + ans(os.path.join(os.getcwd(), NOISE_SPEECH_FILE_48K), + output_path=output_path) + print(f'Processed audio saved to {output_path}') + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_dfsmn_ans_bytes(self): + model_id = 'damo/speech_dfsmn_ans_psm_48k_causal' + ans = pipeline(Tasks.acoustic_noise_suppression, model=model_id) + output_path = os.path.abspath('output.wav') + with open(os.path.join(os.getcwd(), NOISE_SPEECH_FILE_48K), 'rb') as f: + data = f.read() + ans(data, output_path=output_path) + print(f'Processed audio saved to {output_path}') + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_dfsmn_ans_stream(self): + model_id = 'damo/speech_dfsmn_ans_psm_48k_causal' + ans = pipeline( + Tasks.acoustic_noise_suppression, model=model_id, stream_mode=True) + with open(os.path.join(os.getcwd(), NOISE_SPEECH_FILE_48K_PCM), + 'rb') as f: + block_size = 3840 + audio = f.read(block_size) + with open('output.pcm', 'wb') as w: + while len(audio) >= block_size: + result = ans(audio) + pcm = result[OutputKeys.OUTPUT_PCM] + w.write(pcm) + audio = 
f.read(block_size) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') def test_demo_compatibility(self): self.compatibility_check()