From b386a4ee501218f23cab703a6f79daac160f28cd Mon Sep 17 00:00:00 2001
From: "shiyi.zxh"
Date: Mon, 28 Nov 2022 17:48:10 +0800
Subject: [PATCH] adapt to different wav input

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10886461
---
 modelscope/preprocessors/ofa/asr.py  | 12 +++++++++---
 modelscope/preprocessors/ofa/base.py | 11 ++++++++---
 requirements/multi-modal.txt         |  1 +
 3 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/modelscope/preprocessors/ofa/asr.py b/modelscope/preprocessors/ofa/asr.py
index 928698c6..d74c2550 100644
--- a/modelscope/preprocessors/ofa/asr.py
+++ b/modelscope/preprocessors/ofa/asr.py
@@ -5,6 +5,7 @@ import random
 from pathlib import Path
 from typing import Any, Dict
 
+import librosa
 import soundfile as sf
 import torch
 from fairseq.data.audio.feature_transforms import \
@@ -54,9 +55,13 @@ class OfaASRPreprocessor(OfaBasePreprocessor):
 
     def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]:
         speed = random.choice([0.9, 1.0, 1.1])
-        wav, sr = sf.read(self.column_map['wav'])
+        wav, sr = librosa.load(data[self.column_map['wav']], 16000, mono=True)
         fbank = self.prepare_fbank(
-            torch.tensor([wav], dtype=torch.float32), sr, speed, is_train=True)
+            torch.tensor([wav], dtype=torch.float32),
+            sr,
+            speed,
+            target_sample_rate=16000,
+            is_train=True)
         fbank_mask = torch.tensor([True])
         sample = {
             'fbank': fbank,
@@ -86,11 +91,12 @@ class OfaASRPreprocessor(OfaBasePreprocessor):
 
     def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]:
         speed = 1.0
-        wav, sr = sf.read(data[self.column_map['wav']])
+        wav, sr = librosa.load(data[self.column_map['wav']], 16000, mono=True)
         fbank = self.prepare_fbank(
             torch.tensor([wav], dtype=torch.float32),
             sr,
             speed,
+            target_sample_rate=16000,
             is_train=False)
         fbank_mask = torch.tensor([True])
 
diff --git a/modelscope/preprocessors/ofa/base.py b/modelscope/preprocessors/ofa/base.py
index 64bec9c9..8f18fe7a 100644
--- a/modelscope/preprocessors/ofa/base.py
+++ b/modelscope/preprocessors/ofa/base.py
@@ -170,10 +170,15 @@ class OfaBasePreprocessor:
             else load_image(path_or_url_or_pil)
         return image
 
-    def prepare_fbank(self, waveform, sample_rate, speed, is_train):
-        waveform, _ = torchaudio.sox_effects.apply_effects_tensor(
+    def prepare_fbank(self,
+                      waveform,
+                      sample_rate,
+                      speed,
+                      target_sample_rate=16000,
+                      is_train=False):
+        waveform, sample_rate = torchaudio.sox_effects.apply_effects_tensor(
             waveform, sample_rate,
-            [['speed', str(speed)], ['rate', str(sample_rate)]])
+            [['speed', str(speed)], ['rate', str(target_sample_rate)]])
         _waveform, _ = convert_waveform(
             waveform, sample_rate, to_mono=True, normalize_volume=True)
         # Kaldi compliance: 16-bit signed integers
diff --git a/requirements/multi-modal.txt b/requirements/multi-modal.txt
index 54049c56..9c144a99 100644
--- a/requirements/multi-modal.txt
+++ b/requirements/multi-modal.txt
@@ -1,4 +1,5 @@
 ftfy>=6.0.3
+librosa
 ofa>=0.0.2
 pycocoevalcap>=1.2
 pycocotools>=2.0.4
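
Note: below is a minimal standalone sketch of the decode-and-resample path
this patch introduces, kept outside the diff. The 16 kHz target rate and the
speed/rate sox effect chain come from the patch; the helper name
load_and_fbank, the file example.wav, and the 80 mel bins are illustrative
only, and the real prepare_fbank additionally downmixes and volume-normalizes
through fairseq's convert_waveform before the Kaldi fbank step.

    import librosa
    import torch
    import torchaudio

    TARGET_SR = 16000  # assumed target rate, matching the patch default

    def load_and_fbank(path, speed=1.0):
        # librosa decodes and resamples any input wav to 16 kHz mono,
        # regardless of the file's native sample rate or channel count.
        wav, sr = librosa.load(path, sr=TARGET_SR, mono=True)
        waveform = torch.tensor([wav], dtype=torch.float32)
        # 'speed' changes the effective sample rate, so a trailing 'rate'
        # effect resamples back to the target, the same chain prepare_fbank
        # applies for speed perturbation.
        waveform, sr = torchaudio.sox_effects.apply_effects_tensor(
            waveform, sr, [['speed', str(speed)], ['rate', str(TARGET_SR)]])
        # Kaldi-compliant fbank expects samples scaled like 16-bit integers,
        # per the "Kaldi compliance" comment in prepare_fbank.
        return torchaudio.compliance.kaldi.fbank(
            waveform * (2**15), num_mel_bins=80, sample_frequency=sr)

    fbank = load_and_fbank('example.wav', speed=1.1)  # shape: (num_frames, 80)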