mirror of
https://github.com/modelscope/modelscope.git
synced 2025-12-25 12:39:25 +01:00
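Adapt the OFA ASR preprocessor to different WAV inputs: load audio with librosa (resampled to 16 kHz, mono) and pass an explicit target_sample_rate to prepare_fbank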
Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10886461
This commit is contained in:
@@ -5,6 +5,7 @@ import random
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict
|
||||
|
||||
import librosa
|
||||
import soundfile as sf
|
||||
import torch
|
||||
from fairseq.data.audio.feature_transforms import \
|
||||
@@ -54,9 +55,13 @@ class OfaASRPreprocessor(OfaBasePreprocessor):
|
||||
|
||||
def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
speed = random.choice([0.9, 1.0, 1.1])
|
||||
wav, sr = sf.read(self.column_map['wav'])
|
||||
wav, sr = librosa.load(data[self.column_map['wav']], 16000, mono=True)
|
||||
fbank = self.prepare_fbank(
|
||||
torch.tensor([wav], dtype=torch.float32), sr, speed, is_train=True)
|
||||
torch.tensor([wav], dtype=torch.float32),
|
||||
sr,
|
||||
speed,
|
||||
target_sample_rate=16000,
|
||||
is_train=True)
|
||||
fbank_mask = torch.tensor([True])
|
||||
sample = {
|
||||
'fbank': fbank,
|
||||
@@ -86,11 +91,12 @@ class OfaASRPreprocessor(OfaBasePreprocessor):
|
||||
|
||||
def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
speed = 1.0
|
||||
wav, sr = sf.read(data[self.column_map['wav']])
|
||||
wav, sr = librosa.load(data[self.column_map['wav']], 16000, mono=True)
|
||||
fbank = self.prepare_fbank(
|
||||
torch.tensor([wav], dtype=torch.float32),
|
||||
sr,
|
||||
speed,
|
||||
target_sample_rate=16000,
|
||||
is_train=False)
|
||||
fbank_mask = torch.tensor([True])
|
||||
|
||||
|
||||
@@ -170,10 +170,15 @@ class OfaBasePreprocessor:
|
||||
else load_image(path_or_url_or_pil)
|
||||
return image
|
||||
|
||||
def prepare_fbank(self, waveform, sample_rate, speed, is_train):
|
||||
waveform, _ = torchaudio.sox_effects.apply_effects_tensor(
|
||||
def prepare_fbank(self,
|
||||
waveform,
|
||||
sample_rate,
|
||||
speed,
|
||||
target_sample_rate=16000,
|
||||
is_train=False):
|
||||
waveform, sample_rate = torchaudio.sox_effects.apply_effects_tensor(
|
||||
waveform, sample_rate,
|
||||
[['speed', str(speed)], ['rate', str(sample_rate)]])
|
||||
[['speed', str(speed)], ['rate', str(target_sample_rate)]])
|
||||
_waveform, _ = convert_waveform(
|
||||
waveform, sample_rate, to_mono=True, normalize_volume=True)
|
||||
# Kaldi compliance: 16-bit signed integers
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
ftfy>=6.0.3
|
||||
librosa
|
||||
ofa>=0.0.2
|
||||
pycocoevalcap>=1.2
|
||||
pycocotools>=2.0.4
|
||||
|
||||
Reference in New Issue
Block a user