From 181bceea241a356f71ab783bced88d079ac1c261 Mon Sep 17 00:00:00 2001
From: simpleoier
Date: Tue, 11 Apr 2023 08:06:42 -0400
Subject: [PATCH] add enh / ss

---
 audio-chatgpt.py | 80 ++++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |  2 ++
 2 files changed, 82 insertions(+)

diff --git a/audio-chatgpt.py b/audio-chatgpt.py
index 619e056..f132cfe 100644
--- a/audio-chatgpt.py
+++ b/audio-chatgpt.py
@@ -821,6 +821,86 @@ class TargetSoundDetection:
         #print(ans)
         return ans
 
+class Speech_Enh_SS_SC:
+    """Speech Enhancement or Separation in single-channel
+    Example usage:
+        enh_model = Speech_Enh_SS_SC("cuda")
+        enh_wav = enh_model.inference("./test_chime4_audio_M05_440C0213_PED_REAL.wav")
+    """
+    def __init__(self, device="cuda", model_name="lichenda/chime4_fasnet_dprnn_tac"):
+        self.model_name = model_name
+        self.device = device
+        print("Initializing ESPnet Enh to %s" % device)
+        self._initialize_model()
+
+    def _initialize_model(self):
+        from espnet_model_zoo.downloader import ModelDownloader
+        from espnet2.bin.enh_inference import SeparateSpeech
+
+        d = ModelDownloader()
+
+        cfg = d.download_and_unpack(self.model_name)
+        self.separate_speech = SeparateSpeech(
+            train_config=cfg["train_config"],
+            model_file=cfg["model_file"],
+            # for segment-wise processing of long speech
+            segment_size=2.4,
+            hop_size=0.8,
+            normalize_segment_scale=False,
+            show_progressbar=True,
+            ref_channel=None,
+            normalize_output_wav=True,
+            device=self.device,
+        )
+
+    def inference(self, speech_path, ref_channel=0):
+        speech, sr = soundfile.read(speech_path)
+        speech = speech[:, ref_channel] if speech.ndim > 1 else speech
+        assert speech.ndim == 1
+
+        enh_speech = self.separate_speech(speech[None, :], fs=sr)
+        if len(enh_speech) == 1:
+            return enh_speech[0]
+        return enh_speech
+
+class Speech_Enh_SS_MC:
+    """Speech Enhancement or Separation in multi-channel; model_name must be supplied"""
+    def __init__(self, device="cuda", model_name=None, ref_channel=4):
+        self.model_name = model_name
+        self.ref_channel = ref_channel
+        self.device = device
+        print("Initializing ESPnet Enh to %s" % device)
+        self._initialize_model()
+
+    def _initialize_model(self):
+        from espnet_model_zoo.downloader import ModelDownloader
+        from espnet2.bin.enh_inference import SeparateSpeech
+
+        d = ModelDownloader()
+
+        cfg = d.download_and_unpack(self.model_name)
+        self.separate_speech = SeparateSpeech(
+            train_config=cfg["train_config"],
+            model_file=cfg["model_file"],
+            # for segment-wise processing of long speech
+            segment_size=2.4,
+            hop_size=0.8,
+            normalize_segment_scale=False,
+            show_progressbar=True,
+            ref_channel=self.ref_channel,
+            normalize_output_wav=True,
+            device=self.device,
+        )
+
+    def inference(self, speech_path):
+        speech, sr = soundfile.read(speech_path)
+        speech = speech.T
+
+        enh_speech = self.separate_speech(speech[None, ...], fs=sr)
+        if len(enh_speech) == 1:
+            return enh_speech[0]
+        return enh_speech
+
 class ConversationBot:
     def __init__(self):
         print("Initializing AudioGPT")
diff --git a/requirements.txt b/requirements.txt
index d032daf..f884d53 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,6 +8,8 @@ beautifulsoup4==4.10.0
 Cython==0.29.24
 diffusers
 einops==0.3.0
+espnet
+espnet_model_zoo
 g2p-en==2.1.0
 google==3.0.0
 gradio
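
A minimal usage sketch for the two classes added above (not part of the patch).
The single-channel call uses the patch's default model; the multi-channel model
identifier, both input paths, and the 16 kHz output rate are assumptions, since
Speech_Enh_SS_MC leaves model_name as None and expects the caller to supply one:

    import soundfile

    # Single-channel enhancement with the default "lichenda/chime4_fasnet_dprnn_tac"
    # model; inference() returns an array with a leading batch dimension of 1,
    # hence the squeeze() before writing.
    sc_model = Speech_Enh_SS_SC(device="cuda")
    enh_wav = sc_model.inference("./test_chime4_audio_M05_440C0213_PED_REAL.wav")
    soundfile.write("enhanced_sc.wav", enh_wav.squeeze(), 16000)  # CHiME-4 audio is 16 kHz

    # Multi-channel enhancement; "someuser/some_mc_enh_model" is a hypothetical
    # espnet_model_zoo identifier -- substitute a real multi-channel model.
    mc_model = Speech_Enh_SS_MC(device="cuda", model_name="someuser/some_mc_enh_model", ref_channel=4)
    enh_wav = mc_model.inference("./some_multichannel_recording.wav")
    soundfile.write("enhanced_mc.wav", enh_wav.squeeze(), 16000)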