add enh / ss

2025-12-16 11:57:58 +01:00 · 2023-04-11 08:06:42 -04:00
parent e2b06d3c79
commit 181bceea24
2 changed files with 82 additions and 0 deletions
--- a/audio-chatgpt.py
+++ b/audio-chatgpt.py
@@ -821,6 +821,86 @@ class TargetSoundDetection:
        #print(ans)
        return ans
 class Speech_Enh_SS_SC:
     """Speech enhancement or separation for single-channel audio.

     Wraps an ESPnet ``SeparateSpeech`` model that is downloaded through
     ``espnet_model_zoo`` when the object is constructed.

     Example usage:
         enh_model = Speech_Enh_SS_SC("cuda")
         enh_wav = enh_model.inference("./test_chime4_audio_M05_440C0213_PED_REAL.wav")
     """

     def __init__(self, device="cuda", model_name="lichenda/chime4_fasnet_dprnn_tac"):
         """Store the configuration and load the model.

         Args:
             device: Device string forwarded to ``SeparateSpeech``.
             model_name: Model identifier passed to
                 ``ModelDownloader.download_and_unpack``.
         """
         self.model_name = model_name
         self.device = device
         print("Initializing ESPnet Enh to %s" % device)
         self._initialize_model()

     def _initialize_model(self):
         """Download ``self.model_name`` and build ``self.separate_speech``."""
         # Imported lazily so the ESPnet packages are only required when this
         # class is actually instantiated.
         from espnet_model_zoo.downloader import ModelDownloader
         from espnet2.bin.enh_inference import SeparateSpeech

         d = ModelDownloader()
         cfg = d.download_and_unpack(self.model_name)
         self.separate_speech = SeparateSpeech(
             train_config=cfg["train_config"],
             model_file=cfg["model_file"],
             # for segment-wise process on long speech
             segment_size=2.4,
             hop_size=0.8,
             normalize_segment_scale=False,
             show_progressbar=True,
             ref_channel=None,
             normalize_output_wav=True,
             device=self.device,
         )

     def inference(self, speech_path, ref_channel=0):
         """Enhance or separate the audio file at ``speech_path``.

         Args:
             speech_path: Path of the audio file, read with ``soundfile.read``.
             ref_channel: Channel index used when the file holds more than
                 one channel. Ignored for mono files.

         Returns:
             The single output when the model yields exactly one, otherwise
             the whole output sequence returned by ``SeparateSpeech``.

         Raises:
             ValueError: If the selected signal is not one-dimensional.
         """
         speech, sr = soundfile.read(speech_path)
         # soundfile returns a 1-D array for mono files and a 2-D
         # (frames, channels) array otherwise; only the latter needs a
         # channel to be picked.
         if speech.ndim > 1:
             speech = speech[:, ref_channel]
         # The loaded data is a NumPy array, so the rank is ``ndim``
         # (``dim()`` exists only on torch tensors).
         if speech.ndim != 1:
             raise ValueError(
                 "expected a single-channel signal, got an array with %d dimensions"
                 % speech.ndim
             )
         # Add a leading batch axis before handing the signal to the model.
         enh_speech = self.separate_speech(speech[None, :], fs=sr)
         if len(enh_speech) == 1:
             return enh_speech[0]
         return enh_speech
 class Speech_Enh_SS_MC:
     """Multi-channel speech enhancement / separation backed by ESPnet.

     The model is downloaded via ``espnet_model_zoo`` and wrapped in an
     ESPnet ``SeparateSpeech`` object during construction.
     """

     def __init__(self, device="cuda", model_name=None, ref_channel=4):
         """Remember the settings, announce the device and load the model.

         Args:
             device: Device string forwarded to ``SeparateSpeech``.
             model_name: Identifier handed to
                 ``ModelDownloader.download_and_unpack``.
                 NOTE(review): the default is ``None``; presumably callers
                 are expected to supply a real model name - confirm.
             ref_channel: Reference channel forwarded to ``SeparateSpeech``.
         """
         self.model_name = model_name
         self.ref_channel = ref_channel
         self.device = device
         print("Initializing ESPnet Enh to %s" % device)
         self._initialize_model()

     def _initialize_model(self):
         """Fetch the model files and create ``self.separate_speech``."""
         from espnet_model_zoo.downloader import ModelDownloader
         from espnet2.bin.enh_inference import SeparateSpeech

         model_files = ModelDownloader().download_and_unpack(self.model_name)
         separator_options = {
             "train_config": model_files["train_config"],
             "model_file": model_files["model_file"],
             # Segment-wise processing settings for long recordings.
             "segment_size": 2.4,
             "hop_size": 0.8,
             "normalize_segment_scale": False,
             "show_progressbar": True,
             "ref_channel": self.ref_channel,
             "normalize_output_wav": True,
             "device": self.device,
         }
         self.separate_speech = SeparateSpeech(**separator_options)

     def inference(self, speech_path):
         """Run the model on the audio file at ``speech_path``.

         Returns:
             The single output when the model yields exactly one, otherwise
             the whole output sequence returned by ``SeparateSpeech``.
         """
         waveform, sample_rate = soundfile.read(speech_path)
         # Transpose the loaded array, then prepend a batch axis.
         # NOTE(review): assuming soundfile yields (frames, channels), the
         # model receives (1, channels, frames) here - confirm this is the
         # layout SeparateSpeech expects for multi-channel input.
         batched = waveform.T[None, ...]
         outputs = self.separate_speech(batched, fs=sample_rate)
         return outputs[0] if len(outputs) == 1 else outputs
 class ConversationBot:
    def __init__(self):
        print("Initializing AudioGPT")
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,6 +8,8 @@ beautifulsoup4==4.10.0
 Cython==0.29.24
 diffusers
 einops==0.3.0
 espnet
 espnet_model_zoo
 g2p-en==2.1.0
 google==3.0.0
 gradio