update

2025-12-16 11:57:58 +01:00 · 2023-04-13 15:52:37 +08:00
parent 70d54b5c9b
commit 7c6f83a889
1 changed files with 87 additions and 24 deletions
--- a/audio-chatgpt.py
+++ b/audio-chatgpt.py
@@ -4,8 +4,6 @@ sys.path.append(os.path.dirname(os.path.realpath(__file__)))
 sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
 sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'NeuralSeq'))
 sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio'))
-sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio_img'))
-sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio_inpaint'))
 sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'audio_detection'))
 sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'mono2binaural'))
 import gradio as gr
@@ -57,23 +55,18 @@ AudioGPT can not directly read audios, but it has a list of tools to finish diff
 AudioGPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the audio content and audio file name. It will remember to provide the file name from the last tool observation, if a new audio is generated.
 Human may provide new audios to AudioGPT with a description. The description helps AudioGPT to understand this audio, but AudioGPT should use tools to finish following tasks, rather than directly imagine from the description.
 Overall, AudioGPT is a powerful audio dialogue assistant tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics. 
-
 TOOLS:
 ------
-
 AudioGPT has access to the following tools:"""

 AUDIO_CHATGPT_FORMAT_INSTRUCTIONS = """To use a tool, please use the following format:
-
 ```
 Thought: Do I need to use a tool? Yes
 Action: the action to take, should be one of [{tool_names}]
 Action Input: the input to the action
 Observation: the result of the action
 ```
-
 When you have a response to say to the Human, or if you do not need to use a tool, you MUST use the format:
-
 ```
 Thought: Do I need to use a tool? No
 {ai_prefix}: [your response here]
@@ -82,9 +75,7 @@ Thought: Do I need to use a tool? No

 AUDIO_CHATGPT_SUFFIX = """You are very strict to the filename correctness and will never fake a file name if not exists.
 You will remember to provide the audio file name loyally if it's provided in the last tool observation.
-
 Begin!
-
 Previous conversation history:
 {chat_history}
 New input: {input}
@@ -864,7 +855,7 @@ class Speech_Enh_SS_SC:
        enh_model = Speech_Enh_SS("cuda")
        enh_wav = enh_model.inference("./test_chime4_audio_M05_440C0213_PED_REAL.wav")
    """
-    def __init__(self, device="cuda", model_name="lichenda/chime4_fasnet_dprnn_tac"):
+    def __init__(self, device="cuda", model_name="espnet/Wangyou_Zhang_chime4_enh_train_enh_conv_tasnet_raw"):
        self.model_name = model_name
        self.device = device
        print("Initializing ESPnet Enh to %s" % device)
@@ -893,16 +884,68 @@ class Speech_Enh_SS_SC:
    def inference(self, speech_path, ref_channel=0):
        speech, sr = soundfile.read(speech_path)
        speech = speech[:, ref_channel]
-        assert speech.dim() == 1
+        # speech now holds only channel ref_channel. NOTE(review): the slice above
+        # presumably needs a 2-D (frames, channels) array - confirm mono files are handled.
+        enh_speech = self.separate_speech(speech[None, ...], fs=sr)
+        audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
+        # Only the first returned signal is written; squeeze() drops its length-1 axes.
+        soundfile.write(audio_filename, enh_speech[0].squeeze(), samplerate=sr)
+        # Any further entries of enh_speech are ignored. The commented-out branch below
+        # is a disabled two-output path that would write two signals and merge them:
+        # else: 
+        #     print("############")
+        #     audio_filename_1 = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
+        #     soundfile.write(audio_filename_1, enh_speech[0].squeeze(), samplerate=sr)
+        #     audio_filename_2 = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
+        #     soundfile.write(audio_filename_2, enh_speech[1].squeeze(), samplerate=sr)
+        #     audio_filename = merge_audio(audio_filename_1, audio_filename_2)
+        return audio_filename

-        enh_speech = self.separate_speech(speech[None, ], fs=sr)
+class Speech_SS:  # speech separation tool wrapping an ESPnet SeparateSpeech model
+    def __init__(self, device="cuda", model_name="lichenda/wsj0_2mix_skim_noncausal"):
+        self.model_name = model_name
+        self.device = device
+        print("Initializing ESPnet SS to %s" % device)
+        self._initialize_model()
+
+    def _initialize_model(self):  # fetch self.model_name and build self.separate_speech
+        from espnet_model_zoo.downloader import ModelDownloader
+        from espnet2.bin.enh_inference import SeparateSpeech
+
+        d = ModelDownloader()
+
+        cfg = d.download_and_unpack(self.model_name)
+        self.separate_speech = SeparateSpeech(
+            train_config=cfg["train_config"],
+            model_file=cfg["model_file"],
+            # segment-wise processing for long speech
+            segment_size=2.4,
+            hop_size=0.8,
+            normalize_segment_scale=False,
+            show_progressbar=True,
+            ref_channel=None,
+            normalize_output_wav=True,
+            device=self.device,
+        )
+
+    def inference(self, speech_path):  # separate the file at speech_path; return the output wav path
+        speech, sr = soundfile.read(speech_path)
+        enh_speech = self.separate_speech(speech[None, ...], fs=sr)
+        audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
        if len(enh_speech) == 1:
-            return enh_speech[0]
-        return enh_speech
+            soundfile.write(audio_filename, enh_speech[0].squeeze(), samplerate=sr)
+        else:
+            # Otherwise write the first two outputs to separate files and combine them with merge_audio.
+            audio_filename_1 = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
+            soundfile.write(audio_filename_1, enh_speech[0].squeeze(), samplerate=sr)
+            audio_filename_2 = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
+            soundfile.write(audio_filename_2, enh_speech[1].squeeze(), samplerate=sr)
+            audio_filename = merge_audio(audio_filename_1, audio_filename_2)
+        return audio_filename

 class Speech_Enh_SS_MC:
    """Speech Enhancement or Separation in multi-channel"""
-    def __init__(self, device="cuda", model_name=None, ref_channel=4):
+    def __init__(self, device="cuda", model_name="espnet/Wangyou_Zhang_chime4_enh_train_enh_dc_crn_mapping_snr_raw", ref_channel=4):
        self.model_name = model_name
        self.ref_channel = ref_channel
        self.device = device
@@ -932,11 +975,16 @@ class Speech_Enh_SS_MC:
    def inference(self, speech_path):
        speech, sr = soundfile.read(speech_path)
        speech = speech.T
-
+        # speech[None, ...] adds a leading axis of size 1 to the transposed array.
        enh_speech = self.separate_speech(speech[None, ...], fs=sr)
-        if len(enh_speech) == 1:
-            return enh_speech[0]
-        return enh_speech
+        audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
+        # Write only the first returned signal; squeeze() drops its length-1 axes.
+        soundfile.write(audio_filename, enh_speech[0].squeeze(), samplerate=sr)
+        # Any further entries of enh_speech are ignored.
+        # Returns the path of the wav file written above (random 8-character name under
+        # 'audio'), not the enhanced array as the removed lines did.
+        # NOTE(review): the 'audio' directory is presumably created elsewhere - confirm.
+        return audio_filename

 class ConversationBot:
    def __init__(self):
@@ -950,6 +998,9 @@ class ConversationBot:
        self.i2a = I2A(device="cuda:0")
        self.a2t = A2T(device="cpu")
        self.asr = ASR(device="cuda:0")
+        self.SE_SS_SC = Speech_Enh_SS_SC(device="cuda:0")
+        # self.SE_SS_MC = Speech_Enh_SS_MC(device="cuda:0")
+        self.SS = Speech_SS(device="cuda:0")
        self.inpaint = Inpaint(device="cuda:0")
        self.tts_ood = TTS_OOD(device="cpu")
        self.geneface = GeneFace(device="cuda:0")
@@ -985,6 +1036,19 @@ class ConversationBot:
                Tool(name="Synthesize Speech Given the User Input Text", func=self.tts.inference,
                     description="useful for when you want to convert a user input text into speech audio it saved it to a file."
                                 "The input to this tool should be a string, representing the text used to be converted to speech."),
+                # Tool(name="Speech Enhancement Or Separation In Single-Channel", func=self.SE_SS_SC.inference,
+                #      description="useful for when you want to enhance the quality of the speech signal by reducing background noise (single-channel), "
+                #                  "or separate each speech from the speech mixture (single-channel), receives audio_path as input."
+                #                  "The input to this tool should be a string, representing the audio_path."),
+                Tool(name="Speech Enhancement In Single-Channel", func=self.SE_SS_SC.inference,
+                     description="useful for when you want to enhance the quality of the speech signal by reducing background noise (single-channel), receives audio_path as input."
+                                 "The input to this tool should be a string, representing the audio_path."),
+                Tool(name="Speech Separation In Single-Channel", func=self.SS.inference,
+                     description="useful for when you want to separate each speech from the speech mixture, receives audio_path as input."
+                                 "The input to this tool should be a string, representing the audio_path."),
+                # Tool(name="Speech Enhancement In Multi-Channel", func=self.SE_SS_MC.inference,
+                #      description="useful for when you want to enhance the quality of the speech signal by reducing background noise (multi-channel), receives audio_path as input."
+                #                  "The input to this tool should be a string, representing the audio_path."),                                 
                Tool(name="Generate Audio From The Image", func=self.i2a.inference,
                     description="useful for when you want to generate an audio based on an image."
                                  "The input to this tool should be a string, representing the image_path. "),
@@ -1128,13 +1192,13 @@ class ConversationBot:
            print("Inputs:", file, state)
            print("======>Previous memory:\n %s" % self.agent.memory)
            audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
-            audio_load = whisper.load_audio(file.name)
-            soundfile.write(audio_filename, audio_load, samplerate = 16000)
+            # audio_load = whisper.load_audio(file.name)
+            audio_load, sr = soundfile.read(file.name)
+            soundfile.write(audio_filename, audio_load, samplerate = sr)
            description = self.a2t.inference(audio_filename)
            Human_prompt = "\nHuman: provide an audio named {}. The description is: {}. This information helps you to understand this audio, but you should use tools to finish following tasks, " \
                           "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(audio_filename, description)
            AI_prompt = "Received.  "
-            output_audio_filename = self.tts.inference(AI_prompt)
            self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
            print("======>Current memory:\n %s" % self.agent.memory)
            #state = state + [(f"<audio src=audio_filename controls=controls></audio>*{audio_filename}*", AI_prompt)]
@@ -1159,7 +1223,6 @@ class ConversationBot:
            Human_prompt = "\nHuman: provide a figure named {}. The description is: {}. This information helps you to understand this image, but you should use tools to finish following tasks, " \
                           "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(image_filename, description)
            AI_prompt = "Received.  "
-            output_audio_filename = self.tts.inference(AI_prompt)
            self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
            print("======>Current memory:\n %s" % self.agent.memory)
            state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
@@ -1315,4 +1378,4 @@ if __name__ == '__main__':
        clear_speech.click(lambda: [], None, state)
        clear_speech.click(bot.clear_video, None, outvideo)

-        demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
+        demo.launch(server_name="0.0.0.0", server_port=7861, share=True)