Merge branch 'main' into hzq

# Conflicts:
#	audio-chatgpt.py
PeppaPiggeee
2023-04-01 15:27:48 +08:00
7 changed files with 171 additions and 73 deletions

README.md

@@ -5,7 +5,7 @@
## Capabilities
Up-to-date link: https://eac422a9e2289d6b.gradio.app/
Up-to-date link: https://93868c7fa583f4b5.gradio.app
Here we list the capabilities of AudioGPT at this time. More supported models and tasks are coming soon. For prompt examples, refer to [asset](assets/README.md).

BIN  assets/7cb0d24f.wav (new binary file, not shown)

assets/README.md

@@ -1,37 +1,19 @@
# Prompt Example
## Text-To-Image
Input Example : Generate an image of a horse<br />
Output:<br />
![](t2i.png)<br />
## Text-To-Audio
Input Example : Generate an audio of a piano playing<br />
Output:<br />
![](t2a.png)<br />
Audio:<br />
<audio src="b973e878.wav" controls></audio><br />
## Text-To-Speech
## Speech
### Text-To-Speech
Input Example : Generate a speech with text "here we go"<br />
Output:<br />
![](tts.png)<br />
Audio:<br />
<audio src="fd5cf55e.wav" controls></audio><br />
## Text-To-Sing
Input Example : please generate a piece of singing voice. Text sequence is 小酒窝长睫毛AP是你最美的记号. Note sequence is C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4. Note duration sequence is 0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340.<br />
### Style Transfer Text-To-Speech
First upload your audio (.wav)<br />
Input Example : Speak using the voice of this audio. The text is "here we go".<br />
Output:<br />
![](t2s.png)<br />
Audio:<br />
<audio src="2bf90e35.wav" controls></audio><br />
## Image-To-Audio
First upload your image (.png)<br />
Input Example : Generate the audio of this image<br />
Output:<br />
![](i2a-2.png)<br />
Audio:<br />
<audio src="5d67d1b9.wav" controls></audio><br />
![](style_transfer_tts.png)<br />
## Speech Recognition
### Speech Recognition
First upload your audio (.wav)<br />
Audio Example :<br />
<audio src="Track 4.wav" controls></audio><br />
@@ -39,16 +21,53 @@ Input Example : Generate the text of this speech<br />
Output:<br />
![](asr.png)<br />
## Audio-To-Text
## Sing
### Text-To-Sing
Input Example : please generate a piece of singing voice. Text sequence is 小酒窝长睫毛AP是你最美的记号. Note sequence is C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4. Note duration sequence is 0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340.<br />
Output:<br />
![](t2s.png)<br />
Audio:<br />
<audio src="2bf90e35.wav" controls></audio><br />
## Audio
### Text-To-Audio
Input Example : Generate an audio of a piano playing<br />
Output:<br />
![](t2a.png)<br />
Audio:<br />
<audio src="b973e878.wav" controls></audio><br />
### Audio Inpainting
First upload your audio (.wav)<br />
Audio Example :<br />
<audio src="drums-and-music-playing-with-a-man-speaking.wav" controls></audio><br />
Input Example : I want to inpaint this audio.<br />
Output:<br />
![](inpaint-1.png)<br />
Then you can press the "Predict Masked Place" button<br />
Output:<br />
![](inpaint-2.png)<br />
Output Audio:<br />
<audio src="7cb0d24f.wav" controls></audio><br />
### Image-To-Audio
First upload your image (.png)<br />
Input Example : Generate the audio of this image<br />
Output:<br />
![](i2a-2.png)<br />
Audio:<br />
<audio src="5d67d1b9.wav" controls></audio><br />
### Audio-To-Text
First upload your audio (.wav)<br />
Audio Example :<br />
<audio src="a-group-of-sheep-are-baaing.wav" controls></audio><br />
Input Example : Please tell me the text description of this audio.<br />
Output:<br />
![](a2i.png)<br />
## Style Transfer Text-To-Speech
First upload your audio (.wav)<br />
Input Example : Speak using the voice of this audio. The text is "here we go".<br />
Output:<br />
![](style_transfer_tts.png)<br />
## Image
### Text-To-Image
Input Example : Generate an image of a horse<br />
Output:<br />
![](t2i.png)<br />


BIN  assets/inpaint-1.png (new binary file, 645 KiB, not shown)

BIN  assets/inpaint-2.png (new binary file, 184 KiB, not shown)

audio-chatgpt.py

@@ -6,10 +6,11 @@ sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Neura
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio_img'))
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, CLIPSegProcessor, CLIPSegForImageSegmentation
import matplotlib
import librosa
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from diffusers import StableDiffusionPipeline
from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler
from langchain.agents.initialize import initialize_agent
from langchain.agents.tools import Tool
from langchain.chains.conversation.memory import ConversationBufferMemory
@@ -17,20 +18,13 @@ from langchain.llms.openai import OpenAI
import re
import uuid
import soundfile
from scipy.io import wavfile
from diffusers import StableDiffusionInpaintPipeline
from PIL import Image
import numpy as np
from omegaconf import OmegaConf
from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering
import cv2
import einops
from pytorch_lightning import seed_everything
import random
from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration
from einops import repeat
from ldm.util import instantiate_from_config
from ldm.data.extract_mel_spectrogram import TRANSFORMS_16000
from pathlib import Path
from vocoder.hifigan.modules import VocoderHifigan
from vocoder.bigvgan.models import VocoderBigVGAN
from ldm.models.diffusion.ddim import DDIMSampler
from wav_evaluation.models.CLAPWrapper import CLAPWrapper
@@ -110,6 +104,15 @@ def initialize_model(config, ckpt, device):
    sampler = DDIMSampler(model)
    return sampler

def initialize_model_inpaint(config, ckpt):
    config = OmegaConf.load(config)
    model = instantiate_from_config(config.model)
    model.load_state_dict(torch.load(ckpt,map_location='cpu')["state_dict"], strict=False)
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = model.to(device)
    print(model.device,device,model.cond_stage_model.device)
    sampler = DDIMSampler(model)
    return sampler
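For reference, a usage sketch of the new loader, using the config and checkpoint paths the `Inpaint` class passes below (the call itself is illustrative, not part of the commit):

```python
# Build the inpainting DDIM sampler once at startup; the checkpoint is loaded
# non-strictly onto CPU first, then the model moves to CUDA when available.
sampler = initialize_model_inpaint(
    'text_to_audio/Make_An_Audio_inpaint/configs/inpaint/txt2audio_args.yaml',
    'text_to_audio/Make_An_Audio_inpaint/useful_ckpts/inpaint7_epoch00047.ckpt')
```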
def select_best_audio(prompt,wav_list):
    clap_model = CLAPWrapper('useful_ckpts/CLAP/CLAP_weights_2022.pth','useful_ckpts/CLAP/config.yml',use_cuda=torch.cuda.is_available())
@@ -124,6 +127,7 @@ def select_best_audio(prompt,wav_list):
    print(score_list,max_index)
    return wav_list[max_index]
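`select_best_audio` ranks candidate waveforms by CLAP text-audio similarity and keeps the argmax; the scoring loop itself is elided by the hunk. A shape-only sketch of the pattern, with the CLAP call abstracted behind a stand-in (`score_fn` is a hypothetical placeholder, not CLAPWrapper's confirmed API):

```python
import numpy as np

def select_best(prompt, wav_list, score_fn):
    # score_fn(prompt, wav_path) -> float stands in for the CLAP text-audio
    # similarity that clap_model computes in the real function.
    score_list = [score_fn(prompt, wav) for wav in wav_list]
    max_index = int(np.argmax(score_list))
    print(score_list, max_index)
    return wav_list[max_index]
```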
class T2I:
    def __init__(self, device):
        print("Initializing T2I to %s" % device)
@@ -348,9 +352,10 @@ class Inpaint:
    def __init__(self, device):
        print("Initializing Make-An-Audio-inpaint to %s" % device)
        self.device = device
        self.sampler = initialize_model('text_to_audio/Make_An_Audio_inpaint/configs/inpaint/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio_inpaint/useful_ckpts/inpaint7_epoch00047.ckpt')
        self.sampler = initialize_model_inpaint('text_to_audio/Make_An_Audio_inpaint/configs/inpaint/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio_inpaint/useful_ckpts/inpaint7_epoch00047.ckpt')
        self.vocoder = VocoderBigVGAN('./vocoder/logs/bigv16k53w',device=device)
    def make_batch_sd(mel, mask, num_samples=1):
        self.cmap_transform = matplotlib.cm.viridis
    def make_batch_sd(self, mel, mask, num_samples=1):
        mel = torch.from_numpy(mel)[None,None,...].to(dtype=torch.float32)
        mask = torch.from_numpy(mask)[None,None,...].to(dtype=torch.float32)
@@ -366,10 +371,11 @@ class Inpaint:
"masked_mel": repeat(masked_mel.to(device=self.device), "1 ... -> n ...", n=num_samples),
}
return batch
def gen_mel(input_audio):
sr,ori_wav = input_audio
    def gen_mel(self, input_audio_path):
        SAMPLE_RATE = 16000
        sr, ori_wav = wavfile.read(input_audio_path)
        print("gen_mel")
        print(sr,ori_wav.shape,ori_wav)
        ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0  # order='C' just enforces C-contiguous storage; int16 PCM scaled to [-1, 1]
        if len(ori_wav.shape)==2:  # stereo
            ori_wav = librosa.to_mono(ori_wav.T)  # gradio may load a wav as (wav_len, 2), but librosa expects (2, wav_len)
@@ -385,12 +391,35 @@ class Inpaint:
        mel = TRANSFORMS_16000(input_wav)
        return mel
    def show_mel_fn(input_audio):
    def gen_mel_audio(self, input_audio):
        SAMPLE_RATE = 16000
        sr,ori_wav = input_audio
        print("gen_mel_audio")
        print(sr,ori_wav.shape,ori_wav)
        ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0  # order='C' just enforces C-contiguous storage; int16 PCM scaled to [-1, 1]
        if len(ori_wav.shape)==2:  # stereo
            ori_wav = librosa.to_mono(ori_wav.T)  # gradio may load a wav as (wav_len, 2), but librosa expects (2, wav_len)
        print(sr,ori_wav.shape,ori_wav)
        ori_wav = librosa.resample(ori_wav,orig_sr = sr,target_sr = SAMPLE_RATE)
        mel_len,hop_size = 848,256
        input_len = mel_len * hop_size
        if len(ori_wav) < input_len:
            input_wav = np.pad(ori_wav,(0,input_len-len(ori_wav)),constant_values=0)  # pad up to input_len rather than by a full extra window
        else:
            input_wav = ori_wav[:input_len]
        mel = TRANSFORMS_16000(input_wav)
        return mel
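The two constants pin the model's input length: 848 mel frames at a hop of 256 samples is 217,088 samples, roughly 13.6 s at 16 kHz, so shorter clips are zero-padded and longer ones truncated. The arithmetic:

```python
mel_len, hop_size, sample_rate = 848, 256, 16000
input_len = mel_len * hop_size       # 217088 samples
print(input_len / sample_rate)       # ~13.57 seconds of audio per pass
```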
    def show_mel_fn(self, input_audio_path):
        crop_len = 500  # the full mel cannot be shown due to gradio's Image bug when using tool='sketch'
        crop_mel = self.gen_mel(input_audio)[:,:crop_len]
        color_mel = cmap_transform(crop_mel)
        return Image.fromarray((color_mel*255).astype(np.uint8))
    def inpaint(batch, seed, ddim_steps, num_samples=1, W=512, H=512):
        crop_mel = self.gen_mel(input_audio_path)[:,:crop_len]
        color_mel = self.cmap_transform(crop_mel)
        image = Image.fromarray((color_mel*255).astype(np.uint8))
        image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
        image.save(image_filename)
        return image_filename
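`show_mel_fn` now renders the cropped mel through matplotlib's viridis colormap and saves a PNG for the sketch canvas, instead of returning a PIL image directly. The colormap call maps a `[0, 1]` float array to RGBA; a standalone sketch of just that rendering step (the random mel is placeholder data):

```python
import matplotlib
import numpy as np
from PIL import Image

crop_mel = np.random.rand(80, 500)           # placeholder mel, values in [0, 1]
color_mel = matplotlib.cm.viridis(crop_mel)  # (80, 500, 4) RGBA floats in [0, 1]
Image.fromarray((color_mel * 255).astype(np.uint8)).save('mel_preview.png')
```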
    def inpaint(self, batch, seed, ddim_steps, num_samples=1, W=512, H=512):
        model = self.sampler.model
        prng = np.random.RandomState(seed)
@@ -411,7 +440,6 @@ class Inpaint:
        x_samples_ddim = model.decode_first_stage(samples_ddim)
        mask = batch["mask"]  # [-1,1]
        mel = torch.clamp((batch["mel"]+1.0)/2.0,min=0.0, max=1.0)
        mask = torch.clamp((batch["mask"]+1.0)/2.0,min=0.0, max=1.0)
        predicted_mel = torch.clamp((x_samples_ddim+1.0)/2.0,min=0.0, max=1.0)
@@ -420,17 +448,19 @@ class Inpaint:
        inpaint_wav = self.vocoder.vocode(inpainted)
        return inpainted, inpaint_wav
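The hunk elides the compositing step, but the clamped `mel`, `mask`, and `predicted_mel` above feed a standard inpainting blend; a sketch under that assumption (the shapes and the blend line are assumptions, not the file's confirmed code):

```python
import torch

mel = torch.rand(1, 1, 80, 848)                    # original mel in [0, 1]
predicted_mel = torch.rand(1, 1, 80, 848)          # model output in [0, 1]
mask = (torch.rand(1, 1, 80, 848) > 0.5).float()   # 1 marks the region to repaint
# Keep the original where mask == 0, take the prediction where mask == 1.
inpainted = (1 - mask) * mel + mask * predicted_mel
```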
    def predict(input_audio,mel_and_mask,ddim_steps,seed):
        show_mel = np.array(mel_and_mask['image'].convert("L"))/255  # only part of the mel was displayed, so the full mel must be regenerated from the audio
        mask = np.array(mel_and_mask["mask"].convert("L"))/255
    def inference(self, input_audio, mel_and_mask, seed=55, ddim_steps=100):
        SAMPLE_RATE = 16000
        torch.set_grad_enabled(False)
        mel_img = Image.open(mel_and_mask['image'])
        mask_img = Image.open(mel_and_mask["mask"])
        show_mel = np.array(mel_img.convert("L"))/255  # only part of the mel was displayed, so the full mel must be regenerated from the audio
        mask = np.array(mask_img.convert("L"))/255
        mel_bins,mel_len = 80,848
        input_mel = self.gen_mel(input_audio)[:,:mel_len]  # only part of the mel was displayed, so the full mel must be regenerated from the audio
        input_mel = self.gen_mel_audio(input_audio)[:,:mel_len]  # only part of the mel was displayed, so the full mel must be regenerated from the audio
        mask = np.pad(mask,((0,0),(0,mel_len-mask.shape[1])),mode='constant',constant_values=0)  # pad the mask back up to the full mel size
        print(mask.shape,input_mel.shape)
        with torch.no_grad():
            batch = make_batch_sd(input_mel,mask,device,num_samples=1)
            batch = self.make_batch_sd(input_mel,mask,num_samples=1)
            inpainted,gen_wav = self.inpaint(
                batch=batch,
                seed=seed,
@@ -439,10 +469,15 @@ class Inpaint:
                H=mel_bins, W=mel_len
            )
        inpainted = inpainted[:,:show_mel.shape[1]]
        color_mel = cmap_transform(inpainted)
        color_mel = self.cmap_transform(inpainted)
        input_len = int(input_audio[1].shape[0] * SAMPLE_RATE / input_audio[0])
        gen_wav = (gen_wav * 32768).astype(np.int16)[:input_len]
        return Image.fromarray((color_mel*255).astype(np.uint8)),(SAMPLE_RATE,gen_wav)
        image = Image.fromarray((color_mel*255).astype(np.uint8))
        image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
        image.save(image_filename)
        audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
        soundfile.write(audio_filename, gen_wav, samplerate=16000)
        return image_filename, audio_filename
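`inference` now writes its outputs to disk and returns file paths, so the chat handlers can embed them; the float-to-PCM conversion it relies on is the standard one. A self-contained sketch (the sine wave is just example data, and the clip guard is an addition for safety):

```python
import numpy as np
import soundfile

sr = 16000
wav = np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(np.float32)  # 1 s, 440 Hz
pcm16 = np.clip(wav * 32768, -32768, 32767).astype(np.int16)  # float [-1, 1] -> int16
soundfile.write('example.wav', pcm16, samplerate=16000)
```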
class ASR:
    def __init__(self, device):
@@ -481,6 +516,7 @@ class ConversationBot:
        self.i2a = I2A(device="cuda:1")
        self.a2t = A2T(device="cuda:2")
        self.asr = ASR(device="cuda:1")
        self.inpaint = Inpaint(device="cuda:0")
        self.tts_ood = TTS_OOD(device="cuda:0")
        self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
        self.tools = [
@@ -513,6 +549,9 @@ class ConversationBot:
Tool(name="Generate Text From The Audio", func=self.a2t.inference,
description="useful for when you want to describe an audio in text, receives audio_path as input."
"The input to this tool should be a string, representing the audio_path."),
Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input, "
"The input to this tool should be a string, representing the audio_path."),
Tool(name="Transcribe speech", func=self.asr.inference,
description="useful for when you want to know the text corresponding to a human speech, receives audio_path as input."
"The input to this tool should be a string, representing the audio_path.")]
@@ -536,7 +575,7 @@ class ConversationBot:
            response = res['output']
            state = state + [(text, response)]
            print("Outputs:", state)
            return state, state, None
            return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
        else:
            tool = res['intermediate_steps'][0][0].tool
            if tool == "Generate Image From User Input Text" or tool == "Generate Text From The Audio" or tool == "Transcribe speech":
@@ -544,13 +583,23 @@ class ConversationBot:
                response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
                state = state + [(text, response)]
                print("Outputs:", state)
                return state, state, None
                return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
            elif tool == "Audio Inpainting":
                audio_filename = res['intermediate_steps'][0][0].tool_input
                image_filename = res['intermediate_steps'][0][1]
                # self.is_visible(True)
                print("======>Current memory:\n %s" % self.agent.memory)
                response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
                #response = res['output'] + f"<audio src=audio_filename controls=controls></audio>"
                print(res)
                response = res['output']
                state = state + [(text, response)]
                print("Outputs:", state)
                return state, state, audio_filename
                return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Image.update(value=image_filename,visible=True), gr.Button.update(visible=True)
            print("======>Current memory:\n %s" % self.agent.memory)
            response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
            audio_filename = res['intermediate_steps'][0][1]
            state = state + [(text, response)]
            print("Outputs:", state)
            return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Image.update(visible=False), gr.Button.update(visible=False)
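`run_text` now returns five values because the `txt.submit` wiring below lists five output components; in Gradio 3.x, returning `gr.<Component>.update(...)` objects patches each output in place. A toy handler showing the one-to-one pairing (the names are illustrative):

```python
import gradio as gr

def toggle_outputs(show: bool):
    # One return value per output component, in the same order as the
    # outputs list passed to .submit() / .click().
    return (gr.Audio.update(visible=show),
            gr.Image.update(visible=show),
            gr.Button.update(visible=show))
```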
    def run_image_or_audio(self, file, state, txt):
        file_type = file.name[-3:]
@@ -566,10 +615,11 @@ class ConversationBot:
"rather than directly imagine from my description. If you understand, say \"Received\". \n".format(audio_filename, description)
AI_prompt = "Received. "
self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
print("======>Current memory:\n %s" % self.agent.memory)
#state = state + [(f"<audio src=audio_filename controls=controls></audio>*{audio_filename}*", AI_prompt)]
state = state + [(f"*{audio_filename}*", AI_prompt)]
print("Outputs:", state)
return state, state, txt + ' ' + audio_filename + ' ', audio_filename
return state, state, txt + ' ' + audio_filename + ' ', gr.Audio.update(value=audio_filename,visible=True)
else:
print("===============Running run_image =============")
print("Inputs:", file, state)
@@ -592,7 +642,26 @@ class ConversationBot:
print("======>Current memory:\n %s" % self.agent.memory)
state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
print("Outputs:", state)
return state, state, txt + ' ' + image_filename + ' ', None
return state, state, txt + ' ' + image_filename + ' ', gr.Audio.update(visible=False)
def inpainting(self, state, audio_filename, image_filename):
print("===============Running inpainting =============")
print("Inputs:", state)
print("======>Previous memory:\n %s" % self.agent.memory)
inpaint = Inpaint(device="cuda:0")
new_image_filename, new_audio_filename = inpaint.inference(audio_filename, image_filename)
AI_prompt = "Here are the predict audio and the mel spectrum." + f"*{new_audio_filename}*" + f"![](/file={new_image_filename})*{new_image_filename}*"
self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
print("======>Current memory:\n %s" % self.agent.memory)
state = state + [(f"Audio Inpainting", AI_prompt)]
print("Outputs:", state)
return state, state, gr.Image.update(visible=False), gr.Audio.update(value=new_audio_filename, visible=True), gr.Button.update(visible=False)
def clear_audio(self):
return gr.Audio.update(value=None, visible=False)
def clear_image(self):
return gr.Image.update(value=None, visible=False)
def clear_button(self):
return gr.Button.update(visible=False)
if __name__ == '__main__':
@@ -610,12 +679,22 @@ if __name__ == '__main__':
            with gr.Column(scale=0.15, min_width=0):
                btn = gr.UploadButton("Upload", file_types=["image","audio"])
        with gr.Column():
            outaudio = gr.Audio()
        txt.submit(bot.run_text, [txt, state], [chatbot, state, outaudio])
            outaudio = gr.Audio(visible=False)
        with gr.Row():
            with gr.Column():
                show_mel = gr.Image(type="filepath",tool='sketch',visible=False)
                run_button = gr.Button("Predict Masked Place",visible=False)
        txt.submit(bot.run_text, [txt, state], [chatbot, state, outaudio, show_mel, run_button])
        txt.submit(lambda: "", None, txt)
        btn.upload(bot.run_image_or_audio, [btn, state, txt], [chatbot, state, txt, outaudio])
        run_button.click(bot.inpainting, [state, outaudio, show_mel], [chatbot, state, show_mel, outaudio, run_button])
        clear.click(bot.memory.clear)
        clear.click(lambda: [], None, chatbot)
        clear.click(lambda: [], None, state)
        clear.click(lambda: None, None, outaudio)
        clear.click(lambda: None, None, txt)
        clear.click(bot.clear_button, None, run_button)
        clear.click(bot.clear_image, None, show_mel)
        clear.click(bot.clear_audio, None, outaudio)
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
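As a footnote on the wiring above: several `.click` registrations on the same button all fire on a single click, which is how `clear` resets the chat history, the state, and the hidden widgets independently. A minimal self-contained sketch of that pattern (component names are illustrative):

```python
import gradio as gr

with gr.Blocks() as demo:
    state = gr.State([])
    chatbot = gr.Chatbot()
    outaudio = gr.Audio(visible=False)
    clear = gr.Button("Clear")
    # Each registration runs on the same click event.
    clear.click(lambda: [], None, chatbot)
    clear.click(lambda: [], None, state)
    clear.click(lambda: gr.Audio.update(value=None, visible=False), None, outaudio)

demo.launch()
```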