This commit is contained in:
lmzjms
2023-03-30 21:48:59 +08:00
parent a869dcef4c
commit 7492e97023
6 changed files with 159 additions and 64 deletions

BIN assets/7cb0d24f.wav (new binary file, not shown)

@@ -1,37 +1,19 @@
# Prompt Example
## Text-To-Image
Input Example: Generate an image of a horse<br />
Output:<br />
![](t2i.png)<br />
## Text-To-Audio
Input Example: Generate an audio of a piano playing<br />
Output:<br />
![](t2a.png)<br />
Audio:<br />
<audio src="b973e878.wav" controls></audio><br />
## Text-To-Speech
## Speech
### Text-To-Speech
Input Example: Generate a speech with text "here we go"<br />
Output:<br />
![](tts.png)<br />
Audio:<br />
<audio src="fd5cf55e.wav" controls></audio><br />
## Text-To-Sing
Input Example: please generate a piece of singing voice. Text sequence is 小酒窝长睫毛AP是你最美的记号. Note sequence is C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4. Note duration sequence is 0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340.<br />
### Style Transfer Text-To-Speech
First upload your audio (.wav)<br />
Input Example: Speak using the voice of this audio. The text is "here we go".<br />
Output:<br />
![](t2s.png)<br />
Audio:<br />
<audio src="2bf90e35.wav" controls></audio><br />
## Image-To-Audio
First upload your image (.png)<br />
Input Example: Generate the audio of this image<br />
Output:<br />
![](i2a-2.png)<br />
Audio:<br />
<audio src="5d67d1b9.wav" controls></audio><br />
![](style_transfer_tts.png)<br />
## Speech Recognition
### Speech Recognition
First upload your audio (.wav)<br />
Audio Example:<br />
<audio src="Track 4.wav" controls></audio><br />
@@ -39,16 +21,53 @@ Input Example: Generate the text of this speech<br />
Output:<br />
![](asr.png)<br />
## Audio-To-Text
## Sing
### Text-To-Sing
Input Example: please generate a piece of singing voice. Text sequence is 小酒窝长睫毛AP是你最美的记号. Note sequence is C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4. Note duration sequence is 0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340.<br />
Output:<br />
![](t2s.png)<br />
Audio:<br />
<audio src="2bf90e35.wav" controls></audio><br />
## Audio
### Text-To-Audio
Input Example: Generate an audio of a piano playing<br />
Output:<br />
![](t2a.png)<br />
Audio:<br />
<audio src="b973e878.wav" controls></audio><br />
### Audio Inpainting
First upload your audio (.wav)<br />
Audio Example:<br />
<audio src="drums-and-music-playing-with-a-man-speaking.wav" controls></audio><br />
Input Example: I want to inpaint this audio.<br />
Output:<br />
![](inpaint-1.png)<br />
Then you can press the "Predict Masked Place" button<br />
Output:<br />
![](inpaint-2.png)<br />
Output Audio:<br />
<audio src="7cb0d24f.wav" controls></audio><br />
### Image-To-Audio
First upload your image (.png)<br />
Input Example: Generate the audio of this image<br />
Output:<br />
![](i2a-2.png)<br />
Audio:<br />
<audio src="5d67d1b9.wav" controls></audio><br />
### Audio-To-Text
First upload your audio (.wav)<br />
Audio Example:<br />
<audio src="a-group-of-sheep-are-baaing.wav" controls></audio><br />
Input Example: Please tell me the text description of this audio.<br />
Output:<br />
![](a2i.png)<br />
## Style Transfer Text-To-Speech
First upload your audio (.wav)<br />
Input Example: Speak using the voice of this audio. The text is "here we go".<br />
Output:<br />
![](style_transfer_tts.png)<br />
## Image
### Text-To-Image
Input Example: Generate an image of a horse<br />
Output:<br />
![](t2i.png)<br />

Binary file not shown.

BIN assets/inpaint-1.png (new binary file, not shown; 645 KiB)

BIN assets/inpaint-2.png (new binary file, not shown; 184 KiB)

@@ -6,6 +6,8 @@ sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio_img'))
import gradio as gr
import matplotlib
import librosa
from transformers import AutoModelForCausalLM, AutoTokenizer, CLIPSegProcessor, CLIPSegForImageSegmentation
import torch
from diffusers import StableDiffusionPipeline
@@ -25,6 +27,7 @@ from omegaconf import OmegaConf
from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering
import cv2
import einops
from einops import repeat
from pytorch_lightning import seed_everything
import random
from ldm.util import instantiate_from_config
@@ -112,7 +115,15 @@ def initialize_model(config, ckpt, device):
sampler = DDIMSampler(model)
return sampler
def initialize_model_inpaint(config, ckpt):
config = OmegaConf.load(config)
model = instantiate_from_config(config.model)
model.load_state_dict(torch.load(ckpt,map_location='cpu')["state_dict"], strict=False)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)
print(model.device,device,model.cond_stage_model.device)
sampler = DDIMSampler(model)
return sampler
def select_best_audio(prompt,wav_list):
clap_model = CLAPWrapper('useful_ckpts/CLAP/CLAP_weights_2022.pth','useful_ckpts/CLAP/config.yml',use_cuda=torch.cuda.is_available())
text_embeddings = clap_model.get_text_embeddings([prompt])
@@ -127,7 +138,6 @@ def select_best_audio(prompt,wav_list):
return wav_list[max_index]
class T2I:
def __init__(self, device):
print("Initializing T2I to %s" % device)
@@ -342,9 +352,10 @@ class Inpaint:
def __init__(self, device):
print("Initializing Make-An-Audio-inpaint to %s" % device)
self.device = device
self.sampler = initialize_model('text_to_audio/Make_An_Audio_inpaint/configs/inpaint/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio_inpaint/useful_ckpts/inpaint7_epoch00047.ckpt')
self.sampler = initialize_model_inpaint('text_to_audio/Make_An_Audio_inpaint/configs/inpaint/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio_inpaint/useful_ckpts/inpaint7_epoch00047.ckpt')
self.vocoder = VocoderBigVGAN('./vocoder/logs/bigv16k53w',device=device)
def make_batch_sd(mel, mask, num_samples=1):
self.cmap_transform = matplotlib.cm.viridis
def make_batch_sd(self, mel, mask, num_samples=1):
mel = torch.from_numpy(mel)[None,None,...].to(dtype=torch.float32)
mask = torch.from_numpy(mask)[None,None,...].to(dtype=torch.float32)
@@ -360,10 +371,11 @@ class Inpaint:
"masked_mel": repeat(masked_mel.to(device=self.device), "1 ... -> n ...", n=num_samples),
}
return batch
def gen_mel(input_audio):
sr,ori_wav = input_audio
def gen_mel(self, input_audio_path):
SAMPLE_RATE = 16000
sr, ori_wav = wavfile.read(input_audio_path)
print("gen_mel")
print(sr,ori_wav.shape,ori_wav)
ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0  # order='C' makes a C-contiguous copy; scale int16 PCM to [-1, 1)
if len(ori_wav.shape)==2:  # stereo
ori_wav = librosa.to_mono(ori_wav.T)  # gradio loads wav as (wav_len, 2) but librosa expects (2, wav_len)
@@ -379,12 +391,35 @@ class Inpaint:
mel = TRANSFORMS_16000(input_wav)
return mel
def show_mel_fn(input_audio):
def gen_mel_audio(self, input_audio):
SAMPLE_RATE = 16000
sr,ori_wav = input_audio
print("gen_mel_audio")
print(sr,ori_wav.shape,ori_wav)
ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0  # order='C' makes a C-contiguous copy; scale int16 PCM to [-1, 1)
if len(ori_wav.shape)==2:  # stereo
ori_wav = librosa.to_mono(ori_wav.T)  # gradio loads wav as (wav_len, 2) but librosa expects (2, wav_len)
print(sr,ori_wav.shape,ori_wav)
ori_wav = librosa.resample(ori_wav,orig_sr = sr,target_sr = SAMPLE_RATE)
mel_len,hop_size = 848,256
input_len = mel_len * hop_size
if len(ori_wav) < input_len:
input_wav = np.pad(ori_wav,(0,mel_len*hop_size),constant_values=0)
else:
input_wav = ori_wav[:input_len]
mel = TRANSFORMS_16000(input_wav)
return mel
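# Note on the fixed geometry above: mel_len * hop_size = 848 * 256 = 217088 samples,
# i.e. about 13.6 s of 16 kHz audio; shorter clips are zero-padded and longer ones are cropped.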
def show_mel_fn(self, input_audio_path):
crop_len = 500 # the full mel cannot be shown due to a Gradio Image bug when tool='sketch' is used
crop_mel = self.gen_mel(input_audio)[:,:crop_len]
color_mel = cmap_transform(crop_mel)
return Image.fromarray((color_mel*255).astype(np.uint8))
def inpaint(batch, seed, ddim_steps, num_samples=1, W=512, H=512):
crop_mel = self.gen_mel(input_audio_path)[:,:crop_len]
color_mel = self.cmap_transform(crop_mel)
image = Image.fromarray((color_mel*255).astype(np.uint8))
image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
image.save(image_filename)
return image_filename
def inpaint(self, batch, seed, ddim_steps, num_samples=1, W=512, H=512):
model = self.sampler.model
prng = np.random.RandomState(seed)
@@ -414,17 +449,19 @@ class Inpaint:
inpaint_wav = self.vocoder.vocode(inpainted)
return inpainted, inpaint_wav
def predict(input_audio,mel_and_mask,ddim_steps,seed):
show_mel = np.array(mel_and_mask['image'].convert("L"))/255  # only part of the mel is displayed, so the full mel is regenerated from the audio
mask = np.array(mel_and_mask["mask"].convert("L"))/255
def inference(self, input_audio, mel_and_mask, seed = 55, ddim_steps = 100):
SAMPLE_RATE = 16000
torch.set_grad_enabled(False)
mel_img = Image.open(mel_and_mask['image'])
mask_img = Image.open(mel_and_mask["mask"])
show_mel = np.array(mel_img.convert("L"))/255  # only part of the mel is displayed, so the full mel is regenerated from the audio
mask = np.array(mask_img.convert("L"))/255
mel_bins,mel_len = 80,848
input_mel = self.gen_mel(input_audio)[:,:mel_len]  # only part of the mel is displayed, so regenerate the full mel from the audio
input_mel = self.gen_mel_audio(input_audio)[:,:mel_len]  # only part of the mel is displayed, so regenerate the full mel from the audio
mask = np.pad(mask,((0,0),(0,mel_len-mask.shape[1])),mode='constant',constant_values=0)  # pad the mask back to the full mel width
print(mask.shape,input_mel.shape)
with torch.no_grad():
batch = make_batch_sd(input_mel,mask,device,num_samples=1)
batch = self.make_batch_sd(input_mel,mask,num_samples=1)
inpainted,gen_wav = self.inpaint(
batch=batch,
seed=seed,
@@ -433,10 +470,15 @@ class Inpaint:
H=mel_bins, W=mel_len
)
inpainted = inpainted[:,:show_mel.shape[1]]
color_mel = cmap_transform(inpainted)
color_mel = self.cmap_transform(inpainted)
input_len = int(input_audio[1].shape[0] * SAMPLE_RATE / input_audio[0])
gen_wav = (gen_wav * 32768).astype(np.int16)[:input_len]
return Image.fromarray((color_mel*255).astype(np.uint8)),(SAMPLE_RATE,gen_wav)
image = Image.fromarray((color_mel*255).astype(np.uint8))
image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
image.save(image_filename)
audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
soundfile.write(audio_filename, gen_wav, samplerate = 16000)
return image_filename, audio_filename
class ASR:
def __init__(self, device):
@@ -474,7 +516,8 @@ class ConversationBot:
self.i2a = I2A(device="cuda:1")
self.a2t = A2T(device="cuda:2")
self.asr = ASR(device="cuda:1")
self.tts_ood = TTS_OOD(device="cuda:0")
self.inpaint = Inpaint(device="cuda:0")
#self.tts_ood = TTS_OOD(device="cuda:0")
self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
self.tools = [
Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
@@ -486,11 +529,11 @@ class ConversationBot:
Tool(name="Generate Audio From User Input Text", func=self.t2a.inference,
description="useful for when you want to generate an audio from a user input text and it saved it to a file."
"The input to this tool should be a string, representing the text used to generate audio."),
Tool(
name="Generate human speech with style derived from a speech reference and user input text and save it to a file", func= self.tts_ood.inference,
description="useful for when you want to generate speech samples with styles (e.g., timbre, emotion, and prosody) derived from a reference custom voice."
"Like: Generate a speech with style transferred from this voice. The text is xxx., or speak using the voice of this audio. The text is xxx."
"The input to this tool should be a comma seperated string of two, representing reference audio path and input text."),
# Tool(
# name="Generate human speech with style derived from a speech reference and user input text and save it to a file", func= self.tts_ood.inference,
# description="useful for when you want to generate speech samples with styles (e.g., timbre, emotion, and prosody) derived from a reference custom voice."
# "Like: Generate a speech with style transferred from this voice. The text is xxx., or speak using the voice of this audio. The text is xxx."
# "The input to this tool should be a comma seperated string of two, representing reference audio path and input text."),
Tool(name="Generate singing voice From User Input Text, Note and Duration Sequence", func= self.t2s.inference,
description="useful for when you want to generate a piece of singing voice (Optional: from User Input Text, Note and Duration Sequence) and save it to a file."
"If Like: Generate a piece of singing voice, the input to this tool should be \"\" since there is no User Input Text, Note and Duration Sequence ."
@@ -506,6 +549,9 @@ class ConversationBot:
Tool(name="Generate Text From The Audio", func=self.a2t.inference,
description="useful for when you want to describe an audio in text, receives audio_path as input."
"The input to this tool should be a string, representing the audio_path."),
Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input, "
"The input to this tool should be a string, representing the audio_path."),
Tool(name="Transcribe speech", func=self.asr.inference,
description="useful for when you want to know the text corresponding to a human speech, receives audio_path as input."
"The input to this tool should be a string, representing the audio_path.")]
@@ -529,7 +575,7 @@ class ConversationBot:
response = res['output']
state = state + [(text, response)]
print("Outputs:", state)
return state, state, None
return state, state, None, None
else:
tool = res['intermediate_steps'][0][0].tool
if tool == "Generate Image From User Input Text" or tool == "Generate Text From The Audio" or tool == "Transcribe speech":
@@ -537,13 +583,22 @@ class ConversationBot:
response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
state = state + [(text, response)]
print("Outputs:", state)
return state, state, None
return state, state, None, None
elif tool == "Audio Inpainting":
audio_filename = res['intermediate_steps'][0][0].tool_input
image_filename = res['intermediate_steps'][0][1]
print("======>Current memory:\n %s" % self.agent.memory)
response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
#response = res['output'] + f"<audio src=audio_filename controls=controls></audio>"
print(res)
response = res['output']
state = state + [(text, response)]
print("Outputs:", state)
return state, state, audio_filename
return state, state, audio_filename, image_filename
print("======>Current memory:\n %s" % self.agent.memory)
response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
audio_filename = res['intermediate_steps'][0][1]
state = state + [(text, response)]
print("Outputs:", state)
return state, state, audio_filename, None
def run_image_or_audio(self, file, state, txt):
file_type = file.name[-3:]
@@ -559,6 +614,7 @@ class ConversationBot:
"rather than directly imagine from my description. If you understand, say \"Received\". \n".format(audio_filename, description)
AI_prompt = "Received. "
self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
print("======>Current memory:\n %s" % self.agent.memory)
#state = state + [(f"<audio src=audio_filename controls=controls></audio>*{audio_filename}*", AI_prompt)]
state = state + [(f"*{audio_filename}*", AI_prompt)]
print("Outputs:", state)
@@ -587,6 +643,19 @@ class ConversationBot:
print("Outputs:", state)
return state, state, txt + ' ' + image_filename + ' ', None
def inpainting(self, state, audio_filename, image_filename):
print("===============Running inpainting =============")
print("Inputs:", state)
print("======>Previous memory:\n %s" % self.agent.memory)
inpaint = Inpaint(device="cuda:0")
new_image_filename, new_audio_filename = inpaint.inference(audio_filename, image_filename)
AI_prompt = "Here are the predict audio and the mel spectrum." + f"*{new_audio_filename}*" + f"![](/file={new_image_filename})*{new_image_filename}*"
self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
print("======>Current memory:\n %s" % self.agent.memory)
state = state + [(f"Audio Inpainting", AI_prompt)]
print("Outputs:", state)
return state, state, None, new_audio_filename
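# Design note: __init__ already builds self.inpaint on cuda:0, so this handler could reuse it
# instead of reloading the checkpoint on every click, e.g.:
#     new_image_filename, new_audio_filename = self.inpaint.inference(audio_filename, image_filename)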
if __name__ == '__main__':
bot = ConversationBot()
@@ -604,11 +673,18 @@ if __name__ == '__main__':
btn = gr.UploadButton("Upload", file_types=["image","audio"])
with gr.Column():
outaudio = gr.Audio()
txt.submit(bot.run_text, [txt, state], [chatbot, state, outaudio])
with gr.Row():
with gr.Column():
show_mel = gr.Image(type="filepath",tool='sketch')
run_button = gr.Button("Predict Masked Place")
txt.submit(bot.run_text, [txt, state], [chatbot, state, outaudio, show_mel])
txt.submit(lambda: "", None, txt)
btn.upload(bot.run_image_or_audio, [btn, state, txt], [chatbot, state, txt, outaudio])
run_button.click(bot.inpainting, [state, outaudio, show_mel], [chatbot, state, show_mel, outaudio])
clear.click(bot.memory.clear)
clear.click(lambda: [], None, chatbot)
clear.click(lambda: [], None, state)
clear.click(lambda: None, None, show_mel)
clear.click(lambda: None, None, outaudio)
demo.launch(server_name="0.0.0.0", server_port=7860, share=True)