Mirror of https://github.com/AIGC-Audio/AudioGPT.git (synced 2025-12-16 20:07:58 +01:00)

Commit: Merge branch 'main' into hzq

# Conflicts:
#	audio-chatgpt.py
@@ -5,7 +5,7 @@

## Capabilities

Up-to-date link: https://eac422a9e2289d6b.gradio.app/
Up-to-date link: https://93868c7fa583f4b5.gradio.app

Here we list the current capabilities of AudioGPT. More supported models and tasks are coming soon. For prompt examples, refer to [asset](assets/README.md).
BIN assets/7cb0d24f.wav (new file; binary file not shown)
@@ -1,37 +1,19 @@

# Prompt Example

## Text-To-Image

Input Example: Generate an image of a horse<br />

Output:<br />

<br />

## Text-To-Audio

Input Example: Generate an audio of a piano playing<br />

Output:<br />

<br />

Audio:<br />

<audio src="b973e878.wav" controls></audio><br />

## Text-To-Speech

## Speech

### Text-To-Speech

Input Example: Generate a speech with text "here we go"<br />

Output:<br />

<br />

Audio:<br />

<audio src="fd5cf55e.wav" controls></audio><br />

## Text-To-Sing

Input Example: please generate a piece of singing voice. Text sequence is 小酒窝长睫毛AP是你最美的记号. Note sequence is C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4. Note duration sequence is 0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340.<br />

### Style Transfer Text-To-Speech

First upload your audio (.wav)<br />

Input Example: Speak using the voice of this audio. The text is "here we go".<br />

Output:<br />

<br />

Audio:<br />

<audio src="2bf90e35.wav" controls></audio><br />

## Image-To-Audio

First upload your image (.png)<br />

Input Example: Generate the audio of this image<br />

Output:<br />

<br />

Audio:<br />

<audio src="5d67d1b9.wav" controls></audio><br />

<br />

## Speech Recognition

### Speech Recognition

First upload your audio (.wav)<br />

Audio Example:<br />

<audio src="Track 4.wav" controls></audio><br />
@@ -39,16 +21,53 @@ Input Example: Generate the text of this speech<br />

Output:<br />

<br />

## Audio-To-Text

## Sing

### Text-To-Sing

Input Example: please generate a piece of singing voice. Text sequence is 小酒窝长睫毛AP是你最美的记号. Note sequence is C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4. Note duration sequence is 0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340.<br />

Output:<br />

<br />

Audio:<br />

<audio src="2bf90e35.wav" controls></audio><br />
## Audio

### Text-To-Audio

Input Example: Generate an audio of a piano playing<br />

Output:<br />

<br />

Audio:<br />

<audio src="b973e878.wav" controls></audio><br />

### Audio Inpainting

First upload your audio (.wav)<br />

Audio Example:<br />

<audio src="drums-and-music-playing-with-a-man-speaking.wav" controls></audio><br />

Input Example: I want to inpaint this audio.<br />

Output:<br />

<br />

Then you can press the "Predict Masked Place" button.<br />

Output:<br />

<br />

Output Audio:<br />

<audio src="7cb0d24f.wav" controls></audio><br />
### Image-To-Audio

First upload your image (.png)<br />

Input Example: Generate the audio of this image<br />

Output:<br />

<br />

Audio:<br />

<audio src="5d67d1b9.wav" controls></audio><br />

### Audio-To-Text

First upload your audio (.wav)<br />

Audio Example:<br />

<audio src="a-group-of-sheep-are-baaing.wav" controls></audio><br />

Input Example: Please tell me the text description of this audio.<br />

Output:<br />

<br />

## Style Transfer Text-To-Speech

First upload your audio (.wav)<br />

Input Example: Speak using the voice of this audio. The text is "here we go".<br />

Output:<br />

<br />

## Image

### Text-To-Image

Input Example: Generate an image of a horse<br />

Output:<br />

<br />
BIN assets/drums-and-music-playing-with-a-man-speaking.wav (new file; binary file not shown)
BIN assets/inpaint-1.png (new file; 645 KiB)
BIN assets/inpaint-2.png (new file; 184 KiB)
audio-chatgpt.py (161 changed lines)
@@ -6,10 +6,11 @@ sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Neura
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio_img'))
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, CLIPSegProcessor, CLIPSegForImageSegmentation
import matplotlib
import librosa
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from diffusers import StableDiffusionPipeline
from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler
from langchain.agents.initialize import initialize_agent
from langchain.agents.tools import Tool
from langchain.chains.conversation.memory import ConversationBufferMemory
@@ -17,20 +18,13 @@ from langchain.llms.openai import OpenAI
import re
import uuid
import soundfile
from scipy.io import wavfile
from diffusers import StableDiffusionInpaintPipeline
from PIL import Image
import numpy as np
from omegaconf import OmegaConf
from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering
import cv2
import einops
from pytorch_lightning import seed_everything
import random
from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration
from einops import repeat
from ldm.util import instantiate_from_config
from ldm.data.extract_mel_spectrogram import TRANSFORMS_16000
from pathlib import Path
from vocoder.hifigan.modules import VocoderHifigan
from vocoder.bigvgan.models import VocoderBigVGAN
from ldm.models.diffusion.ddim import DDIMSampler
from wav_evaluation.models.CLAPWrapper import CLAPWrapper
@@ -110,6 +104,15 @@ def initialize_model(config, ckpt, device):
    sampler = DDIMSampler(model)
    return sampler

def initialize_model_inpaint(config, ckpt):
    config = OmegaConf.load(config)
    model = instantiate_from_config(config.model)
    model.load_state_dict(torch.load(ckpt, map_location='cpu')["state_dict"], strict=False)
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = model.to(device)
    print(model.device, device, model.cond_stage_model.device)
    sampler = DDIMSampler(model)
    return sampler

def select_best_audio(prompt, wav_list):
    clap_model = CLAPWrapper('useful_ckpts/CLAP/CLAP_weights_2022.pth', 'useful_ckpts/CLAP/config.yml', use_cuda=torch.cuda.is_available())
@@ -124,6 +127,7 @@ def select_best_audio(prompt, wav_list):
    print(score_list, max_index)
    return wav_list[max_index]
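The middle of `select_best_audio` is elided by the diff; from the visible fragments it scores each candidate wav against the text prompt with CLAP and returns the argmax. A minimal sketch of that ranking pattern, with `clap_score` standing in for whatever similarity call `CLAPWrapper` actually exposes (the stub here is hypothetical):

```python
import numpy as np

def clap_score(prompt: str, wav_path: str) -> float:
    # Hypothetical stand-in for CLAPWrapper's text-audio similarity.
    return 0.0  # replace with the real CLAP similarity call

def select_best_audio_sketch(prompt, wav_list):
    # Score every candidate against the prompt and keep the best match.
    score_list = [clap_score(prompt, wav) for wav in wav_list]
    max_index = int(np.argmax(score_list))
    print(score_list, max_index)
    return wav_list[max_index]
```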
class T2I:
    def __init__(self, device):
        print("Initializing T2I to %s" % device)
@@ -348,9 +352,10 @@ class Inpaint:
    def __init__(self, device):
        print("Initializing Make-An-Audio-inpaint to %s" % device)
        self.device = device
        self.sampler = initialize_model('text_to_audio/Make_An_Audio_inpaint/configs/inpaint/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio_inpaint/useful_ckpts/inpaint7_epoch00047.ckpt')
        self.sampler = initialize_model_inpaint('text_to_audio/Make_An_Audio_inpaint/configs/inpaint/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio_inpaint/useful_ckpts/inpaint7_epoch00047.ckpt')
        self.vocoder = VocoderBigVGAN('./vocoder/logs/bigv16k53w', device=device)
    def make_batch_sd(mel, mask, num_samples=1):
        self.cmap_transform = matplotlib.cm.viridis
    def make_batch_sd(self, mel, mask, num_samples=1):
        mel = torch.from_numpy(mel)[None,None,...].to(dtype=torch.float32)
        mask = torch.from_numpy(mask)[None,None,...].to(dtype=torch.float32)
@@ -366,10 +371,11 @@ class Inpaint:
            "masked_mel": repeat(masked_mel.to(device=self.device), "1 ... -> n ...", n=num_samples),
        }
        return batch
    def gen_mel(input_audio):
        sr, ori_wav = input_audio
    def gen_mel(self, input_audio_path):
        SAMPLE_RATE = 16000
        sr, ori_wav = wavfile.read(input_audio_path)
        print("gen_mel")
        print(sr, ori_wav.shape, ori_wav)

        ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0  # order='C' just stores the array in C (row-major) layout; safe to ignore
        if len(ori_wav.shape) == 2:  # stereo
            ori_wav = librosa.to_mono(ori_wav.T)  # gradio loads wav with shape (wav_len, 2) but librosa expects (2, wav_len)
@@ -385,12 +391,35 @@ class Inpaint:
        mel = TRANSFORMS_16000(input_wav)
        return mel
    def show_mel_fn(input_audio):
    def gen_mel_audio(self, input_audio):
        SAMPLE_RATE = 16000
        sr, ori_wav = input_audio
        print("gen_mel_audio")
        print(sr, ori_wav.shape, ori_wav)

        ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0  # order='C' just stores the array in C (row-major) layout; safe to ignore
        if len(ori_wav.shape) == 2:  # stereo
            ori_wav = librosa.to_mono(ori_wav.T)  # gradio loads wav with shape (wav_len, 2) but librosa expects (2, wav_len)
        print(sr, ori_wav.shape, ori_wav)
        ori_wav = librosa.resample(ori_wav, orig_sr=sr, target_sr=SAMPLE_RATE)

        mel_len, hop_size = 848, 256
        input_len = mel_len * hop_size
        if len(ori_wav) < input_len:
            input_wav = np.pad(ori_wav, (0, mel_len * hop_size), constant_values=0)
        else:
            input_wav = ori_wav[:input_len]
        mel = TRANSFORMS_16000(input_wav)
        return mel
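`gen_mel_audio` fixes the model's input length at `mel_len * hop_size` samples: 848 frames at a 256-sample hop is 217,088 samples, about 13.57 s at 16 kHz. Shorter clips are zero-padded and longer ones truncated to that length. A quick check of the arithmetic, using the constants from the code above:

```python
mel_len, hop_size, sample_rate = 848, 256, 16000
input_len = mel_len * hop_size               # 217088 samples
print(input_len, input_len / sample_rate)    # 217088 samples ≈ 13.568 s
```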
    def show_mel_fn(self, input_audio_path):
        crop_len = 500  # the full mel cannot be shown due to gradio's Image bug when using tool='sketch'
        crop_mel = self.gen_mel(input_audio)[:,:crop_len]
        color_mel = cmap_transform(crop_mel)
        return Image.fromarray((color_mel*255).astype(np.uint8))
    def inpaint(batch, seed, ddim_steps, num_samples=1, W=512, H=512):
        crop_mel = self.gen_mel(input_audio_path)[:,:crop_len]
        color_mel = self.cmap_transform(crop_mel)
        image = Image.fromarray((color_mel*255).astype(np.uint8))
        image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
        image.save(image_filename)
        return image_filename
    def inpaint(self, batch, seed, ddim_steps, num_samples=1, W=512, H=512):
        model = self.sampler.model

        prng = np.random.RandomState(seed)
@@ -411,7 +440,6 @@ class Inpaint:
        x_samples_ddim = model.decode_first_stage(samples_ddim)

        mask = batch["mask"]  # [-1,1]
        mel = torch.clamp((batch["mel"]+1.0)/2.0, min=0.0, max=1.0)
        mask = torch.clamp((batch["mask"]+1.0)/2.0, min=0.0, max=1.0)
        predicted_mel = torch.clamp((x_samples_ddim+1.0)/2.0, min=0.0, max=1.0)
@@ -420,17 +448,19 @@ class Inpaint:
        inpaint_wav = self.vocoder.vocode(inpainted)

        return inpainted, inpaint_wav
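The diff elides the line that actually composites `inpainted` from the clamped tensors above; presumably it is a convex blend that keeps the original mel where the mask is off and takes the model's prediction where it is on. A sketch of that blend under this assumption (tensor names follow the code, but the exact AudioGPT line may differ):

```python
import torch

# mel, mask, predicted_mel: float tensors in [0, 1] with the same shape (assumed).
def blend_inpaint(mel: torch.Tensor, mask: torch.Tensor, predicted_mel: torch.Tensor) -> torch.Tensor:
    # Keep original content where mask == 0, model output where mask == 1.
    return (1 - mask) * mel + mask * predicted_mel
```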
    def predict(input_audio, mel_and_mask, ddim_steps, seed):
        show_mel = np.array(mel_and_mask['image'].convert("L"))/255  # the displayed mel shows only part of the spectrogram, so regenerate the mel from the audio
        mask = np.array(mel_and_mask["mask"].convert("L"))/255

    def inference(self, input_audio, mel_and_mask, seed=55, ddim_steps=100):
        SAMPLE_RATE = 16000
        torch.set_grad_enabled(False)
        mel_img = Image.open(mel_and_mask['image'])
        mask_img = Image.open(mel_and_mask["mask"])
        show_mel = np.array(mel_img.convert("L"))/255  # the displayed mel shows only part of the spectrogram, so regenerate the mel from the audio
        mask = np.array(mask_img.convert("L"))/255
        mel_bins, mel_len = 80, 848

        input_mel = self.gen_mel(input_audio)[:,:mel_len]  # regenerate the full mel from the audio
        input_mel = self.gen_mel_audio(input_audio)[:,:mel_len]  # regenerate the full mel from the audio
        mask = np.pad(mask, ((0,0), (0, mel_len - mask.shape[1])), mode='constant', constant_values=0)  # pad the mask to the size of the original mel
        print(mask.shape, input_mel.shape)
        with torch.no_grad():
            batch = make_batch_sd(input_mel, mask, device, num_samples=1)
            batch = self.make_batch_sd(input_mel, mask, num_samples=1)
            inpainted, gen_wav = self.inpaint(
                batch=batch,
                seed=seed,
@@ -439,10 +469,15 @@ class Inpaint:
                H=mel_bins, W=mel_len
            )
        inpainted = inpainted[:,:show_mel.shape[1]]
        color_mel = cmap_transform(inpainted)
        color_mel = self.cmap_transform(inpainted)
        input_len = int(input_audio[1].shape[0] * SAMPLE_RATE / input_audio[0])
        gen_wav = (gen_wav * 32768).astype(np.int16)[:input_len]
        return Image.fromarray((color_mel*255).astype(np.uint8)), (SAMPLE_RATE, gen_wav)
        image = Image.fromarray((color_mel*255).astype(np.uint8))
        image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
        image.save(image_filename)
        audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
        soundfile.write(audio_filename, gen_wav, samplerate=16000)
        return image_filename, audio_filename
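`inference` converts the vocoder's float output back to 16-bit PCM and trims it to the original clip length (`input_audio` is a `(sample_rate, samples)` tuple). A small round-trip illustration of that scaling convention, matching the `/ 32768.0` in `gen_mel` and the `* 32768` above:

```python
import numpy as np

float_wav = np.array([0.0, 0.5, -0.5, 0.999], dtype=np.float32)  # vocoder-style output in [-1, 1)
pcm = (float_wav * 32768).astype(np.int16)       # 16-bit PCM: [0, 16384, -16384, 32735]
restored = pcm.astype(np.float32) / 32768.0      # matches the loading convention in gen_mel
print(pcm, np.max(np.abs(float_wav - restored)))  # round-trip error < 1/32768
```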

class ASR:
    def __init__(self, device):
@@ -481,6 +516,7 @@ class ConversationBot:
        self.i2a = I2A(device="cuda:1")
        self.a2t = A2T(device="cuda:2")
        self.asr = ASR(device="cuda:1")
        self.inpaint = Inpaint(device="cuda:0")
        self.tts_ood = TTS_OOD(device="cuda:0")
        self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
        self.tools = [
@@ -513,6 +549,9 @@ class ConversationBot:
            Tool(name="Generate Text From The Audio", func=self.a2t.inference,
                 description="useful for when you want to describe an audio in text, receives audio_path as input."
                             "The input to this tool should be a string, representing the audio_path."),
            Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
                 description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input, "
                             "The input to this tool should be a string, representing the audio_path."),
            Tool(name="Transcribe speech", func=self.asr.inference,
                 description="useful for when you want to know the text corresponding to a human speech, receives audio_path as input."
                             "The input to this tool should be a string, representing the audio_path.")]
@@ -536,7 +575,7 @@ class ConversationBot:
            response = res['output']
            state = state + [(text, response)]
            print("Outputs:", state)
            return state, state, None
            return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
        else:
            tool = res['intermediate_steps'][0][0].tool
            if tool == "Generate Image From User Input Text" or tool == "Generate Text From The Audio" or tool == "Transcribe speech":
@@ -544,13 +583,23 @@ class ConversationBot:
                response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
                state = state + [(text, response)]
                print("Outputs:", state)
                return state, state, None
                return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
            elif tool == "Audio Inpainting":
                audio_filename = res['intermediate_steps'][0][0].tool_input
                image_filename = res['intermediate_steps'][0][1]
                # self.is_visible(True)
                print("======>Current memory:\n %s" % self.agent.memory)
                response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
                # response = res['output'] + f"<audio src=audio_filename controls=controls></audio>"
                print(res)
                response = res['output']
                state = state + [(text, response)]
                print("Outputs:", state)
                return state, state, audio_filename
                return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Image.update(value=image_filename,visible=True), gr.Button.update(visible=True)
            print("======>Current memory:\n %s" % self.agent.memory)
            response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
            audio_filename = res['intermediate_steps'][0][1]
            state = state + [(text, response)]
            print("Outputs:", state)
            return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Image.update(visible=False), gr.Button.update(visible=False)

    def run_image_or_audio(self, file, state, txt):
        file_type = file.name[-3:]
@@ -566,10 +615,11 @@ class ConversationBot:
                           "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(audio_filename, description)
            AI_prompt = "Received. "
            self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
            print("======>Current memory:\n %s" % self.agent.memory)
            # state = state + [(f"<audio src=audio_filename controls=controls></audio>*{audio_filename}*", AI_prompt)]
            state = state + [(f"*{audio_filename}*", AI_prompt)]
            print("Outputs:", state)
            return state, state, txt + ' ' + audio_filename + ' ', audio_filename
            return state, state, txt + ' ' + audio_filename + ' ', gr.Audio.update(value=audio_filename,visible=True)
        else:
            print("===============Running run_image =============")
            print("Inputs:", file, state)
@@ -592,7 +642,26 @@ class ConversationBot:
            print("======>Current memory:\n %s" % self.agent.memory)
            state = state + [(f"*{image_filename}*", AI_prompt)]
            print("Outputs:", state)
            return state, state, txt + ' ' + image_filename + ' ', None
            return state, state, txt + ' ' + image_filename + ' ', gr.Audio.update(visible=False)

    def inpainting(self, state, audio_filename, image_filename):
        print("===============Running inpainting =============")
        print("Inputs:", state)
        print("======>Previous memory:\n %s" % self.agent.memory)
        inpaint = Inpaint(device="cuda:0")
        new_image_filename, new_audio_filename = inpaint.inference(audio_filename, image_filename)
        AI_prompt = "Here are the predicted audio and the mel spectrogram." + f"*{new_audio_filename}*" + f"*{new_image_filename}*"
        self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
        print("======>Current memory:\n %s" % self.agent.memory)
        state = state + [("Audio Inpainting", AI_prompt)]
        print("Outputs:", state)
        return state, state, gr.Image.update(visible=False), gr.Audio.update(value=new_audio_filename, visible=True), gr.Button.update(visible=False)
    def clear_audio(self):
        return gr.Audio.update(value=None, visible=False)
    def clear_image(self):
        return gr.Image.update(value=None, visible=False)
    def clear_button(self):
        return gr.Button.update(visible=False)

if __name__ == '__main__':
@@ -610,12 +679,22 @@ if __name__ == '__main__':
            with gr.Column(scale=0.15, min_width=0):
                btn = gr.UploadButton("Upload", file_types=["image","audio"])
        with gr.Column():
            outaudio = gr.Audio()
    txt.submit(bot.run_text, [txt, state], [chatbot, state, outaudio])
            outaudio = gr.Audio(visible=False)
        with gr.Row():
            with gr.Column():
                show_mel = gr.Image(type="filepath", tool='sketch', visible=False)
                run_button = gr.Button("Predict Masked Place", visible=False)

    txt.submit(bot.run_text, [txt, state], [chatbot, state, outaudio, show_mel, run_button])
    txt.submit(lambda: "", None, txt)
    btn.upload(bot.run_image_or_audio, [btn, state, txt], [chatbot, state, txt, outaudio])
    run_button.click(bot.inpainting, [state, outaudio, show_mel], [chatbot, state, show_mel, outaudio, run_button])
    clear.click(bot.memory.clear)
    clear.click(lambda: [], None, chatbot)
    clear.click(lambda: [], None, state)
    clear.click(lambda: None, None, outaudio)
    clear.click(lambda: None, None, txt)
    clear.click(bot.clear_button, None, run_button)
    clear.click(bot.clear_image, None, show_mel)
    clear.click(bot.clear_audio, None, outaudio)
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
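The new UI wiring hinges on components that start hidden (`visible=False`) and are revealed by handler return values via the Gradio 3.x `gr.*.update(...)` API used throughout the diff. A stripped-down sketch of that pattern; the component names and handler here are illustrative, not taken from AudioGPT:

```python
import gradio as gr

def reveal_audio():
    # Handler return values map positionally onto the listed outputs,
    # so returning update() objects toggles the hidden components on.
    return gr.Audio.update(value="example.wav", visible=True), gr.Button.update(visible=True)

with gr.Blocks() as demo:
    go = gr.Button("Run")
    outaudio = gr.Audio(visible=False)            # hidden until a handler reveals it
    extra = gr.Button("Extra step", visible=False)
    go.click(reveal_audio, None, [outaudio, extra])

demo.launch()
```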