Mirror of https://github.com/AIGC-Audio/AudioGPT.git (synced 2025-12-16 20:07:58 +01:00)

Merge branch 'main' into hzq

# Conflicts:
#	audio-chatgpt.py
README.md

@@ -5,7 +5,7 @@
 
 ## Capabilities
 
-Up-to-date link: https://eac422a9e2289d6b.gradio.app/
+Up-to-date link: https://93868c7fa583f4b5.gradio.app
 
 Here we list the capabilities of AudioGPT at this time. More supported models and tasks are coming soon. For prompt examples, refer to [asset](assets/README.md).
 
BIN  assets/7cb0d24f.wav  (new file; binary not shown)
assets/README.md

@@ -1,37 +1,19 @@
 # Prompt Example
-## Text-To-Image
-Input Example : Generate an image of a horse<br />
-Output:<br />
-<br />
-## Text-To-Audio
-Input Example : Generate an audio of a piano playing<br />
-Output:<br />
-<br />
-Audio:<br />
-<audio src="b973e878.wav" controls></audio><br />
+## Speech
+### Text-To-Speech
 
-## Text-To-Speech
 Input Example : Generate a speech with text "here we go"<br />
 Output:<br />
 <br />
 Audio:<br />
 <audio src="fd5cf55e.wav" controls></audio><br />
 
-## Text-To-Sing
-Input example : please generate a piece of singing voice. Text sequence is 小酒窝长睫毛AP是你最美的记号. Note sequence is C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4. Note duration sequence is 0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340.<br />
+### Style Transfer Text-To-Speech
+First upload your audio(.wav)<br />
+Input Example : Speak using the voice of this audio. The text is "here we go".<br />
 Output:<br />
 <br />
-Audio:<br />
-<audio src="2bf90e35.wav" controls></audio><br />
-## Image-To-Audio
-First upload your image(.png)<br />
-Input Example : Generate the audio of this image<br />
-Output:<br />
-<br />
-Audio:<br />
-<audio src="5d67d1b9.wav" controls></audio><br />
 
-## Speech Recognition
+### Speech Recognition
 First upload your audio(.wav)<br />
 Audio Example :<br />
 <audio src="Track 4.wav" controls></audio><br />
@@ -39,16 +21,53 @@ Input Example : Generate the text of this speech<br />
 Output:<br />
 <br />
 
-## Audio-To-Text
+## Sing
+### Text-To-Sing
+Input example : please generate a piece of singing voice. Text sequence is 小酒窝长睫毛AP是你最美的记号. Note sequence is C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4. Note duration sequence is 0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340.<br />
+Output:<br />
+<br />
+Audio:<br />
+<audio src="2bf90e35.wav" controls></audio><br />
+
+## Audio
+### Text-To-Audio
+Input Example : Generate an audio of a piano playing<br />
+Output:<br />
+<br />
+Audio:<br />
+<audio src="b973e878.wav" controls></audio><br />
+
+### Audio Inpainting
+First upload your audio(.wav)<br />
+Audio Example :<br />
+<audio src="drums-and-music-playing-with-a-man-speaking.wav" controls></audio><br />
+Input Example : I want to inpaint this audio.<br />
+Output:<br />
+<br />
+Then you can press the "Predict Masked Place" button<br />
+Output:<br />
+<br />
+Output Audio:<br />
+<audio src="7cb0d24f.wav" controls></audio><br />
+
+### Image-To-Audio
+First upload your image(.png)<br />
+Input Example : Generate the audio of this image<br />
+Output:<br />
+<br />
+Audio:<br />
+<audio src="5d67d1b9.wav" controls></audio><br />
+
+### Audio-To-Text
 First upload your audio(.wav)<br />
 Audio Example :<br />
 <audio src="a-group-of-sheep-are-baaing.wav" controls></audio><br />
 Input Example : Please tell me the text description of this audio.<br />
 Output:<br />
 <br />
-## Style Transfer Text-To-Speech
-First upload your audio(.wav)<br />
-Input Example : Speak using the voice of this audio. The text is "here we go".<br />
-Output:<br />
-<br />
 
+## Image
+### Text-To-Image
+Input Example : Generate an image of a horse<br />
+Output:<br />
+<br />
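An aside on the Text-To-Sing prompt format above: the text, note, and note-duration sequences are '|'-separated and must align slot for slot, where a slot may carry a space-separated group (two notes in one slot means two notes sung on one syllable). A small illustrative parser, not part of the repository:

    def parse_sing_prompt(notes: str, durations: str):
        # Each '|'-separated slot may hold one token or a space-separated group.
        note_groups = [slot.split() for slot in notes.split("|")]
        dur_groups = [slot.split() for slot in durations.split("|")]
        assert len(note_groups) == len(dur_groups), "note/duration slots must align"
        return list(zip(note_groups, dur_groups))

    pairs = parse_sing_prompt(
        "C#4/Db4 | A#4/Bb4 F#4/Gb4 | rest",
        "0.407140 | 0.509550 0.183420 | 0.223070",
    )
    print(pairs[1])  # (['A#4/Bb4', 'F#4/Gb4'], ['0.509550', '0.183420'])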
BIN  assets/drums-and-music-playing-with-a-man-speaking.wav  (new file; binary not shown)
BIN  assets/inpaint-1.png  (new file, 645 KiB; image not shown)
BIN  assets/inpaint-2.png  (new file, 184 KiB; image not shown)
audio-chatgpt.py  (161 lines changed)
@@ -6,10 +6,11 @@ sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Neura
 sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio'))
 sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio_img'))
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer, CLIPSegProcessor, CLIPSegForImageSegmentation
+import matplotlib
+import librosa
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 from diffusers import StableDiffusionPipeline
-from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler
 from langchain.agents.initialize import initialize_agent
 from langchain.agents.tools import Tool
 from langchain.chains.conversation.memory import ConversationBufferMemory
@@ -17,20 +18,13 @@ from langchain.llms.openai import OpenAI
 import re
 import uuid
 import soundfile
-from scipy.io import wavfile
-from diffusers import StableDiffusionInpaintPipeline
 from PIL import Image
 import numpy as np
 from omegaconf import OmegaConf
-from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering
-import cv2
-import einops
-from pytorch_lightning import seed_everything
-import random
+from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration
+from einops import repeat
 from ldm.util import instantiate_from_config
 from ldm.data.extract_mel_spectrogram import TRANSFORMS_16000
-from pathlib import Path
-from vocoder.hifigan.modules import VocoderHifigan
 from vocoder.bigvgan.models import VocoderBigVGAN
 from ldm.models.diffusion.ddim import DDIMSampler
 from wav_evaluation.models.CLAPWrapper import CLAPWrapper
@@ -110,6 +104,15 @@ def initialize_model(config, ckpt, device):
     sampler = DDIMSampler(model)
     return sampler
 
+def initialize_model_inpaint(config, ckpt):
+    config = OmegaConf.load(config)
+    model = instantiate_from_config(config.model)
+    model.load_state_dict(torch.load(ckpt,map_location='cpu')["state_dict"], strict=False)
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    model = model.to(device)
+    print(model.device,device,model.cond_stage_model.device)
+    sampler = DDIMSampler(model)
+    return sampler
+
 def select_best_audio(prompt,wav_list):
     clap_model = CLAPWrapper('useful_ckpts/CLAP/CLAP_weights_2022.pth','useful_ckpts/CLAP/config.yml',use_cuda=torch.cuda.is_available())
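The new initialize_model_inpaint follows the standard latent-diffusion loading recipe: parse the YAML config, instantiate the network, load the checkpoint's state dict, and wrap the model in a DDIM sampler. A condensed sketch of the same pattern, with placeholder paths:

    import torch
    from omegaconf import OmegaConf
    from ldm.util import instantiate_from_config
    from ldm.models.diffusion.ddim import DDIMSampler

    def load_ldm_sampler(config_path="inpaint.yaml", ckpt_path="inpaint.ckpt"):
        # Build the network from its YAML description.
        config = OmegaConf.load(config_path)
        model = instantiate_from_config(config.model)
        # strict=False tolerates keys that are absent from this checkpoint.
        state = torch.load(ckpt_path, map_location="cpu")["state_dict"]
        model.load_state_dict(state, strict=False)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        return DDIMSampler(model.to(device))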
@@ -124,6 +127,7 @@ def select_best_audio(prompt,wav_list):
     print(score_list,max_index)
     return wav_list[max_index]
 
+
 class T2I:
     def __init__(self, device):
         print("Initializing T2I to %s" % device)
@@ -348,9 +352,10 @@ class Inpaint:
     def __init__(self, device):
         print("Initializing Make-An-Audio-inpaint to %s" % device)
         self.device = device
-        self.sampler = initialize_model('text_to_audio/Make_An_Audio_inpaint/configs/inpaint/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio_inpaint/useful_ckpts/inpaint7_epoch00047.ckpt')
+        self.sampler = initialize_model_inpaint('text_to_audio/Make_An_Audio_inpaint/configs/inpaint/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio_inpaint/useful_ckpts/inpaint7_epoch00047.ckpt')
         self.vocoder = VocoderBigVGAN('./vocoder/logs/bigv16k53w',device=device)
+        self.cmap_transform = matplotlib.cm.viridis
-    def make_batch_sd(mel, mask, num_samples=1):
+    def make_batch_sd(self, mel, mask, num_samples=1):
 
         mel = torch.from_numpy(mel)[None,None,...].to(dtype=torch.float32)
         mask = torch.from_numpy(mask)[None,None,...].to(dtype=torch.float32)
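For reference, the batch that make_batch_sd hands to the sampler is a single (1, 1, 80, 848) mel/mask pair tiled along the batch axis with einops.repeat. A self-contained sketch of that step, assuming the usual masked_mel = (1 - mask) * mel convention:

    import numpy as np
    import torch
    from einops import repeat

    def make_batch(mel: np.ndarray, mask: np.ndarray, num_samples: int = 1):
        # Add batch and channel axes: (80, 848) -> (1, 1, 80, 848).
        mel = torch.from_numpy(mel)[None, None, ...].float()
        mask = torch.from_numpy(mask)[None, None, ...].float()
        masked_mel = (1 - mask) * mel  # zero out the region to be inpainted
        batch = {"mel": mel, "mask": mask, "masked_mel": masked_mel}
        # Tile along the batch axis so the sampler can draw several candidates.
        return {k: repeat(v, "1 ... -> n ...", n=num_samples) for k, v in batch.items()}

    batch = make_batch(np.zeros((80, 848), np.float32), np.ones((80, 848), np.float32), 2)
    print(batch["mel"].shape)  # torch.Size([2, 1, 80, 848])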
@@ -366,10 +371,11 @@ class Inpaint:
             "masked_mel": repeat(masked_mel.to(device=self.device), "1 ... -> n ...", n=num_samples),
         }
         return batch
-    def gen_mel(input_audio):
-        sr,ori_wav = input_audio
+    def gen_mel(self, input_audio_path):
+        SAMPLE_RATE = 16000
+        sr, ori_wav = wavfile.read(input_audio_path)
+        print("gen_mel")
         print(sr,ori_wav.shape,ori_wav)
 
         ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0  # order='C' means C-contiguous storage; it can be ignored
         if len(ori_wav.shape)==2:# stereo
             ori_wav = librosa.to_mono(ori_wav.T)# gradio load wav shape could be (wav_len,2) but librosa expects (2,wav_len)
@@ -385,12 +391,35 @@ class Inpaint:
 
         mel = TRANSFORMS_16000(input_wav)
         return mel
+    def gen_mel_audio(self, input_audio):
+        SAMPLE_RATE = 16000
+        sr,ori_wav = input_audio
+        print("gen_mel_audio")
+        print(sr,ori_wav.shape,ori_wav)
+
+        ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0  # order='C' means C-contiguous storage; it can be ignored
+        if len(ori_wav.shape)==2:# stereo
+            ori_wav = librosa.to_mono(ori_wav.T)# gradio load wav shape could be (wav_len,2) but librosa expects (2,wav_len)
+        print(sr,ori_wav.shape,ori_wav)
+        ori_wav = librosa.resample(ori_wav,orig_sr = sr,target_sr = SAMPLE_RATE)
+
+        mel_len,hop_size = 848,256
+        input_len = mel_len * hop_size
+        if len(ori_wav) < input_len:
+            input_wav = np.pad(ori_wav,(0,mel_len*hop_size),constant_values=0)
+        else:
+            input_wav = ori_wav[:input_len]
+        mel = TRANSFORMS_16000(input_wav)
+        return mel
-    def show_mel_fn(input_audio):
+    def show_mel_fn(self, input_audio_path):
         crop_len = 500  # the full mel cannot be shown due to gradio's Image bug when using tool='sketch'
-        crop_mel = self.gen_mel(input_audio)[:,:crop_len]
-        color_mel = cmap_transform(crop_mel)
-        return Image.fromarray((color_mel*255).astype(np.uint8))
-    def inpaint(batch, seed, ddim_steps, num_samples=1, W=512, H=512):
+        crop_mel = self.gen_mel(input_audio_path)[:,:crop_len]
+        color_mel = self.cmap_transform(crop_mel)
+        image = Image.fromarray((color_mel*255).astype(np.uint8))
+        image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
+        image.save(image_filename)
+        return image_filename
+    def inpaint(self, batch, seed, ddim_steps, num_samples=1, W=512, H=512):
         model = self.sampler.model
 
         prng = np.random.RandomState(seed)
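Both mel helpers enforce the same invariant: the model always sees an 848-frame, 80-bin spectrogram at 16 kHz with hop size 256, so the waveform is first zero-padded or truncated to mel_len * hop_size samples. A standalone sketch of that pad-or-crop step, substituting librosa's mel transform for the repository's TRANSFORMS_16000:

    import numpy as np
    import librosa

    def fixed_length_mel(wav: np.ndarray, sr: int, mel_len=848, hop=256, target_sr=16000):
        if wav.ndim == 2:                      # stereo -> mono
            wav = librosa.to_mono(wav.T)
        wav = librosa.resample(wav, orig_sr=sr, target_sr=target_sr)
        need = mel_len * hop
        # Zero-pad short clips, truncate long ones: output length is fixed.
        wav = np.pad(wav, (0, max(0, need - len(wav))))[:need]
        mel = librosa.feature.melspectrogram(y=wav, sr=target_sr, n_mels=80, hop_length=hop)
        return mel[:, :mel_len]                # (80, 848)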
@@ -411,7 +440,6 @@ class Inpaint:
         x_samples_ddim = model.decode_first_stage(samples_ddim)
 
 
-        mask = batch["mask"]  # [-1,1]
         mel = torch.clamp((batch["mel"]+1.0)/2.0,min=0.0, max=1.0)
         mask = torch.clamp((batch["mask"]+1.0)/2.0,min=0.0, max=1.0)
         predicted_mel = torch.clamp((x_samples_ddim+1.0)/2.0,min=0.0, max=1.0)
@@ -420,17 +448,19 @@ class Inpaint:
         inapint_wav = self.vocoder.vocode(inpainted)
 
         return inpainted, inapint_wav
-    def predict(input_audio,mel_and_mask,ddim_steps,seed):
-        show_mel = np.array(mel_and_mask['image'].convert("L"))/255  # the displayed mel covers only part of the audio, so the mel is regenerated from the audio
-        mask = np.array(mel_and_mask["mask"].convert("L"))/255
+    def inference(self, input_audio, mel_and_mask, seed = 55, ddim_steps = 100):
+        SAMPLE_RATE = 16000
+        torch.set_grad_enabled(False)
+        mel_img = Image.open(mel_and_mask['image'])
+        mask_img = Image.open(mel_and_mask["mask"])
+        show_mel = np.array(mel_img.convert("L"))/255  # the displayed mel covers only part of the audio, so the mel is regenerated from the audio
+        mask = np.array(mask_img.convert("L"))/255
         mel_bins,mel_len = 80,848
-        input_mel = self.gen_mel(input_audio)[:,:mel_len]  # the displayed mel covers only part of the audio, so the mel is regenerated from the audio
+        input_mel = self.gen_mel_audio(input_audio)[:,:mel_len]  # the displayed mel covers only part of the audio, so the mel is regenerated from the audio
         mask = np.pad(mask,((0,0),(0,mel_len-mask.shape[1])),mode='constant',constant_values=0)  # pad the mask back to the full mel size
         print(mask.shape,input_mel.shape)
         with torch.no_grad():
-            batch = make_batch_sd(input_mel,mask,device,num_samples=1)
+            batch = self.make_batch_sd(input_mel,mask,num_samples=1)
             inpainted,gen_wav = self.inpaint(
                 batch=batch,
                 seed=seed,
@@ -439,11 +469,16 @@ class Inpaint:
                 H=mel_bins, W=mel_len
             )
         inpainted = inpainted[:,:show_mel.shape[1]]
-        color_mel = cmap_transform(inpainted)
+        color_mel = self.cmap_transform(inpainted)
         input_len = int(input_audio[1].shape[0] * SAMPLE_RATE / input_audio[0])
         gen_wav = (gen_wav * 32768).astype(np.int16)[:input_len]
-        return Image.fromarray((color_mel*255).astype(np.uint8)),(SAMPLE_RATE,gen_wav)
+        image = Image.fromarray((color_mel*255).astype(np.uint8))
+        image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
+        image.save(image_filename)
+        audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
+        soundfile.write(audio_filename, gen_wav, samplerate = 16000)
+        return image_filename, audio_filename
 
 class ASR:
     def __init__(self, device):
         print("Initializing Whisper to %s" % device)
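inference no longer returns raw arrays to Gradio; like the other tools in this file it writes its results to disk and returns file paths, using short uuid prefixes for unique, chat-friendly names. The convention in isolation (the os.makedirs calls are added here for self-containment):

    import os
    import uuid
    import numpy as np
    import soundfile
    from PIL import Image

    def save_artifacts(image: Image.Image, wav: np.ndarray, sr: int = 16000):
        os.makedirs("image", exist_ok=True)
        os.makedirs("audio", exist_ok=True)
        # Eight hex chars of a uuid4 keep names unique but readable in chat logs.
        image_path = os.path.join("image", str(uuid.uuid4())[0:8] + ".png")
        audio_path = os.path.join("audio", str(uuid.uuid4())[0:8] + ".wav")
        image.save(image_path)
        soundfile.write(audio_path, wav, samplerate=sr)
        return image_path, audio_path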
@@ -481,6 +516,7 @@ class ConversationBot:
         self.i2a = I2A(device="cuda:1")
         self.a2t = A2T(device="cuda:2")
         self.asr = ASR(device="cuda:1")
+        self.inpaint = Inpaint(device="cuda:0")
         self.tts_ood = TTS_OOD(device="cuda:0")
         self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
         self.tools = [
@@ -513,6 +549,9 @@ class ConversationBot:
             Tool(name="Generate Text From The Audio", func=self.a2t.inference,
                  description="useful for when you want to describe an audio in text, receives audio_path as input."
                              "The input to this tool should be a string, representing the audio_path."),
+            Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
+                 description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input, "
+                             "The input to this tool should be a string, representing the audio_path."),
             Tool(name="Transcribe speech", func=self.asr.inference,
                  description="useful for when you want to know the text corresponding to a human speech, receives audio_path as input."
                              "The input to this tool should be a string, representing the audio_path.")]
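The new entry reuses the langchain Tool pattern of its neighbours: a name the agent can select, the callable to invoke, and a description that doubles as the routing prompt. A generic sketch with a hypothetical handler:

    from langchain.agents.tools import Tool

    def show_mel_fn(audio_path: str) -> str:
        # Hypothetical stand-in: render a spectrogram and return its image path.
        return "image/demo.png"

    inpaint_tool = Tool(
        name="Audio Inpainting",
        func=show_mel_fn,
        description="useful for when you want to inpaint a mel spectrum of an audio. "
                    "The input to this tool should be a string, representing the audio_path.",
    )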
@@ -536,7 +575,7 @@ class ConversationBot:
             response = res['output']
             state = state + [(text, response)]
             print("Outputs:", state)
-            return state, state, None
+            return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
         else:
             tool = res['intermediate_steps'][0][0].tool
             if tool == "Generate Image From User Input Text" or tool == "Generate Text From The Audio" or tool == "Transcribe speech":
@@ -544,13 +583,23 @@ class ConversationBot:
                 response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
                 state = state + [(text, response)]
                 print("Outputs:", state)
-                return state, state, None
+                return state, state, gr.Audio.update(visible=False), gr.Image.update(visible=False), gr.Button.update(visible=False)
+            elif tool == "Audio Inpainting":
+                audio_filename = res['intermediate_steps'][0][0].tool_input
+                image_filename = res['intermediate_steps'][0][1]
+                # self.is_visible(True)
+                print("======>Current memory:\n %s" % self.agent.memory)
+                print(res)
+                response = res['output']
+                state = state + [(text, response)]
+                print("Outputs:", state)
+                return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Image.update(value=image_filename,visible=True), gr.Button.update(visible=True)
             print("======>Current memory:\n %s" % self.agent.memory)
             response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
-            #response = res['output'] + f"<audio src=audio_filename controls=controls></audio>"
+            audio_filename = res['intermediate_steps'][0][1]
             state = state + [(text, response)]
             print("Outputs:", state)
-            return state, state, audio_filename
+            return state, state, gr.Audio.update(value=audio_filename,visible=True), gr.Image.update(visible=False), gr.Button.update(visible=False)
 
     def run_image_or_audio(self, file, state, txt):
         file_type = file.name[-3:]
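Every return in run_text now has to supply one value per wired output component; gr.Audio.update, gr.Image.update, and gr.Button.update (the Gradio 3.x update API used in this commit) toggle the inpainting widgets per branch. A reduced sketch with a hypothetical routing predicate:

    import gradio as gr

    def wants_inpainting(text: str) -> bool:
        # Hypothetical stand-in for the agent's tool choice.
        return "inpaint" in text.lower()

    def on_submit(text, state):
        state = state + [(text, "ok")]
        if wants_inpainting(text):
            # Reveal the sketchable mel image and the "Predict Masked Place" button.
            return (state, state,
                    gr.Audio.update(visible=False),
                    gr.Image.update(visible=True),
                    gr.Button.update(visible=True))
        # Otherwise keep the inpainting widgets hidden.
        return (state, state,
                gr.Audio.update(visible=False),
                gr.Image.update(visible=False),
                gr.Button.update(visible=False))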
@@ -566,10 +615,11 @@ class ConversationBot:
                 "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(audio_filename, description)
             AI_prompt = "Received. "
             self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
+            print("======>Current memory:\n %s" % self.agent.memory)
             #state = state + [(f"<audio src=audio_filename controls=controls></audio>*{audio_filename}*", AI_prompt)]
             state = state + [(f"*{audio_filename}*", AI_prompt)]
             print("Outputs:", state)
-            return state, state, txt + ' ' + audio_filename + ' ', audio_filename
+            return state, state, txt + ' ' + audio_filename + ' ', gr.Audio.update(value=audio_filename,visible=True)
         else:
             print("===============Running run_image =============")
             print("Inputs:", file, state)
@@ -592,7 +642,26 @@ class ConversationBot:
             print("======>Current memory:\n %s" % self.agent.memory)
             state = state + [(f"*{image_filename}*", AI_prompt)]
             print("Outputs:", state)
-            return state, state, txt + ' ' + image_filename + ' ', None
+            return state, state, txt + ' ' + image_filename + ' ', gr.Audio.update(visible=False)
+
+    def inpainting(self, state, audio_filename, image_filename):
+        print("===============Running inpainting =============")
+        print("Inputs:", state)
+        print("======>Previous memory:\n %s" % self.agent.memory)
+        inpaint = Inpaint(device="cuda:0")
+        new_image_filename, new_audio_filename = inpaint.inference(audio_filename, image_filename)
+        AI_prompt = "Here are the predicted audio and the mel spectrum." + f"*{new_audio_filename}*" + f"*{new_image_filename}*"
+        self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
+        print("======>Current memory:\n %s" % self.agent.memory)
+        state = state + [(f"Audio Inpainting", AI_prompt)]
+        print("Outputs:", state)
+        return state, state, gr.Image.update(visible=False), gr.Audio.update(value=new_audio_filename, visible=True), gr.Button.update(visible=False)
+
+    def clear_audio(self):
+        return gr.Audio.update(value=None, visible=False)
+    def clear_image(self):
+        return gr.Image.update(value=None, visible=False)
+    def clear_button(self):
+        return gr.Button.update(visible=False)
 
 if __name__ == '__main__':
@@ -610,12 +679,22 @@ if __name__ == '__main__':
             with gr.Column(scale=0.15, min_width=0):
                 btn = gr.UploadButton("Upload", file_types=["image","audio"])
         with gr.Column():
-            outaudio = gr.Audio()
-        txt.submit(bot.run_text, [txt, state], [chatbot, state, outaudio])
+            outaudio = gr.Audio(visible=False)
+        with gr.Row():
+            with gr.Column():
+                show_mel = gr.Image(type="filepath",tool='sketch',visible=False)
+                run_button = gr.Button("Predict Masked Place",visible=False)
+
+        txt.submit(bot.run_text, [txt, state], [chatbot, state, outaudio, show_mel, run_button])
         txt.submit(lambda: "", None, txt)
         btn.upload(bot.run_image_or_audio, [btn, state, txt], [chatbot, state, txt, outaudio])
+        run_button.click(bot.inpainting, [state, outaudio, show_mel], [chatbot, state, show_mel, outaudio, run_button])
         clear.click(bot.memory.clear)
         clear.click(lambda: [], None, chatbot)
         clear.click(lambda: [], None, state)
-        clear.click(lambda: None, None, outaudio)
+        clear.click(lambda:None, None, txt)
+        clear.click(bot.clear_button, None, run_button)
+        clear.click(bot.clear_image, None, show_mel)
+        clear.click(bot.clear_audio, None, outaudio)
     demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
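Taken together, the UI wiring reduces to: hidden components created up front, txt.submit fanning out to all of them, run_button.click feeding the sketched mask back into bot.inpainting, and clear.click resetting each one. A trimmed, runnable approximation with a stub handler (Gradio 3.x API):

    import gradio as gr

    def run_text(text, state):
        state = state + [(text, "stub reply")]
        return (state, state, gr.Audio.update(visible=False),
                gr.Image.update(visible=False), gr.Button.update(visible=False))

    with gr.Blocks() as demo:
        chatbot = gr.Chatbot()
        state = gr.State([])
        txt = gr.Textbox()
        outaudio = gr.Audio(visible=False)
        show_mel = gr.Image(type="filepath", tool="sketch", visible=False)
        run_button = gr.Button("Predict Masked Place", visible=False)
        txt.submit(run_text, [txt, state], [chatbot, state, outaudio, show_mel, run_button])
        txt.submit(lambda: "", None, txt)

    demo.launch()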