This commit is contained in:
lmzjms
2023-03-30 21:48:59 +08:00
parent a869dcef4c
commit 7492e97023
6 changed files with 159 additions and 64 deletions

BIN assets/7cb0d24f.wav (new binary file, not shown)

@@ -1,37 +1,19 @@
# Prompt Example
## Text-To-Image
Input Example: Generate an image of a horse<br />
Output:<br />
![](t2i.png)<br />
## Text-To-Audio
Input Example: Generate an audio of a piano playing<br />
Output:<br />
![](t2a.png)<br />
Audio:<br />
<audio src="b973e878.wav" controls></audio><br />
## Text-To-Speech
## Speech
### Text-To-Speech
Input Example: Generate a speech with text "here we go"<br />
Output:<br />
![](tts.png)<br />
Audio:<br />
<audio src="fd5cf55e.wav" controls></audio><br />
## Text-To-Sing
Input Example: please generate a piece of singing voice. Text sequence is 小酒窝长睫毛AP是你最美的记号. Note sequence is C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4. Note duration sequence is 0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340.<br />
### Style Transfer Text-To-Speech
First upload your audio (.wav)<br />
Input Example: Speak using the voice of this audio. The text is "here we go".<br />
Output:<br />
![](t2s.png)<br />
Audio:<br />
<audio src="2bf90e35.wav" controls></audio><br />
## Image-To-Audio
First upload your image (.png)<br />
Input Example: Generate the audio of this image<br />
Output:<br />
![](i2a-2.png)<br />
Audio:<br />
<audio src="5d67d1b9.wav" controls></audio><br />
![](style_transfer_tts.png)<br />
## Speech Recognition
### Speech Recognition
First upload your audio (.wav)<br />
Audio Example:<br />
<audio src="Track 4.wav" controls></audio><br />
@@ -39,16 +21,53 @@ Input Example: Generate the text of this speech<br />
Output:<br />
![](asr.png)<br />
## Audio-To-Text
## Sing
### Text-To-Sing
Input Example: please generate a piece of singing voice. Text sequence is 小酒窝长睫毛AP是你最美的记号. Note sequence is C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4. Note duration sequence is 0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340.<br />
Output:<br />
![](t2s.png)<br />
Audio:<br />
<audio src="2bf90e35.wav" controls></audio><br />
## Audio
### Text-To-Audio
Input Example: Generate an audio of a piano playing<br />
Output:<br />
![](t2a.png)<br />
Audio:<br />
<audio src="b973e878.wav" controls></audio><br />
### Audio Inpainting
First upload your audio (.wav)<br />
Audio Example:<br />
<audio src="drums-and-music-playing-with-a-man-speaking.wav" controls></audio><br />
Input Example: I want to inpaint this audio.<br />
Output:<br />
![](inpaint-1.png)<br />
Then you can press the "Predict Masked Place" button<br />
Output:<br />
![](inpaint-2.png)<br />
Output Audio:<br />
<audio src="7cb0d24f.wav" controls></audio><br />
### Image-To-Audio
First upload your image (.png)<br />
Input Example: Generate the audio of this image<br />
Output:<br />
![](i2a-2.png)<br />
Audio:<br />
<audio src="5d67d1b9.wav" controls></audio><br />
### Audio-To-Text
First upload your audio (.wav)<br />
Audio Example:<br />
<audio src="a-group-of-sheep-are-baaing.wav" controls></audio><br />
Input Example: Please tell me the text description of this audio.<br />
Output:<br />
![](a2i.png)<br />
## Style Transfer Text-To-Speech
First upload your audio (.wav)<br />
Input Example: Speak using the voice of this audio. The text is "here we go".<br />
Output:<br />
![](style_transfer_tts.png)<br />
## Image
### Text-To-Image
Input Example: Generate an image of a horse<br />
Output:<br />
![](t2i.png)<br />

Binary file not shown.

BIN assets/inpaint-1.png (new binary file, not shown; 645 KiB)

BIN assets/inpaint-2.png (new binary file, not shown; 184 KiB)

@@ -6,6 +6,8 @@ sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio_img'))
import gradio as gr
import matplotlib
import librosa
from transformers import AutoModelForCausalLM, AutoTokenizer, CLIPSegProcessor, CLIPSegForImageSegmentation
import torch
from diffusers import StableDiffusionPipeline
@@ -25,6 +27,7 @@ from omegaconf import OmegaConf
from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering
import cv2
import einops
from einops import repeat
from pytorch_lightning import seed_everything
import random
from ldm.util import instantiate_from_config
@@ -112,7 +115,15 @@ def initialize_model(config, ckpt, device):
sampler = DDIMSampler(model)
return sampler
def initialize_model_inpaint(config, ckpt):
config = OmegaConf.load(config)
model = instantiate_from_config(config.model)
model.load_state_dict(torch.load(ckpt,map_location='cpu')["state_dict"], strict=False)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)
print(model.device,device,model.cond_stage_model.device)
sampler = DDIMSampler(model)
return sampler
def select_best_audio(prompt,wav_list):
clap_model = CLAPWrapper('useful_ckpts/CLAP/CLAP_weights_2022.pth','useful_ckpts/CLAP/config.yml',use_cuda=torch.cuda.is_available())
text_embeddings = clap_model.get_text_embeddings([prompt])
@@ -127,7 +138,6 @@ def select_best_audio(prompt,wav_list):
return wav_list[max_index]
class T2I:
def __init__(self, device):
print("Initializing T2I to %s" % device)
@@ -342,9 +352,10 @@ class Inpaint:
def __init__(self, device):
print("Initializing Make-An-Audio-inpaint to %s" % device)
self.device = device
self.sampler = initialize_model('text_to_audio/Make_An_Audio_inpaint/configs/inpaint/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio_inpaint/useful_ckpts/inpaint7_epoch00047.ckpt')
self.sampler = initialize_model_inpaint('text_to_audio/Make_An_Audio_inpaint/configs/inpaint/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio_inpaint/useful_ckpts/inpaint7_epoch00047.ckpt')
self.vocoder = VocoderBigVGAN('./vocoder/logs/bigv16k53w',device=device)
def make_batch_sd(mel, mask, num_samples=1):
self.cmap_transform = matplotlib.cm.viridis
def make_batch_sd(self, mel, mask, num_samples=1):
mel = torch.from_numpy(mel)[None,None,...].to(dtype=torch.float32)
mask = torch.from_numpy(mask)[None,None,...].to(dtype=torch.float32)
@@ -360,10 +371,11 @@ class Inpaint:
"masked_mel": repeat(masked_mel.to(device=self.device), "1 ... -> n ...", n=num_samples),
}
return batch
def gen_mel(input_audio):
sr,ori_wav = input_audio
def gen_mel(self, input_audio_path):
SAMPLE_RATE = 16000
sr, ori_wav = wavfile.read(input_audio_path)
print("gen_mel")
print(sr,ori_wav.shape,ori_wav)
ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0  # order='C' makes a C-contiguous copy; scale int16 PCM to [-1, 1)
if len(ori_wav.shape)==2:  # stereo
ori_wav = librosa.to_mono(ori_wav.T)  # gradio loads wav as (wav_len, 2) but librosa expects (2, wav_len)
@@ -379,12 +391,35 @@ class Inpaint:
mel = TRANSFORMS_16000(input_wav)
return mel
def show_mel_fn(input_audio):
def gen_mel_audio(self, input_audio):
SAMPLE_RATE = 16000
sr,ori_wav = input_audio
print("gen_mel_audio")
print(sr,ori_wav.shape,ori_wav)
ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0  # order='C' makes a C-contiguous copy; scale int16 PCM to [-1, 1)
if len(ori_wav.shape)==2:  # stereo
ori_wav = librosa.to_mono(ori_wav.T)  # gradio loads wav as (wav_len, 2) but librosa expects (2, wav_len)
print(sr,ori_wav.shape,ori_wav)
ori_wav = librosa.resample(ori_wav,orig_sr = sr,target_sr = SAMPLE_RATE)
mel_len,hop_size = 848,256
input_len = mel_len * hop_size
if len(ori_wav) < input_len:
input_wav = np.pad(ori_wav,(0,mel_len*hop_size),constant_values=0)
else:
input_wav = ori_wav[:input_len]
mel = TRANSFORMS_16000(input_wav)
return mel
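# Note on the fixed geometry above: mel_len * hop_size = 848 * 256 = 217088 samples,
# i.e. about 13.6 s of 16 kHz audio; shorter clips are zero-padded and longer ones are cropped.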
def show_mel_fn(self, input_audio_path):
crop_len = 500 # the full mel cannot be shown due to a Gradio Image bug when tool='sketch' is used
crop_mel = self.gen_mel(input_audio)[:,:crop_len]
color_mel = cmap_transform(crop_mel)
return Image.fromarray((color_mel*255).astype(np.uint8))
def inpaint(batch, seed, ddim_steps, num_samples=1, W=512, H=512):
crop_mel = self.gen_mel(input_audio_path)[:,:crop_len]
color_mel = self.cmap_transform(crop_mel)
image = Image.fromarray((color_mel*255).astype(np.uint8))
image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
image.save(image_filename)
return image_filename
def inpaint(self, batch, seed, ddim_steps, num_samples=1, W=512, H=512):
model = self.sampler.model
prng = np.random.RandomState(seed)
@@ -414,17 +449,19 @@ class Inpaint:
inpaint_wav = self.vocoder.vocode(inpainted)
return inpainted, inpaint_wav
def predict(input_audio,mel_and_mask,ddim_steps,seed):
show_mel = np.array(mel_and_mask['image'].convert("L"))/255  # only part of the mel is displayed, so the full mel is regenerated from the audio
mask = np.array(mel_and_mask["mask"].convert("L"))/255
def inference(self, input_audio, mel_and_mask, seed = 55, ddim_steps = 100):
SAMPLE_RATE = 16000
torch.set_grad_enabled(False)
mel_img = Image.open(mel_and_mask['image'])
mask_img = Image.open(mel_and_mask["mask"])
show_mel = np.array(mel_img.convert("L"))/255  # only part of the mel is displayed, so the full mel is regenerated from the audio
mask = np.array(mask_img.convert("L"))/255
mel_bins,mel_len = 80,848
input_mel = self.gen_mel(input_audio)[:,:mel_len]  # only part of the mel is displayed, so regenerate the full mel from the audio
input_mel = self.gen_mel_audio(input_audio)[:,:mel_len]  # only part of the mel is displayed, so regenerate the full mel from the audio
mask = np.pad(mask,((0,0),(0,mel_len-mask.shape[1])),mode='constant',constant_values=0)  # pad the mask back to the full mel width
print(mask.shape,input_mel.shape)
with torch.no_grad():
batch = make_batch_sd(input_mel,mask,device,num_samples=1)
batch = self.make_batch_sd(input_mel,mask,num_samples=1)
inpainted,gen_wav = self.inpaint(
batch=batch,
seed=seed,
@@ -433,10 +470,15 @@ class Inpaint:
H=mel_bins, W=mel_len
)
inpainted = inpainted[:,:show_mel.shape[1]]
color_mel = cmap_transform(inpainted)
color_mel = self.cmap_transform(inpainted)
input_len = int(input_audio[1].shape[0] * SAMPLE_RATE / input_audio[0])
gen_wav = (gen_wav * 32768).astype(np.int16)[:input_len]
return Image.fromarray((color_mel*255).astype(np.uint8)),(SAMPLE_RATE,gen_wav)
image = Image.fromarray((color_mel*255).astype(np.uint8))
image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
image.save(image_filename)
audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
soundfile.write(audio_filename, gen_wav, samplerate = 16000)
return image_filename, audio_filename
class ASR:
def __init__(self, device):
@@ -474,7 +516,8 @@ class ConversationBot:
self.i2a = I2A(device="cuda:1")
self.a2t = A2T(device="cuda:2")
self.asr = ASR(device="cuda:1")
self.tts_ood = TTS_OOD(device="cuda:0")
self.inpaint = Inpaint(device="cuda:0")
#self.tts_ood = TTS_OOD(device="cuda:0")
self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
self.tools = [
Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
@@ -486,11 +529,11 @@ class ConversationBot:
Tool(name="Generate Audio From User Input Text", func=self.t2a.inference,
description="useful for when you want to generate an audio from a user input text and it saved it to a file."
"The input to this tool should be a string, representing the text used to generate audio."),
Tool(
name="Generate human speech with style derived from a speech reference and user input text and save it to a file", func= self.tts_ood.inference,
description="useful for when you want to generate speech samples with styles (e.g., timbre, emotion, and prosody) derived from a reference custom voice."
"Like: Generate a speech with style transferred from this voice. The text is xxx., or speak using the voice of this audio. The text is xxx."
"The input to this tool should be a comma seperated string of two, representing reference audio path and input text."),
# Tool(
# name="Generate human speech with style derived from a speech reference and user input text and save it to a file", func= self.tts_ood.inference,
# description="useful for when you want to generate speech samples with styles (e.g., timbre, emotion, and prosody) derived from a reference custom voice."
# "Like: Generate a speech with style transferred from this voice. The text is xxx., or speak using the voice of this audio. The text is xxx."
# "The input to this tool should be a comma seperated string of two, representing reference audio path and input text."),
Tool(name="Generate singing voice From User Input Text, Note and Duration Sequence", func= self.t2s.inference,
description="useful for when you want to generate a piece of singing voice (Optional: from User Input Text, Note and Duration Sequence) and save it to a file."
"If Like: Generate a piece of singing voice, the input to this tool should be \"\" since there is no User Input Text, Note and Duration Sequence ."
@@ -506,6 +549,9 @@ class ConversationBot:
Tool(name="Generate Text From The Audio", func=self.a2t.inference,
description="useful for when you want to describe an audio in text, receives audio_path as input."
"The input to this tool should be a string, representing the audio_path."),
Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input, "
"The input to this tool should be a string, representing the audio_path."),
Tool(name="Transcribe speech", func=self.asr.inference,
description="useful for when you want to know the text corresponding to a human speech, receives audio_path as input."
"The input to this tool should be a string, representing the audio_path.")]
@@ -529,7 +575,7 @@ class ConversationBot:
response = res['output']
state = state + [(text, response)]
print("Outputs:", state)
return state, state, None
return state, state, None, None
else:
tool = res['intermediate_steps'][0][0].tool
if tool == "Generate Image From User Input Text" or tool == "Generate Text From The Audio" or tool == "Transcribe speech":
@@ -537,13 +583,22 @@ class ConversationBot:
response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
state = state + [(text, response)]
print("Outputs:", state)
return state, state, None
return state, state, None, None
elif tool == "Audio Inpainting":
audio_filename = res['intermediate_steps'][0][0].tool_input
image_filename = res['intermediate_steps'][0][1]
print("======>Current memory:\n %s" % self.agent.memory)
response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
#response = res['output'] + f"<audio src=audio_filename controls=controls></audio>"
print(res)
response = res['output']
state = state + [(text, response)]
print("Outputs:", state)
return state, state, audio_filename
return state, state, audio_filename, image_filename
print("======>Current memory:\n %s" % self.agent.memory)
response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
audio_filename = res['intermediate_steps'][0][1]
state = state + [(text, response)]
print("Outputs:", state)
return state, state, audio_filename, None
def run_image_or_audio(self, file, state, txt):
file_type = file.name[-3:]
@@ -559,6 +614,7 @@ class ConversationBot:
"rather than directly imagine from my description. If you understand, say \"Received\". \n".format(audio_filename, description)
AI_prompt = "Received. "
self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
print("======>Current memory:\n %s" % self.agent.memory)
#state = state + [(f"<audio src=audio_filename controls=controls></audio>*{audio_filename}*", AI_prompt)]
state = state + [(f"*{audio_filename}*", AI_prompt)]
print("Outputs:", state)
@@ -587,6 +643,19 @@ class ConversationBot:
print("Outputs:", state)
return state, state, txt + ' ' + image_filename + ' ', None
def inpainting(self, state, audio_filename, image_filename):
print("===============Running inpainting =============")
print("Inputs:", state)
print("======>Previous memory:\n %s" % self.agent.memory)
inpaint = Inpaint(device="cuda:0")
new_image_filename, new_audio_filename = inpaint.inference(audio_filename, image_filename)
AI_prompt = "Here are the predict audio and the mel spectrum." + f"*{new_audio_filename}*" + f"![](/file={new_image_filename})*{new_image_filename}*"
self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
print("======>Current memory:\n %s" % self.agent.memory)
state = state + [(f"Audio Inpainting", AI_prompt)]
print("Outputs:", state)
return state, state, None, new_audio_filename
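# Design note: __init__ already builds self.inpaint on cuda:0, so this handler could reuse it
# instead of reloading the checkpoint on every click, e.g.:
#     new_image_filename, new_audio_filename = self.inpaint.inference(audio_filename, image_filename)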
if __name__ == '__main__':
bot = ConversationBot()
@@ -604,11 +673,18 @@ if __name__ == '__main__':
btn = gr.UploadButton("Upload", file_types=["image","audio"])
with gr.Column():
outaudio = gr.Audio()
txt.submit(bot.run_text, [txt, state], [chatbot, state, outaudio])
with gr.Row():
with gr.Column():
show_mel = gr.Image(type="filepath",tool='sketch')
run_button = gr.Button("Predict Masked Place")
txt.submit(bot.run_text, [txt, state], [chatbot, state, outaudio, show_mel])
txt.submit(lambda: "", None, txt)
btn.upload(bot.run_image_or_audio, [btn, state, txt], [chatbot, state, txt, outaudio])
run_button.click(bot.inpainting, [state, outaudio, show_mel], [chatbot, state, show_mel, outaudio])
clear.click(bot.memory.clear)
clear.click(lambda: [], None, chatbot)
clear.click(lambda: [], None, state)
clear.click(lambda: None, None, show_mel)
clear.click(lambda: None, None, outaudio)
demo.launch(server_name="0.0.0.0", server_port=7860, share=True)