Update audio-chatgpt.py
@@ -2,6 +2,8 @@ import sys
 import os
 sys.path.append(os.path.dirname(os.path.realpath(__file__)))
 sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_sing/DiffSinger'))
+sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text-to-audio/MakeAnAudio'))
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer, CLIPSegProcessor, CLIPSegForImageSegmentation
 import torch
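Note: the two new path entries are what let the bare DiffSinger imports later in this commit (`from utils.hparams import set_hparams`, `from inference.svs.ds_e2e import DiffSingerE2EInfer`) resolve against the submodule checkout. A minimal sketch of the mechanism, assuming the `text_to_sing/DiffSinger` tree sits next to `audio-chatgpt.py` (the `utils` package here is DiffSinger's, not this repo's):

```python
# Sketch: how the sys.path append makes DiffSinger's packages importable.
import os
import sys

repo_root = os.path.dirname(os.path.realpath(__file__))
diffsinger_root = os.path.join(repo_root, 'text_to_sing/DiffSinger')
sys.path.append(diffsinger_root)

# 'utils' and 'inference' now resolve inside the DiffSinger checkout.
from utils.hparams import set_hparams  # DiffSinger's utils/hparams.py
```

Because `append` puts the entry last on `sys.path`, any same-named package earlier on the path shadows DiffSinger's; worth keeping in mind if these imports fail.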
@@ -29,7 +31,7 @@ from pathlib import Path
 from vocoder.hifigan.modules import VocoderHifigan
 from ldm.models.diffusion.ddim import DDIMSampler
 from wav_evaluation.models.CLAPWrapper import CLAPWrapper
-
+from inference.svs.ds_e2e import DiffSingerE2EInfer
 
 AUDIO_CHATGPT_PREFIX = """Audio ChatGPT
 
@@ -224,6 +226,32 @@ class T2A:
         print(f"Processed T2I.run, text: {text}, audio_filename: {audio_filename}")
         return audio_filename
 
+class T2S:
+    def __init__(self, device=None):
+        if device is None:
+            device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        print("Initializing DiffSinger to %s" % device)
+        self.device = device
+        exp_name = 'text_to_sing/DiffSinger/checkpoints/0831_opencpop_ds1000'
+        config = 'text_to_sing/DiffSinger/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml'
+        from utils.hparams import set_hparams
+        from utils.hparams import hparams as hp
+        set_hparams(config=config, exp_name=exp_name, print_hparams=False)
+        self.hp = hp
+        self.pipe = DiffSingerE2EInfer(self.hp)
+
+    def inference(self, inputs):
+        global temp_audio_filename
+        key = ['text', 'notes', 'notes_duration']
+        val = inputs.split(",")
+        inp = {k: v for k, v in zip(key, val)}
+        wav = self.pipe.infer_once(inp)
+        wav *= 32767
+        audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
+        temp_audio_filename = audio_filename
+        soundfile.write(audio_filename, wav.astype(np.int16), self.hp['audio_sample_rate'])
+        print(f"Processed T2S.run, text: {val[0]}, notes: {val[1]}, notes duration: {val[2]}, audio_filename: {audio_filename}")
+        return temp_audio_filename
+
 class ConversationBot:
     def __init__(self):
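For reference, `T2S.inference` zip-pairs one comma-separated input string with the three expected keys before handing the dict to DiffSinger. A quick illustration of just the parsing step (the lyric, note, and duration values below are made up, not from the commit):

```python
# Parsing sketch for T2S.inference: text, |-separated notes, |-separated durations.
inputs = "你好AP,D4 | E4 | rest,0.5 | 0.5 | 0.3"  # hypothetical example input

key = ['text', 'notes', 'notes_duration']
val = inputs.split(",")
inp = {k: v for k, v in zip(key, val)}
print(inp)
# {'text': '你好AP', 'notes': 'D4 | E4 | rest', 'notes_duration': '0.5 | 0.5 | 0.3'}
```

After synthesis, the `wav *= 32767` plus `astype(np.int16)` pair rescales the model's float waveform in [-1, 1] to 16-bit PCM before `soundfile.write`.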
@@ -232,6 +260,7 @@ class ConversationBot:
 
         self.t2i = T2I(device="cuda:0")
         self.t2a = T2A(device="cuda:0")
+        self.t2s = T2S(device="cuda:0")
         self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
         self.tools = [
             Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
@@ -239,7 +268,13 @@ class ConversationBot:
                  "The input to this tool should be a string, representing the text used to generate image. "),
             Tool(name="Generate Audio From User Input Text", func=self.t2a.inference,
                  description="useful for when you want to generate an audio from a user input text and it saved it to a file."
-                             "The input to this tool should be a string, representing the text used to generate audio.")]
+                             "The input to this tool should be a string, representing the text used to generate audio."),
+            Tool(name="Generate singing voice From User Input Text", func=self.t2s.inference,
+                 description="useful for when you want to generate a piece of singing voice from its description."
+                             "The input to this tool should be a comma-separated string of three fields, representing the text sequence and its corresponding note and duration sequences."
+                             "Text sequence consists of Chinese characters (except for SP and AP). "
+                             "Each component of the note and duration sequences should be separated by a | mark."
+                             "It is necessary to ensure that the note and duration sequences are of the same length. ")]
         self.agent = initialize_agent(
             self.tools,
             self.llm,
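To smoke-test the new tool without routing through the LangChain agent, the underlying callable can be driven directly; a sketch under the same assumptions as above (illustrative input string, CUDA device and checkpoints available):

```python
# Direct test of the singing-voice path, bypassing the agent loop.
t2s = T2S(device="cuda:0")
wav_path = t2s.inference("你好AP,D4 | E4 | rest,0.5 | 0.5 | 0.3")
print(wav_path)  # e.g. 'audio/3f9c2a1b.wav' (8-char uuid prefix, per the diff)
```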