Update audio-chatgpt.py
@@ -2,6 +2,8 @@ import sys
 import os
 sys.path.append(os.path.dirname(os.path.realpath(__file__)))
 sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_sing/DiffSinger'))
+sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text-to-audio/MakeAnAudio'))
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer, CLIPSegProcessor, CLIPSegForImageSegmentation
 import torch
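Note: the two new path entries are what let the bare DiffSinger imports later in this commit (`from utils.hparams import set_hparams`, `from inference.svs.ds_e2e import DiffSingerE2EInfer`) resolve against the submodule checkout. A minimal sketch of the mechanism, assuming the `text_to_sing/DiffSinger` tree sits next to `audio-chatgpt.py` (the `utils` package here is DiffSinger's, not this repo's):

```python
# Sketch: how the sys.path append makes DiffSinger's packages importable.
import os
import sys

repo_root = os.path.dirname(os.path.realpath(__file__))
diffsinger_root = os.path.join(repo_root, 'text_to_sing/DiffSinger')
sys.path.append(diffsinger_root)

# 'utils' and 'inference' now resolve inside the DiffSinger checkout.
from utils.hparams import set_hparams  # DiffSinger's utils/hparams.py
```

Because `append` puts the entry last on `sys.path`, any same-named package earlier on the path shadows DiffSinger's; worth keeping in mind if these imports fail.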
@@ -29,7 +31,7 @@ from pathlib import Path
 from vocoder.hifigan.modules import VocoderHifigan
 from ldm.models.diffusion.ddim import DDIMSampler
 from wav_evaluation.models.CLAPWrapper import CLAPWrapper
-
+from inference.svs.ds_e2e import DiffSingerE2EInfer
 
 AUDIO_CHATGPT_PREFIX = """Audio ChatGPT
 
@@ -224,6 +226,32 @@ class T2A:
         print(f"Processed T2I.run, text: {text}, audio_filename: {audio_filename}")
         return audio_filename
 
+class T2S:
+    def __init__(self, device=None):
+        if device is None:
+            device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        print("Initializing DiffSinger to %s" % device)
+        self.device = device
+        exp_name = 'text_to_sing/DiffSinger/checkpoints/0831_opencpop_ds1000'
+        config = 'text_to_sing/DiffSinger/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml'
+        from utils.hparams import set_hparams
+        from utils.hparams import hparams as hp
+        set_hparams(config=config, exp_name=exp_name, print_hparams=False)
+        self.hp = hp
+        self.pipe = DiffSingerE2EInfer(self.hp)
+
+    def inference(self, inputs):
+        global temp_audio_filename
+        key = ['text', 'notes', 'notes_duration']
+        val = inputs.split(",")
+        inp = {k: v for k, v in zip(key, val)}
+        wav = self.pipe.infer_once(inp)
+        wav *= 32767
+        audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
+        temp_audio_filename = audio_filename
+        soundfile.write(audio_filename, wav.astype(np.int16), self.hp['audio_sample_rate'])
+        print(f"Processed T2S.run, text: {val[0]}, notes: {val[1]}, notes duration: {val[2]}, audio_filename: {audio_filename}")
+        return temp_audio_filename
+
 class ConversationBot:
     def __init__(self):
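For reference, `T2S.inference` zip-pairs one comma-separated input string with the three expected keys before handing the dict to DiffSinger. A quick illustration of just the parsing step (the lyric, note, and duration values below are made up, not from the commit):

```python
# Parsing sketch for T2S.inference: text, |-separated notes, |-separated durations.
inputs = "你好AP,D4 | E4 | rest,0.5 | 0.5 | 0.3"  # hypothetical example input

key = ['text', 'notes', 'notes_duration']
val = inputs.split(",")
inp = {k: v for k, v in zip(key, val)}
print(inp)
# {'text': '你好AP', 'notes': 'D4 | E4 | rest', 'notes_duration': '0.5 | 0.5 | 0.3'}
```

After synthesis, the `wav *= 32767` plus `astype(np.int16)` pair rescales the model's float waveform in [-1, 1] to 16-bit PCM before `soundfile.write`.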
@@ -232,6 +260,7 @@ class ConversationBot:
 
         self.t2i = T2I(device="cuda:0")
         self.t2a = T2A(device="cuda:0")
+        self.t2s = T2S(device="cuda:0")
         self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
         self.tools = [
             Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
@@ -239,7 +268,13 @@ class ConversationBot:
                  "The input to this tool should be a string, representing the text used to generate image. "),
             Tool(name="Generate Audio From User Input Text", func=self.t2a.inference,
                  description="useful for when you want to generate an audio from a user input text and it saved it to a file."
-                             "The input to this tool should be a string, representing the text used to generate audio.")]
+                             "The input to this tool should be a string, representing the text used to generate audio."),
+            Tool(name="Generate singing voice From User Input Text", func=self.t2s.inference,
+                 description="useful for when you want to generate a piece of singing voice from its description."
+                             "The input to this tool should be a comma-separated string of three fields, representing the text sequence and its corresponding note and duration sequences."
+                             "Text sequence consists of Chinese characters (except for SP and AP). "
+                             "Each component of the note and duration sequences should be separated by a | mark."
+                             "It is necessary to ensure that the note and duration sequences are of the same length. ")]
         self.agent = initialize_agent(
             self.tools,
             self.llm,
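To smoke-test the new tool without routing through the LangChain agent, the underlying callable can be driven directly; a sketch under the same assumptions as above (illustrative input string, CUDA device and checkpoints available):

```python
# Direct test of the singing-voice path, bypassing the agent loop.
t2s = T2S(device="cuda:0")
wav_path = t2s.inference("你好AP,D4 | E4 | rest,0.5 | 0.5 | 0.3")
print(wav_path)  # e.g. 'audio/3f9c2a1b.wav' (8-char uuid prefix, per the diff)
```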