update

2025-12-15 19:37:52 +01:00 · 2023-04-30 23:16:22 +08:00
parent 97a9a2fdc8
commit 9b6d51d192
3 changed files with 15 additions and 13 deletions
--- a/audio-chatgpt.py
+++ b/audio-chatgpt.py
@@ -4,8 +4,6 @@ sys.path.append(os.path.dirname(os.path.realpath(__file__)))
 sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
 sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'NeuralSeq'))
 sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio'))
-sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio_img'))
-sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio_inpaint'))
 sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'audio_detection'))
 sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'mono2binaural'))
 import gradio as gr
@@ -217,7 +215,7 @@ class I2A:
    def __init__(self, device):
        print("Initializing Make-An-Audio-Image to %s" % device)
        self.device = device
-        self.sampler = self._initialize_model('text_to_audio/Make_An_Audio_img/configs/img_to_audio/img2audio_args.yaml', 'text_to_audio/Make_An_Audio_img/useful_ckpts/ta54_epoch=000216.ckpt', device=device)
+        self.sampler = self._initialize_model('text_to_audio/Make_An_Audio/configs/img_to_audio/img2audio_args.yaml', 'text_to_audio/Make_An_Audio/useful_ckpts/ta54_epoch=000216.ckpt', device=device)
        self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio_img/vocoder/logs/bigv16k53w',device=device)

    def _initialize_model(self, config, ckpt, device):
@@ -421,8 +419,8 @@ class Inpaint:
    def __init__(self, device):
        print("Initializing Make-An-Audio-inpaint to %s" % device)
        self.device = device
-        self.sampler = self._initialize_model_inpaint('text_to_audio/Make_An_Audio_inpaint/configs/inpaint/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio_inpaint/useful_ckpts/inpaint7_epoch00047.ckpt')
-        self.vocoder = VocoderBigVGAN('./vocoder/logs/bigv16k53w',device=device)
+        self.sampler = self._initialize_model_inpaint('text_to_audio/Make_An_Audio/configs/inpaint/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio/useful_ckpts/inpaint7_epoch00047.ckpt')
+        self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio/vocoder/logs/bigv16k53',device=device)
        self.cmap_transform = matplotlib.cm.viridis

    def _initialize_model_inpaint(self, config, ckpt):
--- a/download.sh
+++ b/download.sh
@@ -8,14 +8,10 @@ wget -P checkpoints/0109_hifigan_bigpopcs_hop128/ -i https://huggingface.co/spac
 wget -P checkpoints/0102_xiaoma_pe/ -i https://huggingface.co/spaces/Silentlin/DiffSinger/blob/main/checkpoints/0102_xiaoma_pe/config.yaml https://huggingface.co/spaces/Silentlin/DiffSinger/resolve/main/checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt
 #  Text to audio
 cd text_to_audio
-git clone https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio
-git clone https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_img
-git clone https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_inpaint
 wget -P text_to_audio/Make_An_Audio/useful_ckpts/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio/resolve/main/useful_ckpts/ta40multi_epoch=000085.ckpt
 wget -P text_to_audio/Make_An_Audio/useful_ckpts/CLAP/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio/resolve/main/useful_ckpts/CLAP/CLAP_weights_2022.pth
-wget -P text_to_audio/Make_An_Audio_img/useful_ckpts/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_img/resolve/main/useful_ckpts/ta54_epoch=000216.ckpt
-wget -P text_to_audio/Make_An_Audio_img/useful_ckpts/CLAP/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_img/blob/main/useful_ckpts/CLAP/CLAP_weights_2022.pth
-wget -P text_to_audio/Make_An_Audio_inpaint/useful_ckpts/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_inpaint/resolve/main/useful_ckpts/inpaint7_epoch00047.ckpt
+wget -P text_to_audio/Make_An_Audio/useful_ckpts/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_img/resolve/main/useful_ckpts/ta54_epoch=000216.ckpt
+wget -P text_to_audio/Make_An_Audio/useful_ckpts/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_inpaint/resolve/main/useful_ckpts/inpaint7_epoch00047.ckpt
 #  Text to speech
 wget -P checkpoints/GenerSpeech/ -i https://huggingface.co/spaces/Rongjiehuang/GenerSpeech/blob/main/checkpoints/GenerSpeech/config.yaml https://huggingface.co/spaces/Rongjiehuang/GenerSpeech/resolve/main/checkpoints/GenerSpeech/model_ckpt_steps_300000.ckpt
 wget -P checkpoints/trainset_hifigan/ -i https://huggingface.co/spaces/Rongjiehuang/GenerSpeech/blob/main/checkpoints/trainset_hifigan/config.yaml https://huggingface.co/spaces/Rongjiehuang/GenerSpeech/resolve/main/checkpoints/trainset_hifigan/model_ckpt_steps_1000000.ckpt
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu113
 accelerate
 addict==2.4.0
+aiofiles
 albumentations==1.3.0
 appdirs==1.4.4
 basicsr==1.4.2
@@ -10,17 +11,23 @@ diffusers
 einops==0.3.0
 espnet
 espnet_model_zoo
+ffmpeg-python
 g2p-en==2.1.0
 google==3.0.0
 gradio
-h5py==2.8.0
+h5py
 imageio==2.9.0
 imageio-ffmpeg==0.4.2
 invisible-watermark>=0.1.5
+jieba
 kornia==0.6
 langchain==0.0.101
 librosa
+loguru
 miditoolkit==0.1.7
+mmcv==1.5.0
+mmdet==2.23.0 
+mmengine==0.7.2
 moviepy==1.0.3
 numpy==1.23.1
 omegaconf==2.1.1
@@ -56,8 +63,9 @@ torchlibrosa
 torchmetrics==0.6.0
 torchvision==0.13.1
 transformers==4.26.1
-typing-extensions==3.10.0.2
+typing-extensions==4.0.0
 uuid==1.30
 webdataset==0.2.5
 webrtcvad==2.0.10
 yapf==0.32.0
+git+https://github.com/openai/CLIP.git