From 9b6d51d192f3cda270b44b37e5864727576854cd Mon Sep 17 00:00:00 2001 From: lmz <1352359183@qq.com> Date: Sun, 30 Apr 2023 23:16:22 +0800 Subject: [PATCH] update --- audio-chatgpt.py | 10 ++++------ download.sh | 8 ++------ requirements.txt | 12 ++++++++++-- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/audio-chatgpt.py b/audio-chatgpt.py index 7a45383..faaa4b1 100644 --- a/audio-chatgpt.py +++ b/audio-chatgpt.py @@ -4,8 +4,6 @@ sys.path.append(os.path.dirname(os.path.realpath(__file__))) sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'NeuralSeq')) sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio')) -sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio_img')) -sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio_inpaint')) sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'audio_detection')) sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'mono2binaural')) import gradio as gr @@ -217,7 +215,7 @@ class I2A: def __init__(self, device): print("Initializing Make-An-Audio-Image to %s" % device) self.device = device - self.sampler = self._initialize_model('text_to_audio/Make_An_Audio_img/configs/img_to_audio/img2audio_args.yaml', 'text_to_audio/Make_An_Audio_img/useful_ckpts/ta54_epoch=000216.ckpt', device=device) + self.sampler = self._initialize_model('text_to_audio/Make_An_Audio/configs/img_to_audio/img2audio_args.yaml', 'text_to_audio/Make_An_Audio/useful_ckpts/ta54_epoch=000216.ckpt', device=device) - self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio_img/vocoder/logs/bigv16k53w',device=device) + self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio/vocoder/logs/bigv16k53w',device=device) def _initialize_model(self, config, ckpt, device): @@ -421,8 +419,8 @@ class Inpaint: def __init__(self, device): print("Initializing 
Make-An-Audio-inpaint to %s" % device) self.device = device - self.sampler = self._initialize_model_inpaint('text_to_audio/Make_An_Audio_inpaint/configs/inpaint/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio_inpaint/useful_ckpts/inpaint7_epoch00047.ckpt') - self.vocoder = VocoderBigVGAN('./vocoder/logs/bigv16k53w',device=device) + self.sampler = self._initialize_model_inpaint('text_to_audio/Make_An_Audio/configs/inpaint/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio/useful_ckpts/inpaint7_epoch00047.ckpt') + self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio/vocoder/logs/bigv16k53w',device=device) self.cmap_transform = matplotlib.cm.viridis def _initialize_model_inpaint(self, config, ckpt): diff --git a/download.sh b/download.sh index dc0a627..c23cc9f 100644 --- a/download.sh +++ b/download.sh @@ -8,14 +8,10 @@ wget -P checkpoints/0109_hifigan_bigpopcs_hop128/ -i https://huggingface.co/spac wget -P checkpoints/0102_xiaoma_pe/ -i https://huggingface.co/spaces/Silentlin/DiffSinger/blob/main/checkpoints/0102_xiaoma_pe/config.yaml https://huggingface.co/spaces/Silentlin/DiffSinger/resolve/main/checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt # Text to audio cd text_to_audio -git clone https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio -git clone https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_img -git clone https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_inpaint wget -P text_to_audio/Make_An_Audio/useful_ckpts/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio/resolve/main/useful_ckpts/ta40multi_epoch=000085.ckpt wget -P text_to_audio/Make_An_Audio/useful_ckpts/CLAP/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio/resolve/main/useful_ckpts/CLAP/CLAP_weights_2022.pth -wget -P text_to_audio/Make_An_Audio_img/useful_ckpts/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_img/resolve/main/useful_ckpts/ta54_epoch=000216.ckpt -wget -P 
text_to_audio/Make_An_Audio_img/useful_ckpts/CLAP/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_img/blob/main/useful_ckpts/CLAP/CLAP_weights_2022.pth -wget -P text_to_audio/Make_An_Audio_inpaint/useful_ckpts/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_inpaint/resolve/main/useful_ckpts/inpaint7_epoch00047.ckpt +wget -P text_to_audio/Make_An_Audio/useful_ckpts/ https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_img/resolve/main/useful_ckpts/ta54_epoch=000216.ckpt +wget -P text_to_audio/Make_An_Audio/useful_ckpts/ https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_inpaint/resolve/main/useful_ckpts/inpaint7_epoch00047.ckpt # Text to speech wget -P checkpoints/GenerSpeech/ -i https://huggingface.co/spaces/Rongjiehuang/GenerSpeech/blob/main/checkpoints/GenerSpeech/config.yaml https://huggingface.co/spaces/Rongjiehuang/GenerSpeech/resolve/main/checkpoints/GenerSpeech/model_ckpt_steps_300000.ckpt wget -P checkpoints/trainset_hifigan/ -i https://huggingface.co/spaces/Rongjiehuang/GenerSpeech/blob/main/checkpoints/trainset_hifigan/config.yaml https://huggingface.co/spaces/Rongjiehuang/GenerSpeech/resolve/main/checkpoints/trainset_hifigan/model_ckpt_steps_1000000.ckpt diff --git a/requirements.txt b/requirements.txt index f884d53..e5a8d4c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ --extra-index-url https://download.pytorch.org/whl/cu113 accelerate addict==2.4.0 +aiofiles albumentations==1.3.0 appdirs==1.4.4 basicsr==1.4.2 @@ -10,17 +11,23 @@ diffusers einops==0.3.0 espnet espnet_model_zoo +ffmpeg-python g2p-en==2.1.0 google==3.0.0 gradio -h5py==2.8.0 +h5py imageio==2.9.0 imageio-ffmpeg==0.4.2 invisible-watermark>=0.1.5 +jieba kornia==0.6 langchain==0.0.101 librosa +loguru miditoolkit==0.1.7 +mmcv==1.5.0 +mmdet==2.23.0 +mmengine==0.7.2 moviepy==1.0.3 numpy==1.23.1 omegaconf==2.1.1 @@ -56,8 +63,9 @@ torchlibrosa torchmetrics==0.6.0 torchvision==0.13.1 transformers==4.26.1 
-typing-extensions==3.10.0.2 +typing-extensions==4.0.0 uuid==1.30 webdataset==0.2.5 webrtcvad==2.0.10 yapf==0.32.0 +git+https://github.com/openai/CLIP.git \ No newline at end of file