This commit is contained in:
lmz
2023-04-30 23:16:22 +08:00
parent 97a9a2fdc8
commit 9b6d51d192
3 changed files with 15 additions and 13 deletions

View File

@@ -4,8 +4,6 @@ sys.path.append(os.path.dirname(os.path.realpath(__file__)))
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'NeuralSeq'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio_img'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio_inpaint'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'audio_detection'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'mono2binaural'))
import gradio as gr
@@ -217,7 +215,7 @@ class I2A:
def __init__(self, device):
print("Initializing Make-An-Audio-Image to %s" % device)
self.device = device
self.sampler = self._initialize_model('text_to_audio/Make_An_Audio_img/configs/img_to_audio/img2audio_args.yaml', 'text_to_audio/Make_An_Audio_img/useful_ckpts/ta54_epoch=000216.ckpt', device=device)
self.sampler = self._initialize_model('text_to_audio/Make_An_Audio/configs/img_to_audio/img2audio_args.yaml', 'text_to_audio/Make_An_Audio/useful_ckpts/ta54_epoch=000216.ckpt', device=device)
self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio_img/vocoder/logs/bigv16k53w',device=device)
def _initialize_model(self, config, ckpt, device):
@@ -421,8 +419,8 @@ class Inpaint:
def __init__(self, device):
print("Initializing Make-An-Audio-inpaint to %s" % device)
self.device = device
self.sampler = self._initialize_model_inpaint('text_to_audio/Make_An_Audio_inpaint/configs/inpaint/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio_inpaint/useful_ckpts/inpaint7_epoch00047.ckpt')
self.vocoder = VocoderBigVGAN('./vocoder/logs/bigv16k53w',device=device)
self.sampler = self._initialize_model_inpaint('text_to_audio/Make_An_Audio/configs/inpaint/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio/useful_ckpts/inpaint7_epoch00047.ckpt')
self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio/vocoder/logs/bigv16k53',device=device)
self.cmap_transform = matplotlib.cm.viridis
def _initialize_model_inpaint(self, config, ckpt):

View File

@@ -8,14 +8,10 @@ wget -P checkpoints/0109_hifigan_bigpopcs_hop128/ -i https://huggingface.co/spac
wget -P checkpoints/0102_xiaoma_pe/ -i https://huggingface.co/spaces/Silentlin/DiffSinger/blob/main/checkpoints/0102_xiaoma_pe/config.yaml https://huggingface.co/spaces/Silentlin/DiffSinger/resolve/main/checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt
# Text to audio
cd text_to_audio
git clone https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio
git clone https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_img
git clone https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_inpaint
wget -P text_to_audio/Make_An_Audio/useful_ckpts/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio/resolve/main/useful_ckpts/ta40multi_epoch=000085.ckpt
wget -P text_to_audio/Make_An_Audio/useful_ckpts/CLAP/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio/resolve/main/useful_ckpts/CLAP/CLAP_weights_2022.pth
wget -P text_to_audio/Make_An_Audio_img/useful_ckpts/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_img/resolve/main/useful_ckpts/ta54_epoch=000216.ckpt
wget -P text_to_audio/Make_An_Audio_img/useful_ckpts/CLAP/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_img/blob/main/useful_ckpts/CLAP/CLAP_weights_2022.pth
wget -P text_to_audio/Make_An_Audio_inpaint/useful_ckpts/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_inpaint/resolve/main/useful_ckpts/inpaint7_epoch00047.ckpt
wget -P text_to_audio/Make_An_Audio/useful_ckpts/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_img/resolve/main/useful_ckpts/ta54_epoch=000216.ckpt
wget -P text_to_audio/Make_An_Audio/useful_ckpts/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_inpaint/resolve/main/useful_ckpts/inpaint7_epoch00047.ckpt
# Text to speech
wget -P checkpoints/GenerSpeech/ -i https://huggingface.co/spaces/Rongjiehuang/GenerSpeech/blob/main/checkpoints/GenerSpeech/config.yaml https://huggingface.co/spaces/Rongjiehuang/GenerSpeech/resolve/main/checkpoints/GenerSpeech/model_ckpt_steps_300000.ckpt
wget -P checkpoints/trainset_hifigan/ -i https://huggingface.co/spaces/Rongjiehuang/GenerSpeech/blob/main/checkpoints/trainset_hifigan/config.yaml https://huggingface.co/spaces/Rongjiehuang/GenerSpeech/resolve/main/checkpoints/trainset_hifigan/model_ckpt_steps_1000000.ckpt

View File

@@ -1,6 +1,7 @@
--extra-index-url https://download.pytorch.org/whl/cu113
accelerate
addict==2.4.0
aiofiles
albumentations==1.3.0
appdirs==1.4.4
basicsr==1.4.2
@@ -10,17 +11,23 @@ diffusers
einops==0.3.0
espnet
espnet_model_zoo
ffmpeg-python
g2p-en==2.1.0
google==3.0.0
gradio
h5py==2.8.0
h5py
imageio==2.9.0
imageio-ffmpeg==0.4.2
invisible-watermark>=0.1.5
jieba
kornia==0.6
langchain==0.0.101
librosa
loguru
miditoolkit==0.1.7
mmcv==1.5.0
mmdet==2.23.0
mmengine==0.7.2
moviepy==1.0.3
numpy==1.23.1
omegaconf==2.1.1
@@ -56,8 +63,9 @@ torchlibrosa
torchmetrics==0.6.0
torchvision==0.13.1
transformers==4.26.1
typing-extensions==3.10.0.2
typing-extensions==4.0.0
uuid==1.30
webdataset==0.2.5
webrtcvad==2.0.10
yapf==0.32.0
git+https://github.com/openai/CLIP.git