Mirror of https://github.com/AIGC-Audio/AudioGPT.git
@@ -4,8 +4,6 @@ sys.path.append(os.path.dirname(os.path.realpath(__file__)))
 sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
 sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'NeuralSeq'))
 sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio'))
-sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio_img'))
-sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio_inpaint'))
 sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'audio_detection'))
 sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'mono2binaural'))
 import gradio as gr
@@ -186,7 +184,7 @@ class T2A:
 
     def select_best_audio(self, prompt, wav_list):
         from wav_evaluation.models.CLAPWrapper import CLAPWrapper
-        clap_model = CLAPWrapper('useful_ckpts/CLAP/CLAP_weights_2022.pth', 'useful_ckpts/CLAP/config.yml',
+        clap_model = CLAPWrapper('text_to_audio/Make_An_Audio/useful_ckpts/CLAP/CLAP_weights_2022.pth', 'text_to_audio/Make_An_Audio/useful_ckpts/CLAP/config.yml',
                                  use_cuda=torch.cuda.is_available())
         text_embeddings = clap_model.get_text_embeddings([prompt])
         score_list = []
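
The hunk above shows only the start of select_best_audio. A minimal sketch of how the CLAP-based ranking typically finishes is given below; the get_audio_embeddings call and the cosine-similarity scoring are assumptions for illustration, not lines taken from this diff.

# Hypothetical continuation of T2A.select_best_audio (not part of the hunk above).
import numpy as np
import torch

def rank_candidates(clap_model, text_embeddings, wav_list):
    # Embed each candidate wav with CLAP and score it against the prompt embedding.
    score_list = []
    for wav_path in wav_list:
        audio_embeddings = clap_model.get_audio_embeddings([wav_path])  # assumed CLAPWrapper API
        score = torch.cosine_similarity(text_embeddings, audio_embeddings, dim=-1)
        score_list.append(score.squeeze().item())
    # Return the wav whose embedding is closest to the text embedding.
    return wav_list[int(np.argmax(score_list))]
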
@@ -217,8 +215,8 @@ class I2A:
     def __init__(self, device):
         print("Initializing Make-An-Audio-Image to %s" % device)
         self.device = device
-        self.sampler = self._initialize_model('text_to_audio/Make_An_Audio_img/configs/img_to_audio/img2audio_args.yaml', 'text_to_audio/Make_An_Audio_img/useful_ckpts/ta54_epoch=000216.ckpt', device=device)
-        self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio_img/vocoder/logs/bigv16k53w',device=device)
+        self.sampler = self._initialize_model('text_to_audio/Make_An_Audio/configs/img_to_audio/img2audio_args.yaml', 'text_to_audio/Make_An_Audio/useful_ckpts/ta54_epoch=000216.ckpt', device=device)
+        self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio/vocoder/logs/bigv16k53w',device=device)
 
     def _initialize_model(self, config, ckpt, device):
         config = OmegaConf.load(config)
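
Only the first line of _initialize_model appears in this hunk. The sketch below shows the usual latent-diffusion loading pattern such a helper follows; instantiate_from_config, the 'state_dict' checkpoint key, and the DDIMSampler wrapper are assumed from the ldm codebase rather than shown in the diff.

# Rough sketch of a config/checkpoint loader in the ldm style (assumptions noted above).
import torch
from omegaconf import OmegaConf
from ldm.util import instantiate_from_config
from ldm.models.diffusion.ddim import DDIMSampler

def initialize_model(config_path, ckpt_path, device):
    config = OmegaConf.load(config_path)            # e.g. the img2audio_args.yaml added below
    model = instantiate_from_config(config.model)   # builds the model named by 'target' with 'params'
    model.load_state_dict(torch.load(ckpt_path, map_location='cpu')['state_dict'], strict=False)
    model = model.to(device).eval()
    return DDIMSampler(model)                       # sampler object comparable to self.sampler
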
@@ -421,8 +419,8 @@ class Inpaint:
     def __init__(self, device):
         print("Initializing Make-An-Audio-inpaint to %s" % device)
         self.device = device
-        self.sampler = self._initialize_model_inpaint('text_to_audio/Make_An_Audio_inpaint/configs/inpaint/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio_inpaint/useful_ckpts/inpaint7_epoch00047.ckpt')
-        self.vocoder = VocoderBigVGAN('./vocoder/logs/bigv16k53w',device=device)
+        self.sampler = self._initialize_model_inpaint('text_to_audio/Make_An_Audio/configs/inpaint/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio/useful_ckpts/inpaint7_epoch00047.ckpt')
+        self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio/vocoder/logs/bigv16k53w',device=device)
         self.cmap_transform = matplotlib.cm.viridis
 
     def _initialize_model_inpaint(self, config, ckpt):
@@ -8,14 +8,10 @@ wget -P checkpoints/0109_hifigan_bigpopcs_hop128/ -i https://huggingface.co/spac
 wget -P checkpoints/0102_xiaoma_pe/ -i https://huggingface.co/spaces/Silentlin/DiffSinger/blob/main/checkpoints/0102_xiaoma_pe/config.yaml https://huggingface.co/spaces/Silentlin/DiffSinger/resolve/main/checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt
 # Text to audio
 cd text_to_audio
-git clone https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio
-git clone https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_img
-git clone https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_inpaint
 wget -P text_to_audio/Make_An_Audio/useful_ckpts/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio/resolve/main/useful_ckpts/ta40multi_epoch=000085.ckpt
 wget -P text_to_audio/Make_An_Audio/useful_ckpts/CLAP/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio/resolve/main/useful_ckpts/CLAP/CLAP_weights_2022.pth
-wget -P text_to_audio/Make_An_Audio_img/useful_ckpts/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_img/resolve/main/useful_ckpts/ta54_epoch=000216.ckpt
-wget -P text_to_audio/Make_An_Audio_img/useful_ckpts/CLAP/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_img/blob/main/useful_ckpts/CLAP/CLAP_weights_2022.pth
-wget -P text_to_audio/Make_An_Audio_inpaint/useful_ckpts/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_inpaint/resolve/main/useful_ckpts/inpaint7_epoch00047.ckpt
+wget -P text_to_audio/Make_An_Audio/useful_ckpts/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_img/resolve/main/useful_ckpts/ta54_epoch=000216.ckpt
+wget -P text_to_audio/Make_An_Audio/useful_ckpts/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_inpaint/resolve/main/useful_ckpts/inpaint7_epoch00047.ckpt
 # Text to speech
 wget -P checkpoints/GenerSpeech/ -i https://huggingface.co/spaces/Rongjiehuang/GenerSpeech/blob/main/checkpoints/GenerSpeech/config.yaml https://huggingface.co/spaces/Rongjiehuang/GenerSpeech/resolve/main/checkpoints/GenerSpeech/model_ckpt_steps_300000.ckpt
 wget -P checkpoints/trainset_hifigan/ -i https://huggingface.co/spaces/Rongjiehuang/GenerSpeech/blob/main/checkpoints/trainset_hifigan/config.yaml https://huggingface.co/spaces/Rongjiehuang/GenerSpeech/resolve/main/checkpoints/trainset_hifigan/model_ckpt_steps_1000000.ckpt
@@ -1,6 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu113
 accelerate
 addict==2.4.0
+aiofiles
 albumentations==1.3.0
 appdirs==1.4.4
 basicsr==1.4.2
@@ -10,17 +11,23 @@ diffusers
 einops==0.3.0
 espnet
 espnet_model_zoo
+ffmpeg-python
 g2p-en==2.1.0
 google==3.0.0
 gradio
-h5py==2.8.0
+h5py
 imageio==2.9.0
 imageio-ffmpeg==0.4.2
 invisible-watermark>=0.1.5
+jieba
 kornia==0.6
 langchain==0.0.101
 librosa
+loguru
 miditoolkit==0.1.7
+mmcv==1.5.0
+mmdet==2.23.0
+mmengine==0.7.2
 moviepy==1.0.3
 numpy==1.23.1
 omegaconf==2.1.1
@@ -56,8 +63,9 @@ torchlibrosa
 torchmetrics==0.6.0
 torchvision==0.13.1
 transformers==4.26.1
-typing-extensions==3.10.0.2
+typing-extensions==4.0.0
 uuid==1.30
 webdataset==0.2.5
 webrtcvad==2.0.10
 yapf==0.32.0
+git+https://github.com/openai/CLIP.git
@@ -0,0 +1,77 @@
+model:
+  base_learning_rate: 1.0e-05
+  target: ldm.models.diffusion.ddpm_audio.LatentDiffusion_audio
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: image
+    cond_stage_key: caption
+    image_size: 32 # unused
+    mel_dim: 10 # 80 // 2^3
+    mel_length: 78 # 624 // 2^3
+    channels: 4
+    cond_stage_trainable: false
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_by_std: True
+    use_ema: False
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [10000]
+        cycle_lengths: [10000000000000]
+        f_start: [1.e-6]
+        f_max: [1.]
+        f_min: [1.]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.custom_openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 256
+        attention_resolutions:
+        - 1
+        - 2
+        num_res_blocks: 2
+        channel_mult: # num_down = len(ch_mult)-1
+        - 1
+        - 2
+        num_head_channels: 32
+        use_spatial_transformer: true
+        transformer_depth: 1
+        context_dim: 1024
+        use_context_project: false
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 848
+          in_channels: 1
+          out_ch: 1
+          ch: 128
+          ch_mult: [ 1, 2, 2, 4 ] # num_down = len(ch_mult)-1
+          num_res_blocks: 2
+          attn_resolutions: [106, 212]
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenGlobalNormOpenCLIPEmbedder
+      params:
+        freeze: True
+        delvisual: False
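
The mel_dim and mel_length comments in this config follow from the autoencoder's downsampling: ch_mult has length 4, so the encoder downsamples 3 times (a factor of 2^3 = 8), turning an 80 x 624 mel spectrogram into a 4-channel 10 x 78 latent. A small sanity check of that arithmetic:

# Sanity check of the latent-shape comments in the config above.
n_mels, n_frames = 80, 624          # mel bins x frames of the input spectrogram
ch_mult = [1, 2, 2, 4]
num_down = len(ch_mult) - 1         # 3 downsampling stages
factor = 2 ** num_down              # 8
z_channels = 4
latent = (z_channels, n_mels // factor, n_frames // factor)
print(latent)                       # (4, 10, 78) -> matches mel_dim: 10, mel_length: 78
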
@@ -0,0 +1,68 @@
+model:
+  base_learning_rate: 1.0e-05
+  target: ldm.models.diffusion.ddpm_audio.LatentDiffusion_audio
+  params:
+    linear_start: 0.0015
+    linear_end: 0.0205
+    log_every_t: 100
+    timesteps: 1000
+    loss_type: l1
+    first_stage_key: image
+    cond_stage_key: masked_image
+    image_size: 32 # unused
+    mel_dim: 10 # 80 // 2^3
+    mel_length: 106 # 848 // 2^3
+    channels: 4
+    concat_mode: true
+    monitor: val/loss
+    use_ema: False
+
+    scheduler_config:
+      target: ldm.lr_scheduler.LambdaWarmUpCosineScheduler
+      params:
+        verbosity_interval: 0
+        warm_up_steps: 1000
+        max_decay_steps: 50000
+        lr_start: 0.001
+        lr_max: 0.1
+        lr_min: 0.0001
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 9 # 4 + 1 + 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions:
+        - 1
+        - 2
+        num_res_blocks: 2
+        channel_mult: # num_down = len(ch_mult)-1
+        - 1
+        - 2
+        num_heads: 8
+        resblock_updown: true
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ckpt_path: # /apdcephfs/share_1316500/nlphuang/results/Text_to_audio/ae15/2022-12-15T22-24-00_mixdata_kl_4_tile/epoch=000009-v2.ckpt
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 848
+          in_channels: 1
+          out_ch: 1
+          ch: 128
+          ch_mult: [ 1, 2, 2, 4 ] # num_down = len(ch_mult)-1
+          num_res_blocks: 2
+          attn_resolutions: [106, 212]
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config: __is_first_stage__
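
The in_channels: 9 comment (4 + 1 + 4) reflects concat-mode inpainting: the 4-channel noisy latent is concatenated with a 1-channel mask resized to latent resolution and the 4-channel latent of the masked spectrogram. A minimal sketch of that concatenation, with tensor names invented for illustration:

# Illustrative only: assembling a 9-channel UNet input for concat-mode inpainting.
import torch

z_noisy = torch.randn(1, 4, 10, 106)      # noisy latent (4 channels, mel_dim x mel_length)
mask = torch.ones(1, 1, 10, 106)          # mask resized to latent resolution (1 channel)
z_masked = torch.randn(1, 4, 10, 106)     # latent of the masked spectrogram (4 channels)

unet_input = torch.cat([z_noisy, mask, z_masked], dim=1)
print(unet_input.shape)                   # torch.Size([1, 9, 10, 106]) -> matches in_channels: 9
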
text_to_audio/Make_An_Audio/ldm/data/extract_mel_spectrogram.py (new file, 151 lines)
@@ -0,0 +1,151 @@
+import argparse
+import os
+import os.path as P
+from copy import deepcopy
+from functools import partial
+from glob import glob
+from multiprocessing import Pool
+from pathlib import Path
+
+import librosa
+import numpy as np
+import torchvision
+
+
+class MelSpectrogram(object):
+    def __init__(self, sr, nfft, fmin, fmax, nmels, hoplen, spec_power, inverse=False):
+        self.sr = sr
+        self.nfft = nfft
+        self.fmin = fmin
+        self.fmax = fmax
+        self.nmels = nmels
+        self.hoplen = hoplen
+        self.spec_power = spec_power
+        self.inverse = inverse
+
+        self.mel_basis = librosa.filters.mel(sr=sr, n_fft=nfft, fmin=fmin, fmax=fmax, n_mels=nmels)
+
+    def __call__(self, x):
+        if self.inverse:
+            spec = librosa.feature.inverse.mel_to_stft(
+                x, sr=self.sr, n_fft=self.nfft, fmin=self.fmin, fmax=self.fmax, power=self.spec_power
+            )
+            wav = librosa.griffinlim(spec, hop_length=self.hoplen)
+            return wav
+        else:
+            spec = np.abs(librosa.stft(x, n_fft=self.nfft, hop_length=self.hoplen)) ** self.spec_power
+            mel_spec = np.dot(self.mel_basis, spec)
+            return mel_spec
+
+class LowerThresh(object):
+    def __init__(self, min_val, inverse=False):
+        self.min_val = min_val
+        self.inverse = inverse
+
+    def __call__(self, x):
+        if self.inverse:
+            return x
+        else:
+            return np.maximum(self.min_val, x)
+
+class Add(object):
+    def __init__(self, val, inverse=False):
+        self.inverse = inverse
+        self.val = val
+
+    def __call__(self, x):
+        if self.inverse:
+            return x - self.val
+        else:
+            return x + self.val
+
+class Subtract(Add):
+    def __init__(self, val, inverse=False):
+        self.inverse = inverse
+        self.val = val
+
+    def __call__(self, x):
+        if self.inverse:
+            return x + self.val
+        else:
+            return x - self.val
+
+class Multiply(object):
+    def __init__(self, val, inverse=False) -> None:
+        self.val = val
+        self.inverse = inverse
+
+    def __call__(self, x):
+        if self.inverse:
+            return x / self.val
+        else:
+            return x * self.val
+
+class Divide(Multiply):
+    def __init__(self, val, inverse=False):
+        self.inverse = inverse
+        self.val = val
+
+    def __call__(self, x):
+        if self.inverse:
+            return x * self.val
+        else:
+            return x / self.val
+
+class Log10(object):
+    def __init__(self, inverse=False):
+        self.inverse = inverse
+
+    def __call__(self, x):
+        if self.inverse:
+            return 10 ** x
+        else:
+            return np.log10(x)
+
+class Clip(object):
+    def __init__(self, min_val, max_val, inverse=False):
+        self.min_val = min_val
+        self.max_val = max_val
+        self.inverse = inverse
+
+    def __call__(self, x):
+        if self.inverse:
+            return x
+        else:
+            return np.clip(x, self.min_val, self.max_val)
+
+class TrimSpec(object):
+    def __init__(self, max_len, inverse=False):
+        self.max_len = max_len
+        self.inverse = inverse
+
+    def __call__(self, x):
+        if self.inverse:
+            return x
+        else:
+            return x[:, :self.max_len]
+
+class MaxNorm(object):
+    def __init__(self, inverse=False):
+        self.inverse = inverse
+        self.eps = 1e-10
+
+    def __call__(self, x):
+        if self.inverse:
+            return x
+        else:
+            return x / (x.max() + self.eps)
+
+
+TRANSFORMS_16000 = torchvision.transforms.Compose([
+    MelSpectrogram(sr=16000, nfft=1024, fmin=125, fmax=7600, nmels=80, hoplen=1024//4, spec_power=1),
+    LowerThresh(1e-5),
+    Log10(),
+    Multiply(20),
+    Subtract(20),
+    Add(100),
+    Divide(100),
+    Clip(0, 1.0)
+    # TrimSpec(860)
+])
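
A short usage sketch for the TRANSFORMS_16000 chain defined at the end of this file; the input file name is a placeholder, and the import assumes text_to_audio/Make_An_Audio is on sys.path as arranged in the first hunk.

# Example use of TRANSFORMS_16000 (sketch; 'example.wav' is a placeholder path).
import librosa
from ldm.data.extract_mel_spectrogram import TRANSFORMS_16000

wav, sr = librosa.load('example.wav', sr=16000)   # mono waveform at 16 kHz
mel = TRANSFORMS_16000(wav)                       # STFT -> mel -> log10 -> rescale -> clip to [0, 1]
print(mel.shape)                                  # (80, n_frames), hop length 256 samples
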
@@ -9,7 +9,7 @@ from importlib_resources import files
 from ldm.modules.encoders.CLAP.utils import read_config_as_args
 from ldm.modules.encoders.CLAP.clap import TextEncoder
 from ldm.util import default, count_params
+import open_clip
 
 class AbstractEncoder(nn.Module):
     def __init__(self):
@@ -310,5 +310,41 @@ class FrozenFLANEmbedder(AbstractEncoder):
         z = outputs.last_hidden_state
         return z
 
+    def encode(self, text):
+        return self(text)
+
+class FrozenGlobalNormOpenCLIPEmbedder(AbstractEncoder):
+    """
+    Uses the OpenCLIP transformer encoder for text
+    """
+    def __init__(self, arch="ViT-H-14", version="laion2b_s32b_b79k", device="cuda", freeze=True, delvisual=True):
+        super().__init__()
+        model, _, preprocess = open_clip.create_model_and_transforms(arch, device=torch.device('cpu'), pretrained=version)
+        if delvisual:
+            del model.visual
+            del preprocess
+        else:
+            self.preprocess = preprocess
+        self.model = model
+
+        self.device = device
+        if freeze:
+            self.freeze()
+
+    def freeze(self):
+        self.model = self.model.eval()
+        for param in self.parameters():
+            param.requires_grad = False
+
+    def forward(self, text):
+        tokens = open_clip.tokenize(text)
+        z = self.model.encode_text(tokens.to(self.device))
+        z /= z.norm(dim=-1, keepdim=True)
+        return z.unsqueeze(1)
+
+    def forward_img(self, image):
+        z = self.model.encode_image(image.to(self.device))
+        z /= z.norm(dim=-1, keepdim=True)
+        return z.unsqueeze(1)
+
     def encode(self, text):
         return self(text)
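
A brief usage sketch for the new FrozenGlobalNormOpenCLIPEmbedder; it assumes the open_clip ViT-H-14 weights tagged laion2b_s32b_b79k can be loaded, and the 1024-dimensional output matches the context_dim: 1024 set in the image-to-audio config above.

# Sketch: encoding a caption with the embedder added in this hunk.
import torch
from ldm.modules.encoders.modules import FrozenGlobalNormOpenCLIPEmbedder

embedder = FrozenGlobalNormOpenCLIPEmbedder(device='cpu', freeze=True, delvisual=True)
with torch.no_grad():
    z = embedder.encode(['a dog barking in the rain'])
print(z.shape)   # torch.Size([1, 1, 1024]): unit-norm text embedding with a singleton token axis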