voice-cloning-collab/vocoder/inference.py

from vocoder.models.fatchord_version import WaveRNN
from vocoder import hparams as hp
from scipy.fft import rfft, rfftfreq
import torch
import numpy as np
import noisereduce as nr


_model = None   # type: WaveRNN
_device = None  # type: torch.device
def load_model(weights_fpath, verbose=True):
    global _model, _device

    if verbose:
        print("Building Wave-RNN")
    _model = WaveRNN(
        rnn_dims=hp.voc_rnn_dims,
        fc_dims=hp.voc_fc_dims,
        bits=hp.bits,
        pad=hp.voc_pad,
        upsample_factors=hp.voc_upsample_factors,
        feat_dims=hp.num_mels,
        compute_dims=hp.voc_compute_dims,
        res_out_dims=hp.voc_res_out_dims,
        res_blocks=hp.voc_res_blocks,
        hop_length=hp.hop_length,
        sample_rate=hp.sample_rate,
        mode=hp.voc_mode
    )

    if torch.cuda.is_available():
        _model = _model.cuda()
        _device = torch.device('cuda')
    else:
        _device = torch.device('cpu')

    if verbose:
        print("Loading model weights at %s" % weights_fpath)
    checkpoint = torch.load(weights_fpath, map_location=_device)
    _model.load_state_dict(checkpoint['model_state'])
    _model.eval()
def is_loaded():
    return _model is not None
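

# Example usage (a sketch; "saved_models/vocoder.pt" is a hypothetical checkpoint
# path -- substitute the weights file produced by vocoder training):
#
#   load_model("saved_models/vocoder.pt")
#   assert is_loaded()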
def infer_waveform(mel, normalize=True, batched=True, target=8000, overlap=800,
                   progress_callback=None):
    """
    Infers the waveform of a mel spectrogram output by the synthesizer (the format must match
    that of the synthesizer!)

    :param normalize: if True, scale the mel by hp.mel_max_abs_value before inference
    :param batched: if True, fold the spectrogram into sub-batches for faster generation
    :param target: number of samples generated per sub-batch entry
    :param overlap: number of samples crossfaded between consecutive sub-batch entries
    :return: the generated waveform
    """
    if _model is None:
        raise Exception("Please load Wave-RNN in memory before using it")

    if normalize:
        mel = mel / hp.mel_max_abs_value
    mel = torch.from_numpy(mel[None, ...])
    wav = _model.generate(mel, batched, target, overlap, hp.mu_law, progress_callback)
    return wav
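

# Example usage (a sketch): in practice the mel comes from the matching synthesizer;
# a random array of the right shape (num_mels x frames) merely illustrates the call:
#
#   mel = np.random.uniform(-hp.mel_max_abs_value, hp.mel_max_abs_value,
#                           (hp.num_mels, 200)).astype(np.float32)
#   wav = infer_waveform(mel)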
def waveform_denoising(wav):
    # Pick the reduction strength from the dominant frequency of the audio:
    # hp.prop_decrease_low_freq (0.6) below hp.split_freq, and
    # hp.prop_decrease_high_freq (0.9) above it.
    fft_max_freq = get_dominant_freq(wav)
    prop_decrease = hp.prop_decrease_low_freq if fft_max_freq < hp.split_freq \
        else hp.prop_decrease_high_freq
    print(f"\nThe dominant frequency of the output audio is {fft_max_freq} Hz")
    return nr.reduce_noise(y=wav, sr=hp.sample_rate, prop_decrease=prop_decrease)
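

# Example usage (a sketch): denoise a generated waveform; the reduction strength
# is chosen automatically from its dominant frequency, as above:
#
#   wav = infer_waveform(mel)
#   wav = waveform_denoising(wav)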
def get_dominant_freq(wav):
    # Find the frequency bin with the largest magnitude in the spectrum.
    # rfft returns complex coefficients, so compare magnitudes, not raw values.
    N = len(wav)
    fft_mag = np.abs(rfft(wav))
    fft_freq = rfftfreq(N, 1 / hp.sample_rate)
    fft_max_index = np.argmax(fft_mag)
    fft_max_freq = fft_freq[fft_max_index]
    return fft_max_freq
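

# Quick sanity check (an illustrative sketch, not part of the original module):
# the dominant frequency of a pure 440 Hz tone should come back as ~440 Hz.
if __name__ == "__main__":
    t = np.arange(hp.sample_rate) / hp.sample_rate  # one second of samples
    tone = np.sin(2 * np.pi * 440.0 * t)
    print(get_dominant_freq(tone))  # expected: ~440.0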