In [None]:
%tensorflow_version 1.x
import os
from os.path import join

# Getting the model (PS: this is a famous but not official model)
!git clone https://github.com/CorentinJ/Real-Time-Voice-Cloning.git
project_name = 'Real-Time-Voice-Cloning'

#Install requirements with some utils
!cd {project_name} && pip install -q -r requirements.txt
!apt-get install -qq libportaudio2
!pip install -q https://github.com/tugstugi/dl-colab-notebooks/archive/colab_utils.zip

#downloading pretrained weights
!cd {project_name} && wget https://github.com/blue-fish/Real-Time-Voice-Cloning/releases/download/v1.0/pretrained.zip && unzip -o pretrained.zip

In [None]:
#packages
import sys
sys.path.append(project_name)

from IPython.display import display, Audio, clear_output
from IPython.utils import io
import ipywidgets as widgets
import numpy as np
from dl_colab_notebooks.audio import record_audio, upload_audio

from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder
from pathlib import Path

# Load the pretrained weights for the encoder, synthesizer and vocoder.
# Every weight file is addressed relative to the cloned repository directory.
encoder.load_model(Path(project_name, "encoder/saved_models/pretrained.pt"))
synthesizer = Synthesizer(
    Path(project_name, "synthesizer/saved_models/pretrained/pretrained.pt")
)
vocoder.load_model(Path(project_name, "vocoder/saved_models/pretrained/pretrained.pt"))

In [None]:
# User settings (change these as you please :)
record_or_upload = "Upload"  # "Record" (record now) or "Upload" (upload an audio file)
record_seconds = 10  # record duration, 1-10 seconds


# Sample rate passed to the recorder, the uploader, the audio player and the encoder preprocessing.
SAMPLE_RATE = 22050
# Speaker embedding of the reference voice; stays None until _compute_embedding has succeeded.
embedding = None
def _compute_embedding(audio):
  """Play back `audio` and store its speaker embedding in the global `embedding`.

  The global is reset to None before the encoder runs, so it does not keep a
  previous value if preprocessing or embedding raises.
  """
  global embedding
  playback = Audio(audio, rate=SAMPLE_RATE, autoplay=True)
  display(playback)
  embedding = None
  preprocessed = encoder.preprocess_wav(audio, SAMPLE_RATE)
  embedding = encoder.embed_utterance(preprocessed)
def _record_audio(b):
  """Button callback: clear the cell output, record `record_seconds` of audio and embed it.

  `b` is the argument supplied by the button click and is not used.
  """
  clear_output()
  _compute_embedding(record_audio(record_seconds, sample_rate=SAMPLE_RATE))
def _upload_audio(b):
  """Clear the cell output, ask for an uploaded audio file and embed it.

  `b` is accepted so the signature matches `_record_audio`; it is not used.
  """
  clear_output()
  _compute_embedding(upload_audio(sample_rate=SAMPLE_RATE))

# Start the chosen input flow. The mode is compared ignoring case and surrounding
# whitespace, so a value such as "record" no longer silently falls through to the
# upload flow. Any other value still selects the upload flow, as before.
if str(record_or_upload).strip().lower() == "record":
  # Recording starts from a button click, which calls _record_audio.
  button = widgets.Button(description="Record Your Voice")
  button.on_click(_record_audio)
  display(button)
else:
  _upload_audio("")

In [None]:
# Text to speak with the embedded voice.
# (Please don't use this to generate something that is going to get you in trouble.)
text = "Hi people check this out this man Rida is doing something fun"


if embedding is not None:
  print("Synthesizing new audio...")
  # One text and one embedding go in; the first returned spectrogram is vocoded.
  specs = synthesizer.synthesize_spectrograms([text], [embedding])
  generated_wav = vocoder.infer_waveform(specs[0])
  # Append synthesizer.sample_rate zero-valued samples to the end of the waveform.
  generated_wav = np.pad(
      generated_wav,
      (0, synthesizer.sample_rate),
      mode="constant",
  )
  clear_output()
  out_audio = Audio(generated_wav, rate=synthesizer.sample_rate, autoplay=True)
  display(out_audio)
else:
  print("first record or upload a voice file!")

In [None]:
# Save the synthesized audio to out.wav in the kernel's current working directory.
# (Please don't use this to generate something that is going to get you in trouble.)
# out_audio only exists after the synthesis cell has produced audio, so it is looked up
# defensively instead of raising NameError when that cell was skipped or had no embedding.
synthesized = globals().get("out_audio")
if synthesized is None:
    print("nothing to save yet: run the synthesis cell first!")
else:
    with open('out.wav', 'wb') as f:
        f.write(synthesized.data)