mirror of
https://github.com/kingridda/voice-cloning-AI.git
synced 2025-12-15 19:27:58 +01:00
150 lines
5.2 KiB
Plaintext
150 lines
5.2 KiB
Plaintext
|
|
{
|
||
|
|
"nbformat": 4,
|
||
|
|
"nbformat_minor": 0,
|
||
|
|
"metadata": {
|
||
|
|
"colab": {
|
||
|
|
"name": "voice_generator_AI.ipynb",
|
||
|
|
"provenance": [],
|
||
|
|
"collapsed_sections": []
|
||
|
|
},
|
||
|
|
"kernelspec": {
|
||
|
|
"name": "python3",
|
||
|
|
"display_name": "Python 3"
|
||
|
|
},
|
||
|
|
"language_info": {
|
||
|
|
"name": "python"
|
||
|
|
},
|
||
|
|
"accelerator": "GPU"
|
||
|
|
},
|
||
|
|
"cells": [
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"metadata": {
|
||
|
|
"id": "lbdT4692nk7G"
|
||
|
|
},
|
||
|
|
"source": [
|
||
|
|
"%tensorflow_version 1.x\n",
|
||
|
|
"import os\n",
|
||
|
|
"from os.path import join\n",
|
||
|
|
"\n",
|
||
|
|
"# Getting the model (PS: this is a famous but not official model)\n",
|
||
|
|
"!git clone https://github.com/CorentinJ/Real-Time-Voice-Cloning.git\n",
|
||
|
|
"project_name = 'Real-Time-Voice-Cloning'\n",
|
||
|
|
"\n",
|
||
|
|
"#Install requirements with some utils\n",
|
||
|
|
"!cd {project_name} && pip install -q -r requirements.txt\n",
|
||
|
|
"!apt-get install -qq libportaudio2\n",
|
||
|
|
"!pip install -q https://github.com/tugstugi/dl-colab-notebooks/archive/colab_utils.zip\n",
|
||
|
|
"\n",
|
||
|
|
"#downloading pretrained weights\n",
|
||
|
|
"!cd {project_name} && wget https://github.com/blue-fish/Real-Time-Voice-Cloning/releases/download/v1.0/pretrained.zip && unzip -o pretrained.zip"
|
||
|
|
],
|
||
|
|
"execution_count": null,
|
||
|
|
"outputs": []
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"metadata": {
|
||
|
|
"id": "34S28kEKnx3t"
|
||
|
|
},
|
||
|
|
"source": [
|
||
|
|
"#packages\n",
|
||
|
|
"import sys\n",
|
||
|
|
"sys.path.append(project_name)\n",
|
||
|
|
"\n",
|
||
|
|
"from IPython.display import display, Audio, clear_output\n",
|
||
|
|
"from IPython.utils import io\n",
|
||
|
|
"import ipywidgets as widgets\n",
|
||
|
|
"import numpy as np\n",
|
||
|
|
"from dl_colab_notebooks.audio import record_audio, upload_audio\n",
|
||
|
|
"\n",
|
||
|
|
"from synthesizer.inference import Synthesizer\n",
|
||
|
|
"from encoder import inference as encoder\n",
|
||
|
|
"from vocoder import inference as vocoder\n",
|
||
|
|
"from pathlib import Path\n",
|
||
|
|
"\n",
|
||
|
|
"#loading weights to the models\n",
|
||
|
|
"encoder.load_model(project_name / Path(\"encoder/saved_models/pretrained.pt\"))\n",
|
||
|
|
"synthesizer = Synthesizer(project_name / Path(\"synthesizer/saved_models/pretrained/pretrained.pt\"))\n",
|
||
|
|
"vocoder.load_model(project_name / Path(\"vocoder/saved_models/pretrained/pretrained.pt\"))"
|
||
|
|
],
|
||
|
|
"execution_count": null,
|
||
|
|
"outputs": []
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"metadata": {
|
||
|
|
"id": "YYfgVfeX3zkU"
|
||
|
|
},
|
||
|
|
"source": [
|
||
|
|
"#params (change these as you please :)\n",
|
||
|
|
"record_seconds = 10 # Record duration 1-10 seconds\n",
|
||
|
|
"record_or_upload = \"Upload\" # set to \"Record\" (record now) or \"Upload\" (upload an audio file)\n",
|
||
|
|
"\n",
|
||
|
|
"\n",
|
||
|
|
"SAMPLE_RATE = 22050\n",
|
||
|
|
"embedding = None\n",
|
||
|
|
"def _compute_embedding(audio):\n",
|
||
|
|
" display(Audio(audio, rate=SAMPLE_RATE, autoplay=True))\n",
|
||
|
|
" global embedding\n",
|
||
|
|
" embedding = None\n",
|
||
|
|
" embedding = encoder.embed_utterance(encoder.preprocess_wav(audio, SAMPLE_RATE))\n",
|
||
|
|
"def _record_audio(b):\n",
|
||
|
|
" clear_output()\n",
|
||
|
|
" audio = record_audio(record_seconds, sample_rate=SAMPLE_RATE)\n",
|
||
|
|
" _compute_embedding(audio)\n",
|
||
|
|
"def _upload_audio(b):\n",
|
||
|
|
" clear_output()\n",
|
||
|
|
" audio = upload_audio(sample_rate=SAMPLE_RATE)\n",
|
||
|
|
" _compute_embedding(audio)\n",
|
||
|
|
"\n",
|
||
|
|
"if record_or_upload == \"Record\":\n",
|
||
|
|
" button = widgets.Button(description=\"Record Your Voice\")\n",
|
||
|
|
" button.on_click(_record_audio)\n",
|
||
|
|
" display(button)\n",
|
||
|
|
"else:\n",
|
||
|
|
" _upload_audio(\"\")"
|
||
|
|
],
|
||
|
|
"execution_count": null,
|
||
|
|
"outputs": []
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"metadata": {
|
||
|
|
"id": "BOV8CvVL4fl-"
|
||
|
|
},
|
||
|
|
"source": [
|
||
|
|
"# text to output ( Please don't use this to generate something that is going to get you in trouble :'( )\n",
|
||
|
|
"text = \"Hi people check this out this man Rida is doing something fun\" # text to say\n",
|
||
|
|
"\n",
|
||
|
|
"\n",
|
||
|
|
"if embedding is None:\n",
|
||
|
|
" print(\"first record or upload a voice file!\")\n",
|
||
|
|
"else:\n",
|
||
|
|
" print(\"Synthesizing new audio...\")\n",
|
||
|
|
" #with io.capture_output() as captured:\n",
|
||
|
|
" specs = synthesizer.synthesize_spectrograms([text], [embedding])\n",
|
||
|
|
" generated_wav = vocoder.infer_waveform(specs[0])\n",
|
||
|
|
" generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode=\"constant\")\n",
|
||
|
|
" clear_output()\n",
|
||
|
|
" out_audio = Audio(generated_wav, rate=synthesizer.sample_rate, autoplay=True)\n",
|
||
|
|
" display(out_audio)"
|
||
|
|
],
|
||
|
|
"execution_count": null,
|
||
|
|
"outputs": []
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"metadata": {
|
||
|
|
"id": "aAhgj6Z_YixE"
|
||
|
|
},
|
||
|
|
"source": [
|
||
|
|
"# save the output audio to out.wav ( Please don't use this to generate something that is going to get you in trouble :'( )\n",
|
||
|
|
"with open('out.wav', 'wb') as f:\n",
|
||
|
|
" f.write(out_audio.data)"
|
||
|
|
],
|
||
|
|
"execution_count": null,
|
||
|
|
"outputs": []
|
||
|
|
}
|
||
|
|
]
|
||
|
|
}
|