diff --git a/bark/generation.py b/bark/generation.py index 2b4fabe..7b5ff6e 100644 --- a/bark/generation.py +++ b/bark/generation.py @@ -48,28 +48,6 @@ COARSE_RATE_HZ = 75 SAMPLE_RATE = 24_000 -SUPPORTED_LANGS = [ - ("English", "en"), - ("German", "de"), - ("Spanish", "es"), - ("French", "fr"), - ("Hindi", "hi"), - ("Italian", "it"), - ("Japanese", "ja"), - ("Korean", "ko"), - ("Polish", "pl"), - ("Portuguese", "pt"), - ("Russian", "ru"), - ("Turkish", "tr"), - ("Chinese", "zh"), -] - -ALLOWED_PROMPTS = {"announcer"} -for _, lang in SUPPORTED_LANGS: - for n in range(10): - ALLOWED_PROMPTS.add(f"{lang}_speaker_{n}") - - logger = logging.getLogger(__name__) @@ -348,7 +326,6 @@ def generate_text_semantic( text = _normalize_whitespace(text) assert len(text.strip()) > 0 if history_prompt is not None: - assert (history_prompt in ALLOWED_PROMPTS) semantic_history = np.load( os.path.join(CUR_PATH, "assets", "prompts", f"{history_prompt}.npz") )["semantic_prompt"] @@ -492,7 +469,6 @@ def generate_coarse( semantic_to_coarse_ratio = COARSE_RATE_HZ / SEMANTIC_RATE_HZ * N_COARSE_CODEBOOKS max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio)) if history_prompt is not None: - assert (history_prompt in ALLOWED_PROMPTS) x_history = np.load( os.path.join(CUR_PATH, "assets", "prompts", f"{history_prompt}.npz") ) @@ -635,7 +611,6 @@ def generate_fine( and x_coarse_gen.max() <= CODEBOOK_SIZE - 1 ) if history_prompt is not None: - assert (history_prompt in ALLOWED_PROMPTS) x_fine_history = np.load( os.path.join(CUR_PATH, "assets", "prompts", f"{history_prompt}.npz") )["fine_prompt"] diff --git a/clone_voice.ipynb b/clone_voice.ipynb new file mode 100644 index 0000000..2f4b375 --- /dev/null +++ b/clone_voice.ipynb @@ -0,0 +1,173 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from bark.generation import codec_encode, load_codec_model, generate_text_semantic\n", + "from encodec.utils 
import convert_audio\n", + "\n", + "import torchaudio\n", + "import torch\n", + "\n", + "model = load_codec_model(use_gpu=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load and pre-process the audio waveform\n", + "audio_filepath = 'audio.wav' # the audio you want to clone (will get truncated so 5-10 seconds is probably fine, existing samples that I checked are around 7 seconds)\n", + "wav, sr = torchaudio.load(audio_filepath)\n", + "wav = convert_audio(wav, sr, model.sample_rate, model.channels)\n", + "wav = wav.unsqueeze(0).to('cuda')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract discrete codes from EnCodec\n", + "with torch.no_grad():\n", + " encoded_frames = model.encode(wav)\n", + "codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [n_q, T]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "text = \"Transcription of the audio you are cloning\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get seconds of audio\n", + "seconds = wav.shape[-1] / model.sample_rate\n", + "# generate semantic tokens\n", + "semantic_tokens = generate_text_semantic(text, max_gen_duration_s=seconds)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# move codes to cpu\n", + "codes = codes.cpu().numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "voice_name = 'output' # whatever you want the name of the voice to be\n", + "output_path = 'bark/assets/prompts/' + voice_name + '.npz'\n", + "np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# That's it! Now you can head over to the generate.ipynb and use your voice_name for the 'history_prompt'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Here's the generation stuff copy-pasted for convenience" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from bark.api import generate_audio\n", + "from bark.generation import SAMPLE_RATE\n", + "text_prompt = \"\"\"\n", + " Hello, my name is Suno. And, uh — and I like pizza. [laughs] \n", + " But I also have other interests such as playing tic tac toe.\n", + "\"\"\"\n", + "voice_name = \"speaker_0\" # use your custom voice name here if you have one\n", + "audio_array = generate_audio(text_prompt, history_prompt=voice_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import Audio\n", + "# play audio\n", + "Audio(audio_array, rate=SAMPLE_RATE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.io.wavfile import write as write_wav\n", + "# save audio\n", + "filepath = \"/output/audio.wav\" # change this to your desired output path\n", + "write_wav(filepath, SAMPLE_RATE, audio_array)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + 
"nbformat_minor": 2 +} diff --git a/generate.ipynb b/generate.ipynb new file mode 100644 index 0000000..115506b --- /dev/null +++ b/generate.ipynb @@ -0,0 +1,65 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from bark.api import generate_audio\n", + "from bark.generation import SAMPLE_RATE\n", + "text_prompt = \"\"\"\n", + " Hello, my name is Suno. And, uh — and I like pizza. [laughs] \n", + " But I also have other interests such as playing tic tac toe.\n", + "\"\"\"\n", + "voice_name = \"speaker_0\" # use your custom voice name here if you have one\n", + "audio_array = generate_audio(text_prompt, history_prompt=voice_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import Audio\n", + "# play audio\n", + "Audio(audio_array, rate=SAMPLE_RATE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.io.wavfile import write as write_wav\n", + "# save audio\n", + "filepath = \"/output/audio.wav\" # change this to your desired output path\n", + "write_wav(filepath, SAMPLE_RATE, audio_array)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}