Remove restrictions, allow voice cloning

Francis LaBounty
2023-04-21 09:05:02 -06:00
parent c03e58a586
commit 05abd532cd
3 changed files with 238 additions and 25 deletions

bark/generation.py (25 lines removed)

@@ -48,28 +48,6 @@ COARSE_RATE_HZ = 75
SAMPLE_RATE = 24_000
SUPPORTED_LANGS = [
("English", "en"),
("German", "de"),
("Spanish", "es"),
("French", "fr"),
("Hindi", "hi"),
("Italian", "it"),
("Japanese", "ja"),
("Korean", "ko"),
("Polish", "pl"),
("Portuguese", "pt"),
("Russian", "ru"),
("Turkish", "tr"),
("Chinese", "zh"),
]
ALLOWED_PROMPTS = {"announcer"}
for _, lang in SUPPORTED_LANGS:
for n in range(10):
ALLOWED_PROMPTS.add(f"{lang}_speaker_{n}")
logger = logging.getLogger(__name__)
@@ -348,7 +326,6 @@ def generate_text_semantic(
text = _normalize_whitespace(text)
assert len(text.strip()) > 0
if history_prompt is not None:
assert (history_prompt in ALLOWED_PROMPTS)
semantic_history = np.load(
os.path.join(CUR_PATH, "assets", "prompts", f"{history_prompt}.npz")
)["semantic_prompt"]
@@ -492,7 +469,6 @@ def generate_coarse(
semantic_to_coarse_ratio = COARSE_RATE_HZ / SEMANTIC_RATE_HZ * N_COARSE_CODEBOOKS
max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))
if history_prompt is not None:
assert (history_prompt in ALLOWED_PROMPTS)
x_history = np.load(
os.path.join(CUR_PATH, "assets", "prompts", f"{history_prompt}.npz")
)
@@ -635,7 +611,6 @@ def generate_fine(
and x_coarse_gen.max() <= CODEBOOK_SIZE - 1
)
if history_prompt is not None:
assert (history_prompt in ALLOWED_PROMPTS)
x_fine_history = np.load(
os.path.join(CUR_PATH, "assets", "prompts", f"{history_prompt}.npz")
)["fine_prompt"]

clone_voice.ipynb Normal file (173 lines added)

@@ -0,0 +1,173 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from bark.generation import codec_encode, load_codec_model, generate_text_semantic\n",
"from encodec.utils import convert_audio\n",
"\n",
"import torchaudio\n",
"import torch\n",
"\n",
"model = load_codec_model(use_gpu=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load and pre-process the audio waveform\n",
"audio_filepath = 'audio.wav' # the audio you want to clone (will get truncated so 5-10 seconds is probably fine, existing samples that I checked are around 7 seconds)\n",
"wav, sr = torchaudio.load(audio_filepath)\n",
"wav = convert_audio(wav, sr, model.sample_rate, model.channels)\n",
"wav = wav.unsqueeze(0).to('cuda')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Extract discrete codes from EnCodec\n",
"with torch.no_grad():\n",
" encoded_frames = model.encode(wav)\n",
"codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [n_q, T]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"text = \"Transcription of the audio you are cloning\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# get seconds of audio\n",
"seconds = wav.shape[-1] / model.sample_rate\n",
"# generate semantic tokens\n",
"semantic_tokens = generate_text_semantic(text, max_gen_duration_s=seconds)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# move codes to cpu\n",
"codes = codes.cpu().numpy()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"voice_name = 'output' # whatever you want the name of the voice to be\n",
"output_path = 'bark/assets/prompts/' + voice_name + '.npz'\n",
"np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# That's it! Now you can head over to the generate.ipynb and use your voice_name for the 'history_prompt'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Heres the generation stuff copy-pasted for convenience"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from bark.api import generate_audio\n",
"from bark.generation import SAMPLE_RATE\n",
"text_prompt = \"\"\"\n",
" Hello, my name is Suno. And, uh — and I like pizza. [laughs] \n",
" But I also have other interests such as playing tic tac toe.\n",
"\"\"\"\n",
"voice_name = \"speaker_0\" # use your custom voice name here if you have one\n",
"audio_array = generate_audio(text_prompt, history_prompt=voice_name)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import Audio\n",
"# play audio\n",
"Audio(audio_array, rate=SAMPLE_RATE)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from scipy.io.wavfile import write as write_wav\n",
"# save audio\n",
"filepath = \"/output/audio.wav\" # change this to your desired output path\n",
"write_wav(filepath, SAMPLE_RATE, audio_array)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
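
A quick sanity check on the prompt file written by the notebook above (a sketch; 'output' matches the voice_name used in the save cell, and the keys mirror the np.savez call):

import numpy as np

# load the prompt saved by clone_voice.ipynb
prompt = np.load('bark/assets/prompts/output.npz')
# fine_prompt: full EnCodec codes [n_q, T]; coarse_prompt: first two codebooks [2, T];
# semantic_prompt: 1-D array of semantic tokens from generate_text_semantic
print(prompt['fine_prompt'].shape, prompt['coarse_prompt'].shape, prompt['semantic_prompt'].shape)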

generate.ipynb Normal file (65 lines added)

@@ -0,0 +1,65 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from bark.api import generate_audio\n",
"from bark.generation import SAMPLE_RATE\n",
"text_prompt = \"\"\"\n",
" Hello, my name is Suno. And, uh — and I like pizza. [laughs] \n",
" But I also have other interests such as playing tic tac toe.\n",
"\"\"\"\n",
"voice_name = \"speaker_0\" # use your custom voice name here if you have one\n",
"audio_array = generate_audio(text_prompt, history_prompt=voice_name)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import Audio\n",
"# play audio\n",
"Audio(audio_array, rate=SAMPLE_RATE)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from scipy.io.wavfile import write as write_wav\n",
"# save audio\n",
"filepath = \"/output/audio.wav\" # change this to your desired output path\n",
"write_wav(filepath, SAMPLE_RATE, audio_array)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}