bark-with-voice-clone/clone_voice.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bark.generation import load_codec_model, generate_text_semantic\n",
    "from encodec.utils import convert_audio\n",
    "\n",
    "from transformers import BertTokenizer\n",
    "\n",
    "import torchaudio\n",
    "import torch\n",
    "\n",
    "model = load_codec_model(use_gpu=True)\n",
    "tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load and pre-process the audio waveform\n",
    "audio_filepath = 'audio.wav' # the audio you want to clone (will get truncated so 5-10 seconds is probably fine, existing samples that I checked are around 7 seconds)\n",
    "device = 'cuda' # or 'cpu'\n",
    "wav, sr = torchaudio.load(audio_filepath)\n",
    "wav = convert_audio(wav, sr, model.sample_rate, model.channels)\n",
    "wav = wav.unsqueeze(0).to(device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract discrete codes from EnCodec\n",
    "with torch.no_grad():\n",
    "    encoded_frames = model.encode(wav)\n",
    "codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()  # [n_q, T]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"Transcription of the audio you are cloning\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get seconds of audio\n",
    "seconds = wav.shape[-1] / model.sample_rate\n",
    "# generate semantic tokens\n",
    "semantic_tokens = generate_text_semantic(text, max_gen_duration_s=seconds, top_k=50, top_p=.95, temp=0.7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# move codes to cpu\n",
    "codes = codes.cpu().numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "voice_name = 'output' # whatever you want the name of the voice to be\n",
    "output_path = 'bark/assets/prompts/' + voice_name + '.npz'\n",
    "np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# That's it! Now you can head over to the generate.ipynb and use your voice_name for the 'history_prompt'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Heres the generation stuff copy-pasted for convenience"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bark.api import generate_audio\n",
    "from bark.generation import SAMPLE_RATE\n",
    "text_prompt = \"Hello, my name is Suno. And, uh — and I like pizza. [laughs]\"\n",
    "voice_name = \"speaker_0\" # use your custom voice name here if you have one"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# simple generation\n",
    "audio_array = generate_audio(text_prompt, history_prompt=voice_name, text_temp=0.7, waveform_temp=0.7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# generation with more control\n",
    "from bark.generation import codec_decode, generate_coarse, generate_fine, generate_text_semantic\n",
    "x_semantic = generate_text_semantic(\n",
    "    text_prompt,\n",
    "    history_prompt=voice_name,\n",
    "    temp=0.7,\n",
    "    top_k=50,\n",
    "    top_p=0.95,\n",
    ")\n",
    "\n",
    "x_coarse_gen = generate_coarse(\n",
    "    x_semantic,\n",
    "    history_prompt=voice_name,\n",
    "    temp=0.7,\n",
    "    top_k=50,\n",
    "    top_p=0.95,\n",
    ")\n",
    "x_fine_gen = generate_fine(\n",
    "    x_coarse_gen,\n",
    "    history_prompt=voice_name,\n",
    "    temp=0.5,\n",
    ")\n",
    "audio_array = codec_decode(x_fine_gen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.display import Audio\n",
    "# play audio\n",
    "Audio(audio_array, rate=SAMPLE_RATE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy.io.wavfile import write as write_wav\n",
    "# save audio\n",
    "filepath = \"/output/audio.wav\" # change this to your desired output path\n",
    "write_wav(filepath, SAMPLE_RATE, audio_array)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}