Files
bark-with-voice-clone/clone_voice.ipynb
2023-05-25 16:24:41 -06:00

256 lines
6.7 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from bark.generation import load_codec_model, generate_text_semantic\n",
"from encodec.utils import convert_audio\n",
"\n",
"import torchaudio\n",
"import torch\n",
"\n",
"# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU so the\n",
"# notebook still runs on CPU-only machines. Set this by hand to force a device.\n",
"device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
"\n",
"# Codec model: later cells use its sample_rate/channels to convert the audio\n",
"# and its encode() method to extract the discrete codes.\n",
"model = load_codec_model(use_gpu=(device == 'cuda'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# HuBERT tooling from https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer\n",
"from hubert.hubert_manager import HuBERTManager\n",
"\n",
"hubert_manager = HuBERTManager()\n",
"\n",
"# Run the install checks: the HuBERT model first, then its tokenizer.\n",
"# (The tokenizer check stays the last expression so the cell output is unchanged.)\n",
"hubert_manager.make_sure_hubert_installed()\n",
"hubert_manager.make_sure_tokenizer_installed()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# HuBERT + tokenizer classes from https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer\n",
"# They are used to obtain the semantic tokens of the reference clip.\n",
"from hubert.pre_kmeans_hubert import CustomHubert\n",
"from hubert.customtokenizer import CustomTokenizer\n",
"\n",
"# Build the HuBERT model from its checkpoint, then place it on the chosen device\n",
"hubert_model = CustomHubert(checkpoint_path='data/models/hubert/hubert.pt')\n",
"hubert_model = hubert_model.to(device)\n",
"\n",
"# Restore the CustomTokenizer (automatically uses the right layers), same device\n",
"tokenizer = CustomTokenizer.load_from_checkpoint('data/models/hubert/tokenizer.pth')\n",
"tokenizer = tokenizer.to(device)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Read the reference clip, then convert it to the codec model's sample rate and\n",
"# channel count and move it onto the working device\n",
"audio_filepath = 'audio.wav' # the audio you want to clone (under 13 seconds)\n",
"wav, sr = torchaudio.load(audio_filepath)\n",
"wav = convert_audio(wav, sr, model.sample_rate, model.channels).to(device)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inference only: disable autograd here, just like the EnCodec encoding step,\n",
"# so no gradient state is tracked for the outputs.\n",
"with torch.no_grad():\n",
"    # HuBERT features for the clip ...\n",
"    semantic_vectors = hubert_model.forward(wav, input_sample_hz=model.sample_rate)\n",
"    # ... turned into tokens by the custom tokenizer\n",
"    semantic_tokens = tokenizer.get_token(semantic_vectors)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Run the clip through EnCodec (no gradients needed) to get its discrete codes\n",
"with torch.no_grad():\n",
"    encoded_frames = model.encode(wav.unsqueeze(0))\n",
"\n",
"# Take the first element of every encoded frame and join them along the last axis\n",
"codes = torch.cat([frame[0] for frame in encoded_frames], dim=-1)\n",
"codes = codes.squeeze() # [n_q, T]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Convert both results to NumPy arrays on the CPU so they can be saved.\n",
"# The isinstance guards make this cell safe to re-run: once converted, the\n",
"# values are NumPy arrays, which have no .cpu() method.\n",
"if isinstance(codes, torch.Tensor):\n",
"    codes = codes.cpu().numpy()\n",
"if isinstance(semantic_tokens, torch.Tensor):\n",
"    semantic_tokens = semantic_tokens.cpu().numpy()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import numpy as np\n",
"\n",
"voice_name = 'output' # whatever you want the name of the voice to be\n",
"output_path = 'bark/assets/prompts/' + voice_name + '.npz'\n",
"\n",
"# np.savez does not create missing folders, so make sure the target directory exists\n",
"os.makedirs(os.path.dirname(output_path), exist_ok=True)\n",
"\n",
"# Store all codes as the fine prompt, the first two rows of codes as the coarse\n",
"# prompt, and the semantic tokens as the semantic prompt\n",
"np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# That's it! Now you can head over to generate.ipynb and use your voice_name as the 'history_prompt'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Here's the generation code, copy-pasted for convenience"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from bark.api import generate_audio\n",
"from transformers import BertTokenizer\n",
"from bark.generation import (\n",
"    SAMPLE_RATE,\n",
"    preload_models,\n",
"    codec_decode,\n",
"    generate_coarse,\n",
"    generate_fine,\n",
"    generate_text_semantic,\n",
")\n",
"\n",
"# Text to generate and the voice (history prompt) to generate it with\n",
"text_prompt = \"Hello, my name is Serpy. And, uh — and I like pizza. [laughs]\"\n",
"voice_name = \"output\" # use your custom voice name here if you have one"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Download and load all models: every *_use_gpu flag on, every *_use_small flag off\n",
"preload_models(\n",
"    text_use_gpu=True, text_use_small=False,\n",
"    coarse_use_gpu=True, coarse_use_small=False,\n",
"    fine_use_gpu=True, fine_use_small=False,\n",
"    codec_use_gpu=True,\n",
"    force_reload=False,\n",
"    path=\"models\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Simple generation: one call from text prompt to audio, using the chosen voice\n",
"audio_array = generate_audio(\n",
"    text_prompt,\n",
"    history_prompt=voice_name,\n",
"    text_temp=0.7,\n",
"    waveform_temp=0.7,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Generation with more control: call each stage yourself, each with its own settings\n",
"\n",
"# 1) text prompt -> semantic output\n",
"x_semantic = generate_text_semantic(text_prompt, history_prompt=voice_name, temp=0.7, top_k=50, top_p=0.95)\n",
"\n",
"# 2) semantic output -> coarse generation\n",
"x_coarse_gen = generate_coarse(x_semantic, history_prompt=voice_name, temp=0.7, top_k=50, top_p=0.95)\n",
"\n",
"# 3) coarse generation -> fine generation\n",
"x_fine_gen = generate_fine(x_coarse_gen, history_prompt=voice_name, temp=0.5)\n",
"\n",
"# 4) decode the fine output into the audio array\n",
"audio_array = codec_decode(x_fine_gen)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import Audio\n",
"\n",
"# Play the generated audio inline; keep this as the last expression so it renders\n",
"Audio(data=audio_array, rate=SAMPLE_RATE)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from scipy.io.wavfile import write as write_wav\n",
"\n",
"# save audio\n",
"filepath = \"/output/audio.wav\" # change this to your desired output path\n",
"\n",
"# write_wav does not create missing folders, so create the parent directory first\n",
"# (the `or \".\"` covers a bare file name with no directory part)\n",
"os.makedirs(os.path.dirname(filepath) or \".\", exist_ok=True)\n",
"write_wav(filepath, SAMPLE_RATE, audio_array)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}