mirror of
https://github.com/serp-ai/bark-with-voice-clone.git
synced 2025-12-15 03:07:58 +01:00
Remove restrictions, allow voice cloning
This commit is contained in:
@@ -48,28 +48,6 @@ COARSE_RATE_HZ = 75
|
|||||||
SAMPLE_RATE = 24_000
|
SAMPLE_RATE = 24_000
|
||||||
|
|
||||||
|
|
||||||
SUPPORTED_LANGS = [
|
|
||||||
("English", "en"),
|
|
||||||
("German", "de"),
|
|
||||||
("Spanish", "es"),
|
|
||||||
("French", "fr"),
|
|
||||||
("Hindi", "hi"),
|
|
||||||
("Italian", "it"),
|
|
||||||
("Japanese", "ja"),
|
|
||||||
("Korean", "ko"),
|
|
||||||
("Polish", "pl"),
|
|
||||||
("Portuguese", "pt"),
|
|
||||||
("Russian", "ru"),
|
|
||||||
("Turkish", "tr"),
|
|
||||||
("Chinese", "zh"),
|
|
||||||
]
|
|
||||||
|
|
||||||
ALLOWED_PROMPTS = {"announcer"}
|
|
||||||
for _, lang in SUPPORTED_LANGS:
|
|
||||||
for n in range(10):
|
|
||||||
ALLOWED_PROMPTS.add(f"{lang}_speaker_{n}")
|
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
@@ -348,7 +326,6 @@ def generate_text_semantic(
|
|||||||
text = _normalize_whitespace(text)
|
text = _normalize_whitespace(text)
|
||||||
assert len(text.strip()) > 0
|
assert len(text.strip()) > 0
|
||||||
if history_prompt is not None:
|
if history_prompt is not None:
|
||||||
assert (history_prompt in ALLOWED_PROMPTS)
|
|
||||||
semantic_history = np.load(
|
semantic_history = np.load(
|
||||||
os.path.join(CUR_PATH, "assets", "prompts", f"{history_prompt}.npz")
|
os.path.join(CUR_PATH, "assets", "prompts", f"{history_prompt}.npz")
|
||||||
)["semantic_prompt"]
|
)["semantic_prompt"]
|
||||||
@@ -492,7 +469,6 @@ def generate_coarse(
|
|||||||
semantic_to_coarse_ratio = COARSE_RATE_HZ / SEMANTIC_RATE_HZ * N_COARSE_CODEBOOKS
|
semantic_to_coarse_ratio = COARSE_RATE_HZ / SEMANTIC_RATE_HZ * N_COARSE_CODEBOOKS
|
||||||
max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))
|
max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))
|
||||||
if history_prompt is not None:
|
if history_prompt is not None:
|
||||||
assert (history_prompt in ALLOWED_PROMPTS)
|
|
||||||
x_history = np.load(
|
x_history = np.load(
|
||||||
os.path.join(CUR_PATH, "assets", "prompts", f"{history_prompt}.npz")
|
os.path.join(CUR_PATH, "assets", "prompts", f"{history_prompt}.npz")
|
||||||
)
|
)
|
||||||
@@ -635,7 +611,6 @@ def generate_fine(
|
|||||||
and x_coarse_gen.max() <= CODEBOOK_SIZE - 1
|
and x_coarse_gen.max() <= CODEBOOK_SIZE - 1
|
||||||
)
|
)
|
||||||
if history_prompt is not None:
|
if history_prompt is not None:
|
||||||
assert (history_prompt in ALLOWED_PROMPTS)
|
|
||||||
x_fine_history = np.load(
|
x_fine_history = np.load(
|
||||||
os.path.join(CUR_PATH, "assets", "prompts", f"{history_prompt}.npz")
|
os.path.join(CUR_PATH, "assets", "prompts", f"{history_prompt}.npz")
|
||||||
)["fine_prompt"]
|
)["fine_prompt"]
|
||||||
|
|||||||
173
clone_voice.ipynb
Normal file
173
clone_voice.ipynb
Normal file
@@ -0,0 +1,173 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from bark.generation import codec_encode, load_codec_model, generate_text_semantic\n",
|
||||||
|
"from encodec.utils import convert_audio\n",
|
||||||
|
"\n",
|
||||||
|
"import torchaudio\n",
|
||||||
|
"import torch\n",
|
||||||
|
"\n",
|
||||||
|
"model = load_codec_model(use_gpu=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Load and pre-process the audio waveform\n",
|
||||||
|
"audio_filepath = 'audio.wav' # the audio you want to clone (will get truncated so 5-10 seconds is probably fine, existing samples that I checked are around 7 seconds)\n",
|
||||||
|
"wav, sr = torchaudio.load(audio_filepath)\n",
|
||||||
|
"wav = convert_audio(wav, sr, model.sample_rate, model.channels)\n",
|
||||||
|
"wav = wav.unsqueeze(0).to('cuda')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Extract discrete codes from EnCodec\n",
|
||||||
|
"with torch.no_grad():\n",
|
||||||
|
" encoded_frames = model.encode(wav)\n",
|
||||||
|
"codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [n_q, T]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"text = \"Transcription of the audio you are cloning\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# get seconds of audio\n",
|
||||||
|
"seconds = wav.shape[-1] / model.sample_rate\n",
|
||||||
|
"# generate semantic tokens\n",
|
||||||
|
"semantic_tokens = generate_text_semantic(text, max_gen_duration_s=seconds)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# move codes to cpu\n",
|
||||||
|
"codes = codes.cpu().numpy()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"voice_name = 'output' # whatever you want the name of the voice to be\n",
|
||||||
|
"output_path = 'bark/assets/prompts/' + voice_name + '.npz'\n",
|
||||||
|
"np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# That's it! Now you can head over to the generate.ipynb and use your voice_name for the 'history_prompt'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Heres the generation stuff copy-pasted for convenience"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from bark.api import generate_audio\n",
|
||||||
|
"from bark.generation import SAMPLE_RATE\n",
|
||||||
|
"text_prompt = \"\"\"\n",
|
||||||
|
" Hello, my name is Suno. And, uh — and I like pizza. [laughs] \n",
|
||||||
|
" But I also have other interests such as playing tic tac toe.\n",
|
||||||
|
"\"\"\"\n",
|
||||||
|
"voice_name = \"speaker_0\" # use your custom voice name here if you have one\n",
|
||||||
|
"audio_array = generate_audio(text_prompt, history_prompt=voice_name)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from IPython.display import Audio\n",
|
||||||
|
"# play audio\n",
|
||||||
|
"Audio(audio_array, rate=SAMPLE_RATE)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from scipy.io.wavfile import write as write_wav\n",
|
||||||
|
"# save audio\n",
|
||||||
|
"filepath = \"/output/audio.wav\" # change this to your desired output path\n",
|
||||||
|
"write_wav(filepath, SAMPLE_RATE, audio_array)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.8"
|
||||||
|
},
|
||||||
|
"orig_nbformat": 4
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
65
generate.ipynb
Normal file
65
generate.ipynb
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from bark.api import generate_audio\n",
|
||||||
|
"from bark.generation import SAMPLE_RATE\n",
|
||||||
|
"text_prompt = \"\"\"\n",
|
||||||
|
" Hello, my name is Suno. And, uh — and I like pizza. [laughs] \n",
|
||||||
|
" But I also have other interests such as playing tic tac toe.\n",
|
||||||
|
"\"\"\"\n",
|
||||||
|
"voice_name = \"speaker_0\" # use your custom voice name here if you have one\n",
|
||||||
|
"audio_array = generate_audio(text_prompt, history_prompt=voice_name)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from IPython.display import Audio\n",
|
||||||
|
"# play audio\n",
|
||||||
|
"Audio(audio_array, rate=SAMPLE_RATE)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from scipy.io.wavfile import write as write_wav\n",
|
||||||
|
"# save audio\n",
|
||||||
|
"filepath = \"/output/audio.wav\" # change this to your desired output path\n",
|
||||||
|
"write_wav(filepath, SAMPLE_RATE, audio_array)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.8"
|
||||||
|
},
|
||||||
|
"orig_nbformat": 4
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user