mirror of
https://github.com/serp-ai/bark-with-voice-clone.git
synced 2025-12-15 03:07:58 +01:00
Add more settings
This commit is contained in:
@@ -9,10 +9,13 @@
|
|||||||
"from bark.generation import load_codec_model, generate_text_semantic\n",
|
"from bark.generation import load_codec_model, generate_text_semantic\n",
|
||||||
"from encodec.utils import convert_audio\n",
|
"from encodec.utils import convert_audio\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"from transformers import BertTokenizer\n",
|
||||||
|
"\n",
|
||||||
"import torchaudio\n",
|
"import torchaudio\n",
|
||||||
"import torch\n",
|
"import torch\n",
|
||||||
"\n",
|
"\n",
|
||||||
"model = load_codec_model(use_gpu=True)"
|
"model = load_codec_model(use_gpu=True)\n",
|
||||||
|
"tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -59,7 +62,7 @@
|
|||||||
"# get seconds of audio\n",
|
"# get seconds of audio\n",
|
||||||
"seconds = wav.shape[-1] / model.sample_rate\n",
|
"seconds = wav.shape[-1] / model.sample_rate\n",
|
||||||
"# generate semantic tokens\n",
|
"# generate semantic tokens\n",
|
||||||
"semantic_tokens = generate_text_semantic(text, max_gen_duration_s=seconds)"
|
"semantic_tokens = generate_text_semantic(text, max_gen_duration_s=seconds, top_k=50, top_p=.95, temp=0.7)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -117,12 +120,49 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"from bark.api import generate_audio\n",
|
"from bark.api import generate_audio\n",
|
||||||
"from bark.generation import SAMPLE_RATE\n",
|
"from bark.generation import SAMPLE_RATE\n",
|
||||||
"text_prompt = \"\"\"\n",
|
"text_prompt = \"Hello, my name is Suno. And, uh — and I like pizza. [laughs]\"\n",
|
||||||
" Hello, my name is Suno. And, uh — and I like pizza. [laughs] \n",
|
"voice_name = \"speaker_0\" # use your custom voice name here if you have one"
|
||||||
" But I also have other interests such as playing tic tac toe.\n",
|
]
|
||||||
"\"\"\"\n",
|
},
|
||||||
"voice_name = \"speaker_0\" # use your custom voice name here if you have one\n",
|
{
|
||||||
"audio_array = generate_audio(text_prompt, history_prompt=voice_name)"
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# simple generation\n",
|
||||||
|
"audio_array = generate_audio(text_prompt, history_prompt=voice_name, text_temp=0.7, waveform_temp=0.7)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# generation with more control\n",
|
||||||
|
"from bark.generation import codec_decode, generate_coarse, generate_fine, generate_text_semantic\n",
|
||||||
|
"x_semantic = generate_text_semantic(\n",
|
||||||
|
" text_prompt,\n",
|
||||||
|
" history_prompt=voice_name,\n",
|
||||||
|
" temp=0.7,\n",
|
||||||
|
" top_k=50,\n",
|
||||||
|
" top_p=0.95,\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"x_coarse_gen = generate_coarse(\n",
|
||||||
|
" x_semantic,\n",
|
||||||
|
" history_prompt=voice_name,\n",
|
||||||
|
" temp=0.7,\n",
|
||||||
|
" top_k=50,\n",
|
||||||
|
" top_p=0.95,\n",
|
||||||
|
")\n",
|
||||||
|
"x_fine_gen = generate_fine(\n",
|
||||||
|
" x_coarse_gen,\n",
|
||||||
|
" history_prompt=voice_name,\n",
|
||||||
|
" temp=0.5,\n",
|
||||||
|
")\n",
|
||||||
|
"audio_array = codec_decode(x_fine_gen)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -8,12 +8,49 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"from bark.api import generate_audio\n",
|
"from bark.api import generate_audio\n",
|
||||||
"from bark.generation import SAMPLE_RATE\n",
|
"from bark.generation import SAMPLE_RATE\n",
|
||||||
"text_prompt = \"\"\"\n",
|
"text_prompt = \"Hello, my name is Suno. And, uh — and I like pizza. [laughs]\"\n",
|
||||||
" Hello, my name is Suno. And, uh — and I like pizza. [laughs] \n",
|
"voice_name = \"speaker_0\" # use your custom voice name here if you have one"
|
||||||
" But I also have other interests such as playing tic tac toe.\n",
|
]
|
||||||
"\"\"\"\n",
|
},
|
||||||
"voice_name = \"speaker_0\" # use your custom voice name here if you have one\n",
|
{
|
||||||
"audio_array = generate_audio(text_prompt, history_prompt=voice_name)"
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# simple generation\n",
|
||||||
|
"audio_array = generate_audio(text_prompt, history_prompt=voice_name, text_temp=0.7, waveform_temp=0.7)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# generation with more control\n",
|
||||||
|
"from bark.generation import codec_decode, generate_coarse, generate_fine, generate_text_semantic\n",
|
||||||
|
"x_semantic = generate_text_semantic(\n",
|
||||||
|
" text_prompt,\n",
|
||||||
|
" history_prompt=voice_name,\n",
|
||||||
|
" temp=0.7,\n",
|
||||||
|
" top_k=50,\n",
|
||||||
|
" top_p=0.95,\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"x_coarse_gen = generate_coarse(\n",
|
||||||
|
" x_semantic,\n",
|
||||||
|
" history_prompt=voice_name,\n",
|
||||||
|
" temp=0.7,\n",
|
||||||
|
" top_k=50,\n",
|
||||||
|
" top_p=0.95,\n",
|
||||||
|
")\n",
|
||||||
|
"x_fine_gen = generate_fine(\n",
|
||||||
|
" x_coarse_gen,\n",
|
||||||
|
" history_prompt=voice_name,\n",
|
||||||
|
" temp=0.5,\n",
|
||||||
|
")\n",
|
||||||
|
"audio_array = codec_decode(x_fine_gen)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
Reference in New Issue
Block a user