diff --git a/clone_voice.ipynb b/clone_voice.ipynb index 5fab639..a5e3391 100644 --- a/clone_voice.ipynb +++ b/clone_voice.ipynb @@ -9,10 +9,13 @@ "from bark.generation import load_codec_model, generate_text_semantic\n", "from encodec.utils import convert_audio\n", "\n", + "from transformers import BertTokenizer\n", + "\n", "import torchaudio\n", "import torch\n", "\n", - "model = load_codec_model(use_gpu=True)" + "model = load_codec_model(use_gpu=True)\n", + "tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\")" ] }, { @@ -59,7 +62,7 @@ "# get seconds of audio\n", "seconds = wav.shape[-1] / model.sample_rate\n", "# generate semantic tokens\n", - "semantic_tokens = generate_text_semantic(text, max_gen_duration_s=seconds)" + "semantic_tokens = generate_text_semantic(text, max_gen_duration_s=seconds, top_k=50, top_p=.95, temp=0.7)" ] }, { @@ -117,12 +120,49 @@ "source": [ "from bark.api import generate_audio\n", "from bark.generation import SAMPLE_RATE\n", - "text_prompt = \"\"\"\n", - " Hello, my name is Suno. And, uh — and I like pizza. [laughs] \n", - " But I also have other interests such as playing tic tac toe.\n", - "\"\"\"\n", - "voice_name = \"speaker_0\" # use your custom voice name here if you have one\n", - "audio_array = generate_audio(text_prompt, history_prompt=voice_name)" + "text_prompt = \"Hello, my name is Suno. And, uh — and I like pizza. [laughs]\"\n", + "voice_name = \"speaker_0\" # use your custom voice name here if you have one" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# simple generation\n", + "audio_array = generate_audio(text_prompt, history_prompt=voice_name, text_temp=0.7, waveform_temp=0.7)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# generation with more control\n", + "from bark.generation import codec_decode, generate_coarse, generate_fine, generate_text_semantic\n", + "x_semantic = generate_text_semantic(\n", + " text_prompt,\n", + " history_prompt=voice_name,\n", + " temp=0.7,\n", + " top_k=50,\n", + " top_p=0.95,\n", + ")\n", + "\n", + "x_coarse_gen = generate_coarse(\n", + " x_semantic,\n", + " history_prompt=voice_name,\n", + " temp=0.7,\n", + " top_k=50,\n", + " top_p=0.95,\n", + ")\n", + "x_fine_gen = generate_fine(\n", + " x_coarse_gen,\n", + " history_prompt=voice_name,\n", + " temp=0.5,\n", + ")\n", + "audio_array = codec_decode(x_fine_gen)" ] }, { diff --git a/generate.ipynb b/generate.ipynb index 115506b..4f2bb89 100644 --- a/generate.ipynb +++ b/generate.ipynb @@ -8,12 +8,49 @@ "source": [ "from bark.api import generate_audio\n", "from bark.generation import SAMPLE_RATE\n", - "text_prompt = \"\"\"\n", - " Hello, my name is Suno. And, uh — and I like pizza. [laughs] \n", - " But I also have other interests such as playing tic tac toe.\n", - "\"\"\"\n", - "voice_name = \"speaker_0\" # use your custom voice name here if you have one\n", - "audio_array = generate_audio(text_prompt, history_prompt=voice_name)" + "text_prompt = \"Hello, my name is Suno. And, uh — and I like pizza. [laughs]\"\n", + "voice_name = \"speaker_0\" # use your custom voice name here if you have one" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# simple generation\n", + "audio_array = generate_audio(text_prompt, history_prompt=voice_name, text_temp=0.7, waveform_temp=0.7)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# generation with more control\n", + "from bark.generation import codec_decode, generate_coarse, generate_fine, generate_text_semantic\n", + "x_semantic = generate_text_semantic(\n", + " text_prompt,\n", + " history_prompt=voice_name,\n", + " temp=0.7,\n", + " top_k=50,\n", + " top_p=0.95,\n", + ")\n", + "\n", + "x_coarse_gen = generate_coarse(\n", + " x_semantic,\n", + " history_prompt=voice_name,\n", + " temp=0.7,\n", + " top_k=50,\n", + " top_p=0.95,\n", + ")\n", + "x_fine_gen = generate_fine(\n", + " x_coarse_gen,\n", + " history_prompt=voice_name,\n", + " temp=0.5,\n", + ")\n", + "audio_array = codec_decode(x_fine_gen)" ] }, {