test_models.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bark.api import generate_audio\n",
    "from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Checkpoint files are laid out as <root>/<stage>_output/pytorch_model.bin.\n",
    "# NOTE(review): the root below is a machine-specific absolute path; adjust it to your setup.\n",
    "_checkpoint_root = \"E:/Python/bark-with-voice-clone\"\n",
    "semantic_path = f\"{_checkpoint_root}/semantic_output/pytorch_model.bin\"\n",
    "coarse_path = f\"{_checkpoint_root}/coarse_output/pytorch_model.bin\"\n",
    "fine_path = f\"{_checkpoint_root}/fine_output/pytorch_model.bin\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preload the text, coarse and fine models from the checkpoint paths set in the previous cell.\n",
    "# Every *_use_gpu flag is switched on and every *_use_small flag is switched off.\n",
    "preload_models(\n",
    "    # checkpoint locations\n",
    "    text_model_path=semantic_path,\n",
    "    coarse_model_path=coarse_path,\n",
    "    fine_model_path=fine_path,\n",
    "    # device flags\n",
    "    text_use_gpu=True,\n",
    "    coarse_use_gpu=True,\n",
    "    fine_use_gpu=True,\n",
    "    codec_use_gpu=True,\n",
    "    # model size flags\n",
    "    text_use_small=False,\n",
    "    coarse_use_small=False,\n",
    "    fine_use_small=False,\n",
    "    # loading options\n",
    "    force_reload=False,\n",
    "    path=\"models\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# simple generation through the high-level API, without a history prompt\n",
    "text_prompt = \"I am Joe Biden... and this is the finetuned semantic, coarse and fine model! [laughs] A lot better than the original!\"\n",
    "audio_array = generate_audio(\n",
    "    text_prompt,\n",
    "    history_prompt=None,\n",
    "    text_temp=0.7,\n",
    "    waveform_temp=0.7,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.display import Audio\n",
    "\n",
    "# play audio: the Audio object is the cell's last expression, so the notebook displays it\n",
    "Audio(data=audio_array, rate=SAMPLE_RATE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "from scipy.io.wavfile import write as write_wav\n",
    "\n",
    "# save audio\n",
    "filepath = \"output/audio.wav\" # change this to your desired output path\n",
    "# write_wav opens the path directly and fails if the folder is missing, so create it first.\n",
    "# `or \".\"` covers a bare filename, whose dirname is the empty string.\n",
    "os.makedirs(os.path.dirname(filepath) or \".\", exist_ok=True)\n",
    "write_wav(filepath, SAMPLE_RATE, audio_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_with_settings(\n",
    "    text_prompt,\n",
    "    semantic_temp=0.7,\n",
    "    semantic_top_k=50,\n",
    "    semantic_top_p=0.95,\n",
    "    coarse_temp=0.7,\n",
    "    coarse_top_k=50,\n",
    "    coarse_top_p=0.95,\n",
    "    fine_temp=0.5,\n",
    "    voice_name=None,\n",
    "    use_semantic_history_prompt=True,\n",
    "    use_coarse_history_prompt=True,\n",
    "    use_fine_history_prompt=True,\n",
    "    output_full=False,\n",
    "):\n",
    "    \"\"\"Generate audio for `text_prompt` by running the semantic, coarse and fine generators in sequence.\n",
    "\n",
    "    Each stage receives its own sampling settings. `voice_name` is forwarded as a\n",
    "    stage's `history_prompt` only when the matching `use_*_history_prompt` flag is\n",
    "    truthy; otherwise that stage is called with `history_prompt=None`.\n",
    "\n",
    "    Args:\n",
    "        text_prompt: Text handed to `generate_text_semantic`.\n",
    "        semantic_temp, semantic_top_k, semantic_top_p: `temp`, `top_k` and `top_p`\n",
    "            passed to `generate_text_semantic`.\n",
    "        coarse_temp, coarse_top_k, coarse_top_p: `temp`, `top_k` and `top_p`\n",
    "            passed to `generate_coarse`.\n",
    "        fine_temp: `temp` passed to `generate_fine`.\n",
    "        voice_name: Value used as `history_prompt` for the enabled stages.\n",
    "        use_semantic_history_prompt, use_coarse_history_prompt, use_fine_history_prompt:\n",
    "            Per-stage switches deciding whether `voice_name` is forwarded.\n",
    "        output_full: When truthy, also return the intermediate stage outputs.\n",
    "\n",
    "    Returns:\n",
    "        The result of `codec_decode` applied to the fine-stage output. When\n",
    "        `output_full` is truthy, a tuple `(full_generation, audio)` instead, where\n",
    "        `full_generation` maps 'semantic_prompt', 'coarse_prompt' and 'fine_prompt'\n",
    "        to the respective stage outputs.\n",
    "    \"\"\"\n",
    "    def _history_for(enabled):\n",
    "        # A disabled stage gets no history prompt at all.\n",
    "        return voice_name if enabled else None\n",
    "\n",
    "    # Stage 1: text -> semantic output\n",
    "    semantic_out = generate_text_semantic(\n",
    "        text_prompt,\n",
    "        history_prompt=_history_for(use_semantic_history_prompt),\n",
    "        temp=semantic_temp,\n",
    "        top_k=semantic_top_k,\n",
    "        top_p=semantic_top_p,\n",
    "    )\n",
    "\n",
    "    # Stage 2: semantic output -> coarse output\n",
    "    coarse_out = generate_coarse(\n",
    "        semantic_out,\n",
    "        history_prompt=_history_for(use_coarse_history_prompt),\n",
    "        temp=coarse_temp,\n",
    "        top_k=coarse_top_k,\n",
    "        top_p=coarse_top_p,\n",
    "    )\n",
    "\n",
    "    # Stage 3: coarse output -> fine output\n",
    "    fine_out = generate_fine(\n",
    "        coarse_out,\n",
    "        history_prompt=_history_for(use_fine_history_prompt),\n",
    "        temp=fine_temp,\n",
    "    )\n",
    "\n",
    "    audio = codec_decode(fine_out)\n",
    "    if not output_full:\n",
    "        return audio\n",
    "\n",
    "    stage_outputs = {\n",
    "        'semantic_prompt': semantic_out,\n",
    "        'coarse_prompt': coarse_out,\n",
    "        'fine_prompt': fine_out,\n",
    "    }\n",
    "    return stage_outputs, audio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.display import Audio\n",
    "\n",
    "text_prompt = \"I am Joe Biden... and this is the finetuned semantic, coarse and fine model! [laughs] A lot better than the original!\"\n",
    "\n",
    "# Every setting is spelled out so it can be tweaked in place.\n",
    "audio_array = generate_with_settings(\n",
    "    text_prompt,\n",
    "    # semantic stage\n",
    "    semantic_temp=0.7,\n",
    "    semantic_top_k=50,\n",
    "    semantic_top_p=0.99,\n",
    "    # coarse stage\n",
    "    coarse_temp=0.7,\n",
    "    coarse_top_k=50,\n",
    "    coarse_top_p=0.99,\n",
    "    # fine stage\n",
    "    fine_temp=0.5,\n",
    "    # with voice_name=None every stage gets history_prompt=None, whatever the flags say\n",
    "    voice_name=None,\n",
    "    use_semantic_history_prompt=True,\n",
    "    use_coarse_history_prompt=True,\n",
    "    use_fine_history_prompt=True,\n",
    "    output_full=False,\n",
    ")\n",
    "\n",
    "# play audio\n",
    "Audio(data=audio_array, rate=SAMPLE_RATE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "from scipy.io.wavfile import write as write_wav\n",
    "\n",
    "# save audio\n",
    "# NOTE: this is the same path the earlier save cell writes to, so that file gets overwritten.\n",
    "filepath = \"output/audio.wav\" # change this to your desired output path\n",
    "# write_wav opens the path directly and fails if the folder is missing, so create it first.\n",
    "# `or \".\"` covers a bare filename, whose dirname is the empty string.\n",
    "os.makedirs(os.path.dirname(filepath) or \".\", exist_ok=True)\n",
    "write_wav(filepath, SAMPLE_RATE, audio_array)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
Add v1 finetune support 2023-06-29 21:48:18 -06:00			`{`
			`"cells": [`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"from bark.api import generate_audio\n",`
			`"from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"semantic_path = \"E:/Python/bark-with-voice-clone/semantic_output/pytorch_model.bin\"\n",`
			`"coarse_path = \"E:/Python/bark-with-voice-clone/coarse_output/pytorch_model.bin\"\n",`
			`"fine_path = \"E:/Python/bark-with-voice-clone/fine_output/pytorch_model.bin\""`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"preload_models(\n",`
			`" text_use_gpu=True,\n",`
			`" text_use_small=False,\n",`
			`" text_model_path=semantic_path,\n",`
			`" coarse_use_gpu=True,\n",`
			`" coarse_use_small=False,\n",`
			`" coarse_model_path=coarse_path,\n",`
			`" fine_use_gpu=True,\n",`
			`" fine_use_small=False,\n",`
			`" fine_model_path=fine_path,\n",`
			`" codec_use_gpu=True,\n",`
			`" force_reload=False,\n",`
			`" path=\"models\"\n",`
			`")"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# simple generation\n",`
			`"text_prompt = \"I am Joe Biden... and this is the finetuned semantic, coarse and fine model! [laughs] A lot better than the original!\"\n",`
			`"audio_array = generate_audio(text_prompt, history_prompt=None, text_temp=0.7, waveform_temp=0.7)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"from IPython.display import Audio\n",`
			`"# play audio\n",`
			`"Audio(audio_array, rate=SAMPLE_RATE)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"from scipy.io.wavfile import write as write_wav\n",`
			`"# save audio\n",`
			`"filepath = \"output/audio.wav\" # change this to your desired output path\n",`
			`"write_wav(filepath, SAMPLE_RATE, audio_array)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def generate_with_settings(text_prompt, semantic_temp=0.7, semantic_top_k=50, semantic_top_p=0.95, coarse_temp=0.7, coarse_top_k=50, coarse_top_p=0.95, fine_temp=0.5, voice_name=None, use_semantic_history_prompt=True, use_coarse_history_prompt=True, use_fine_history_prompt=True, output_full=False):\n",`
			`" # generation with more control\n",`
			`" x_semantic = generate_text_semantic(\n",`
			`" text_prompt,\n",`
			`" history_prompt=voice_name if use_semantic_history_prompt else None,\n",`
			`" temp=semantic_temp,\n",`
			`" top_k=semantic_top_k,\n",`
			`" top_p=semantic_top_p,\n",`
			`" )\n",`
			`"\n",`
			`" x_coarse_gen = generate_coarse(\n",`
			`" x_semantic,\n",`
			`" history_prompt=voice_name if use_coarse_history_prompt else None,\n",`
			`" temp=coarse_temp,\n",`
			`" top_k=coarse_top_k,\n",`
			`" top_p=coarse_top_p,\n",`
			`" )\n",`
			`" x_fine_gen = generate_fine(\n",`
			`" x_coarse_gen,\n",`
			`" history_prompt=voice_name if use_fine_history_prompt else None,\n",`
			`" temp=fine_temp,\n",`
			`" )\n",`
			`"\n",`
			`" if output_full:\n",`
			`" full_generation = {\n",`
			`" 'semantic_prompt': x_semantic,\n",`
			`" 'coarse_prompt': x_coarse_gen,\n",`
			`" 'fine_prompt': x_fine_gen,\n",`
			`" }\n",`
			`" return full_generation, codec_decode(x_fine_gen)\n",`
			`" return codec_decode(x_fine_gen)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"text_prompt = \"I am Joe Biden... and this is the finetuned semantic, coarse and fine model! [laughs] A lot better than the original!\"\n",`
			`"\n",`
			`"audio_array = generate_with_settings(\n",`
			`" text_prompt,\n",`
			`" semantic_temp=0.7,\n",`
			`" semantic_top_k=50,\n",`
			`" semantic_top_p=0.99,\n",`
			`" coarse_temp=0.7,\n",`
			`" coarse_top_k=50,\n",`
			`" coarse_top_p=0.99,\n",`
			`" fine_temp=0.5,\n",`
			`" voice_name=None,\n",`
			`" use_semantic_history_prompt=True,\n",`
			`" use_coarse_history_prompt=True,\n",`
			`" use_fine_history_prompt=True,\n",`
			`" output_full=False\n",`
			`")\n",`
			`"\n",`
			`"from IPython.display import Audio\n",`
			`"# play audio\n",`
			`"Audio(audio_array, rate=SAMPLE_RATE)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"from scipy.io.wavfile import write as write_wav\n",`
			`"# save audio\n",`
			`"filepath = \"output/audio.wav\" # change this to your desired output path\n",`
			`"write_wav(filepath, SAMPLE_RATE, audio_array)"`
			`]`
			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": "Python 3",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
			`"version": "3.10.8"`
			`},`
			`"orig_nbformat": 4`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 2`
			`}`