mirror of
https://github.com/serp-ai/bark-with-voice-clone.git
synced 2025-12-14 18:57:56 +01:00
174 lines
5.6 KiB
Plaintext
174 lines
5.6 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"import os\n",
"\n",
"from IPython.display import Audio\n",
"from scipy.io.wavfile import write as write_wav\n",
"\n",
"from bark.api import generate_audio\n",
"from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Fine-tuned checkpoints passed to preload_models below.\n",
"# Set any of them to None if you don't want to use the finetuned model for that stage.\n",
"semantic_path = \"semantic_output/pytorch_model.bin\"\n",
"coarse_path = \"coarse_output/pytorch_model.bin\"\n",
"fine_path = \"fine_output/pytorch_model.bin\"\n",
"\n",
"# RVC settings\n",
"use_rvc = True  # Set to False to use bark without RVC\n",
"rvc_name = 'mi-test'\n",
"rvc_path = f\"Retrieval-based-Voice-Conversion-WebUI/weights/{rvc_name}.pth\"\n",
"index_path = f\"Retrieval-based-Voice-Conversion-WebUI/logs/{rvc_name}/added_IVF256_Flat_nprobe_1_{rvc_name}_v2.index\"\n",
"\n",
"# device and precision flag handed to get_vc when RVC is enabled\n",
"device, is_half = \"cuda:0\", True"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# download and load all models (one line per model stage)\n",
"preload_models(\n",
"    text_use_gpu=True, text_use_small=False, text_model_path=semantic_path,\n",
"    coarse_use_gpu=True, coarse_use_small=False, coarse_model_path=coarse_path,\n",
"    fine_use_gpu=True, fine_use_small=False, fine_model_path=fine_path,\n",
"    codec_use_gpu=True,\n",
"    force_reload=False,\n",
"    path=\"models\",\n",
")\n",
"\n",
"# rvc_infer is only imported and initialised when RVC is switched on in the config cell\n",
"if use_rvc:\n",
"    from rvc_infer import get_vc, vc_single\n",
"\n",
"    get_vc(rvc_path, device, is_half)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# simple generation\n",
"text_prompt = \"Hello, my name is Serpy. And, uh — and I like pizza. [laughs]\"\n",
"voice_name = \"speaker_0\" # use your custom voice name here if you have one\n",
"\n",
"filepath = \"output/audio.wav\"\n",
"\n",
"# make sure the output folder exists before any audio is written to it\n",
"output_dir = os.path.dirname(filepath)\n",
"if output_dir:\n",
"    os.makedirs(output_dir, exist_ok=True)\n",
"\n",
"audio_array = generate_audio(text_prompt, history_prompt=voice_name, text_temp=0.7, waveform_temp=0.7)\n",
"write_wav(filepath, SAMPLE_RATE, audio_array)\n",
"\n",
"if use_rvc:\n",
"    # settings forwarded to vc_single\n",
"    index_rate = 0.75\n",
"    f0up_key = -6\n",
"    filter_radius = 3\n",
"    rms_mix_rate = 0.25\n",
"    protect = 0.33\n",
"    resample_sr = SAMPLE_RATE\n",
"    f0method = \"harvest\" #harvest or pm\n",
"    try:\n",
"        audio_array = vc_single(\n",
"            0, filepath, f0up_key, None, f0method, index_path, index_rate,\n",
"            filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect,\n",
"        )\n",
"    except Exception as rvc_error:\n",
"        # Best-effort fallback to 'pm'. Catching Exception (not a bare except) keeps\n",
"        # KeyboardInterrupt/SystemExit working and lets us report why the first attempt failed.\n",
"        print(f\"RVC with f0method={f0method!r} failed ({rvc_error!r}); retrying with 'pm'\")\n",
"        audio_array = vc_single(\n",
"            0, filepath, f0up_key, None, 'pm', index_path, index_rate,\n",
"            filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect,\n",
"        )\n",
"    # overwrite the bark output on disk with the converted audio\n",
"    write_wav(filepath, SAMPLE_RATE, audio_array)\n",
"\n",
"Audio(audio_array, rate=SAMPLE_RATE)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# generation with more control\n",
"text_prompt = \"Hello, my name is Serpy. And, uh — and I like pizza. [laughs]\"\n",
"voice_name = \"speaker_0\" # use your custom voice name here if you have one\n",
"\n",
"filepath = \"output/audio.wav\"\n",
"\n",
"# make sure the output folder exists before any audio is written to it\n",
"output_dir = os.path.dirname(filepath)\n",
"if output_dir:\n",
"    os.makedirs(output_dir, exist_ok=True)\n",
"\n",
"# stage 1: text -> semantic tokens\n",
"x_semantic = generate_text_semantic(\n",
"    text_prompt,\n",
"    history_prompt=voice_name,\n",
"    temp=0.7,\n",
"    top_k=50,\n",
"    top_p=0.95,\n",
")\n",
"\n",
"# stage 2: semantic -> coarse tokens\n",
"x_coarse_gen = generate_coarse(\n",
"    x_semantic,\n",
"    history_prompt=voice_name,\n",
"    temp=0.7,\n",
"    top_k=50,\n",
"    top_p=0.95,\n",
")\n",
"\n",
"# stage 3: coarse -> fine tokens, then decode to a waveform\n",
"x_fine_gen = generate_fine(\n",
"    x_coarse_gen,\n",
"    history_prompt=voice_name,\n",
"    temp=0.5,\n",
")\n",
"audio_array = codec_decode(x_fine_gen)\n",
"write_wav(filepath, SAMPLE_RATE, audio_array)\n",
"\n",
"if use_rvc:\n",
"    # settings forwarded to vc_single\n",
"    index_rate = 0.75\n",
"    f0up_key = -6\n",
"    filter_radius = 3\n",
"    rms_mix_rate = 0.25\n",
"    protect = 0.33\n",
"    resample_sr = SAMPLE_RATE\n",
"    f0method = \"harvest\" #harvest or pm\n",
"    try:\n",
"        audio_array = vc_single(\n",
"            0, filepath, f0up_key, None, f0method, index_path, index_rate,\n",
"            filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect,\n",
"        )\n",
"    except Exception as rvc_error:\n",
"        # Best-effort fallback to 'pm'. Catching Exception (not a bare except) keeps\n",
"        # KeyboardInterrupt/SystemExit working and lets us report why the first attempt failed.\n",
"        print(f\"RVC with f0method={f0method!r} failed ({rvc_error!r}); retrying with 'pm'\")\n",
"        audio_array = vc_single(\n",
"            0, filepath, f0up_key, None, 'pm', index_path, index_rate,\n",
"            filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect,\n",
"        )\n",
"    # overwrite the bark output on disk with the converted audio\n",
"    write_wav(filepath, SAMPLE_RATE, audio_array)\n",
"\n",
"Audio(audio_array, rate=SAMPLE_RATE)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.8"
|
|
},
|
|
"orig_nbformat": 4
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|