Add RVC support

2025-12-16 11:48:09 +01:00 · 2023-07-19 19:12:27 -06:00
parent ff1e45fcb8
commit c87b3c81fb
10 changed files with 764 additions and 364 deletions
--- a/generate.ipynb
+++ b/generate.ipynb
@@ -2,25 +2,37 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
+    "from IPython.display import Audio\n",
+    "from scipy.io.wavfile import write as write_wav\n",
+    "\n",
    "from bark.api import generate_audio\n",
-    "from transformers import BertTokenizer\n",
-    "from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic\n",
-    "\n",
-    "# Enter your prompt and speaker here\n",
-    "text_prompt = \"Hello, my name is Serpy. And, uh — and I like pizza. [laughs]\"\n",
-    "voice_name = \"speaker_0\" # use your custom voice name here if you have one\n",
-    "\n",
-    "# load the tokenizer\n",
-    "tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\")"
+    "from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "semantic_path = \"semantic_output/pytorch_model.bin\" # set to None if you don't want to use finetuned semantic\n",
+    "coarse_path = \"coarse_output/pytorch_model.bin\" # set to None if you don't want to use finetuned coarse\n",
+    "fine_path = \"fine_output/pytorch_model.bin\" # set to None if you don't want to use finetuned fine\n",
+    "use_rvc = True # Set to False to use bark without RVC\n",
+    "rvc_name = 'mi-test'\n",
+    "rvc_path = f\"Retrieval-based-Voice-Conversion-WebUI/weights/{rvc_name}.pth\"\n",
+    "index_path = f\"Retrieval-based-Voice-Conversion-WebUI/logs/{rvc_name}/added_IVF256_Flat_nprobe_1_{rvc_name}_v2.index\"\n",
+    "device=\"cuda:0\"\n",
+    "is_half=True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -28,14 +40,21 @@
    "preload_models(\n",
    "    text_use_gpu=True,\n",
    "    text_use_small=False,\n",
+    "    text_model_path=semantic_path,\n",
    "    coarse_use_gpu=True,\n",
    "    coarse_use_small=False,\n",
+    "    coarse_model_path=coarse_path,\n",
    "    fine_use_gpu=True,\n",
    "    fine_use_small=False,\n",
+    "    fine_model_path=fine_path,\n",
    "    codec_use_gpu=True,\n",
    "    force_reload=False,\n",
    "    path=\"models\"\n",
-    ")"
+    ")\n",
+    "\n",
+    "if use_rvc:\n",
+    "    from rvc_infer import get_vc, vc_single\n",
+    "    get_vc(rvc_path, device, is_half)"
   ]
  },
  {
@@ -45,7 +64,28 @@
   "outputs": [],
   "source": [
    "# simple generation\n",
-    "audio_array = generate_audio(text_prompt, history_prompt=voice_name, text_temp=0.7, waveform_temp=0.7)"
+    "text_prompt = \"Hello, my name is Serpy. And, uh — and I like pizza. [laughs]\"\n",
+    "voice_name = \"speaker_0\" # use your custom voice name here if you have one\n",
+    "\n",
+    "filepath = \"output/audio.wav\"\n",
+    "audio_array = generate_audio(text_prompt, history_prompt=voice_name, text_temp=0.7, waveform_temp=0.7)\n",
+    "write_wav(filepath, SAMPLE_RATE, audio_array)\n",
+    "\n",
+    "if use_rvc:\n",
+    "    index_rate = 0.75\n",
+    "    f0up_key = -6\n",
+    "    filter_radius = 3\n",
+    "    rms_mix_rate = 0.25\n",
+    "    protect = 0.33\n",
+    "    resample_sr = SAMPLE_RATE\n",
+    "    f0method = \"harvest\" #harvest or pm\n",
+    "    try:\n",
+    "        audio_array = vc_single(0,filepath,f0up_key,None,f0method,index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
+    "    except Exception:  # f0method failed: retry with 'pm'; not a bare except, so Ctrl-C still interrupts\n",
+    "        audio_array = vc_single(0,filepath,f0up_key,None,'pm',index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
+    "    write_wav(filepath, SAMPLE_RATE, audio_array)\n",
+    "\n",
+    "Audio(audio_array, rate=SAMPLE_RATE)"
   ]
  },
  {
@@ -55,6 +95,11 @@
   "outputs": [],
   "source": [
    "# generation with more control\n",
+    "text_prompt = \"Hello, my name is Serpy. And, uh — and I like pizza. [laughs]\"\n",
+    "voice_name = \"speaker_0\" # use your custom voice name here if you have one\n",
+    "\n",
+    "filepath = \"output/audio.wav\"\n",
+    "\n",
    "x_semantic = generate_text_semantic(\n",
    "    text_prompt,\n",
    "    history_prompt=voice_name,\n",
@@ -75,32 +120,26 @@
    "    history_prompt=voice_name,\n",
    "    temp=0.5,\n",
    ")\n",
-    "audio_array = codec_decode(x_fine_gen)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from IPython.display import Audio\n",
-    "# play audio\n",
+    "audio_array = codec_decode(x_fine_gen)\n",
+    "write_wav(filepath, SAMPLE_RATE, audio_array)\n",
+    "\n",
+    "if use_rvc:\n",
+    "    index_rate = 0.75\n",
+    "    f0up_key = -6\n",
+    "    filter_radius = 3\n",
+    "    rms_mix_rate = 0.25\n",
+    "    protect = 0.33\n",
+    "    resample_sr = SAMPLE_RATE\n",
+    "    f0method = \"harvest\" #harvest or pm\n",
+    "    try:\n",
+    "        audio_array = vc_single(0,filepath,f0up_key,None,f0method,index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
+    "    except Exception:  # f0method failed: retry with 'pm'; not a bare except, so Ctrl-C still interrupts\n",
+    "        audio_array = vc_single(0,filepath,f0up_key,None,'pm',index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
+    "    write_wav(filepath, SAMPLE_RATE, audio_array)\n",
+    "\n",
    "Audio(audio_array, rate=SAMPLE_RATE)"
   ]
  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from scipy.io.wavfile import write as write_wav\n",
-    "# save audio\n",
-    "filepath = \"/output/audio.wav\" # change this to your desired output path\n",
-    "write_wav(filepath, SAMPLE_RATE, audio_array)"
-   ]
-  },
  {
   "cell_type": "code",
   "execution_count": null,