Add RVC support

2025-12-16 03:38:01 +01:00 · 2023-07-19 19:12:27 -06:00
parent ff1e45fcb8
commit c87b3c81fb
10 changed files with 764 additions and 364 deletions
--- a/generate_chunked.ipynb
+++ b/generate_chunked.ipynb
@@ -6,9 +6,29 @@
   "metadata": {},
   "outputs": [],
   "source": [
+    "from IPython.display import Audio\n",
+    "from scipy.io.wavfile import write as write_wav\n",
+    "\n",
    "from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "semantic_path = \"semantic_output/pytorch_model.bin\" # set to None if you don't want to use finetuned semantic\n",
+    "coarse_path = \"coarse_output/pytorch_model.bin\" # set to None if you don't want to use finetuned coarse\n",
+    "fine_path = \"fine_output/pytorch_model.bin\" # set to None if you don't want to use finetuned fine\n",
+    "use_rvc = True # Set to False to use bark without RVC\n",
+    "rvc_name = 'mi-test'\n",
+    "rvc_path = f\"Retrieval-based-Voice-Conversion-WebUI/weights/{rvc_name}.pth\"\n",
+    "index_path = f\"Retrieval-based-Voice-Conversion-WebUI/logs/{rvc_name}/added_IVF256_Flat_nprobe_1_{rvc_name}_v2.index\" \n",
+    "device=\"cuda:0\"\n",
+    "is_half=True"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -139,6 +159,33 @@
    "# - `♪` for song lyrics"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# download and load all models\n",
+    "preload_models(\n",
+    "    text_use_gpu=True,\n",
+    "    text_use_small=False,\n",
+    "    text_model_path=semantic_path,\n",
+    "    coarse_use_gpu=True,\n",
+    "    coarse_use_small=False,\n",
+    "    coarse_model_path=coarse_path,\n",
+    "    fine_use_gpu=True,\n",
+    "    fine_use_small=False,\n",
+    "    fine_model_path=fine_path,\n",
+    "    codec_use_gpu=True,\n",
+    "    force_reload=False,\n",
+    "    path=\"models\"\n",
+    ")\n",
+    "\n",
+    "if use_rvc:\n",
+    "    from rvc_infer import get_vc, vc_single\n",
+    "    get_vc(rvc_path, device, is_half)"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -180,26 +227,6 @@
    "In conclusion, the human journey is one of discovery, driven by our innate curiosity and desire to understand the world around us. From the dawn of our species to the present day, we have continued to explore, learn, and adapt, pushing the boundaries of what is known and possible. As we continue to unravel the mysteries of the cosmos, our spirit.\"\"\""
   ]
  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# download and load all models\n",
-    "preload_models(\n",
-    "    text_use_gpu=True,\n",
-    "    text_use_small=False,\n",
-    "    coarse_use_gpu=True,\n",
-    "    coarse_use_small=False,\n",
-    "    fine_use_gpu=True,\n",
-    "    fine_use_small=False,\n",
-    "    codec_use_gpu=True,\n",
-    "    force_reload=False,\n",
-    "    path=\"models\"\n",
-    ")"
-   ]
-  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -215,7 +242,7 @@
    "import numpy as np\n",
    "\n",
    "# generation settings\n",
-    "voice_name = 'speaker_4'\n",
+    "voice_name = 'en_speaker_0'\n",
    "out_filepath = 'audio/audio.wav'\n",
    "\n",
    "semantic_temp = 0.7\n",
@@ -234,6 +261,15 @@
    "\n",
    "use_last_generation_as_history = True\n",
    "\n",
+    "if use_rvc:\n",
+    "    index_rate = 0.75\n",
+    "    f0up_key = -10\n",
+    "    filter_radius = 3\n",
+    "    rms_mix_rate = 0.25\n",
+    "    protect = 0.33\n",
+    "    resample_sr = SAMPLE_RATE\n",
+    "    f0method = \"harvest\" #harvest or pm\n",
+    "\n",
    "texts = split_and_recombine_text(text)\n",
    "\n",
    "all_parts = []\n",
@@ -263,6 +299,14 @@
    "            fine_prompt=full_generation['fine_prompt'],\n",
    "        )\n",
    "        voice_name = '_temp/history.npz'\n",
+    "    write_wav(out_filepath.replace('.wav', f'_{i}') + '.wav', SAMPLE_RATE, audio_array)  # per-chunk file; same path is handed to vc_single below\n",
+    "\n",
+    "    if use_rvc:\n",
+    "        try:\n",
+    "            audio_array = vc_single(0,out_filepath.replace('.wav', f'_{i}') + '.wav',f0up_key,None,f0method,index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
+    "        except Exception:  # configured f0method failed; retry once with 'pm'. Not a bare except, so Ctrl-C still interrupts the run\n",
+    "            audio_array = vc_single(0,out_filepath.replace('.wav', f'_{i}') + '.wav',f0up_key,None,'pm',index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
+    "        write_wav(out_filepath.replace('.wav', f'_{i}') + '.wav', SAMPLE_RATE, audio_array)  # overwrite the chunk file with the converted audio\n",
    "    all_parts.append(audio_array)\n",
    "\n",
    "audio_array = np.concatenate(all_parts, axis=-1)\n",