Add RVC support

2025-12-14 18:57:56 +01:00 · 2023-07-19 19:12:27 -06:00
parent ff1e45fcb8
commit c87b3c81fb
10 changed files with 764 additions and 364 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -5,4 +5,8 @@ models/
 wandb/
 *_output/
 output.npz
-joe_biden_state_of_union/
+joe_biden_state_of_union/
+Retrieval-based-Voice-Conversion-WebUI/
+devin-youtube/
+train_rvc.ipynb
+*.pt
--- a/bark/generation.py
+++ b/bark/generation.py
@@ -420,7 +420,10 @@ def generate_text_semantic(
    assert len(text.strip()) > 0
    if history_prompt is not None:
        if history_prompt.endswith(".npz"):
-            semantic_history = np.load(history_prompt)["semantic_prompt"]
+            try:
+                semantic_history = np.load(history_prompt)["semantic_prompt"]
+            except:
+                semantic_history = np.load(history_prompt)["semantic"]
        else:
            semantic_history = np.load(
                os.path.join(CUR_PATH, "assets", "prompts", f"{history_prompt}.npz")
@@ -592,8 +595,12 @@ def generate_coarse(
            x_history = np.load(
                os.path.join(CUR_PATH, "assets", "prompts", f"{history_prompt}.npz")
            )
-        x_semantic_history = x_history["semantic_prompt"]
-        x_coarse_history = x_history["coarse_prompt"]
+        try:
+            x_semantic_history = x_history["semantic_prompt"]
+            x_coarse_history = x_history["coarse_prompt"]
+        except:
+            x_semantic_history = x_history["semantic"]
+            x_coarse_history = x_history["coarse"]
        assert (
            isinstance(x_semantic_history, np.ndarray)
            and len(x_semantic_history.shape) == 1
@@ -750,7 +757,10 @@ def generate_fine(
    )
    if history_prompt is not None:
        if history_prompt.endswith(".npz"):
-            x_fine_history = np.load(history_prompt)["fine_prompt"]
+            try:
+                x_fine_history = np.load(history_prompt)["fine_prompt"]
+            except:
+                x_fine_history = np.load(history_prompt)["fine"]
        else:
            x_fine_history = np.load(
                os.path.join(CUR_PATH, "assets", "prompts", f"{history_prompt}.npz")
--- a/generate.ipynb
+++ b/generate.ipynb
@@ -2,25 +2,37 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
+    "from IPython.display import Audio\n",
+    "from scipy.io.wavfile import write as write_wav\n",
+    "\n",
    "from bark.api import generate_audio\n",
-    "from transformers import BertTokenizer\n",
-    "from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic\n",
-    "\n",
-    "# Enter your prompt and speaker here\n",
-    "text_prompt = \"Hello, my name is Serpy. And, uh — and I like pizza. [laughs]\"\n",
-    "voice_name = \"speaker_0\" # use your custom voice name here if you have one\n",
-    "\n",
-    "# load the tokenizer\n",
-    "tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\")"
+    "from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "semantic_path = \"semantic_output/pytorch_model.bin\" # set to None if you don't want to use finetuned semantic\n",
+    "coarse_path = \"coarse_output/pytorch_model.bin\" # set to None if you don't want to use finetuned coarse\n",
+    "fine_path = \"fine_output/pytorch_model.bin\" # set to None if you don't want to use finetuned fine\n",
+    "use_rvc = True # Set to False to use bark without RVC\n",
+    "rvc_name = 'mi-test'\n",
+    "rvc_path = f\"Retrieval-based-Voice-Conversion-WebUI/weights/{rvc_name}.pth\"\n",
+    "index_path = f\"Retrieval-based-Voice-Conversion-WebUI/logs/{rvc_name}/added_IVF256_Flat_nprobe_1_{rvc_name}_v2.index\"\n",
+    "device=\"cuda:0\"\n",
+    "is_half=True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -28,14 +40,21 @@
    "preload_models(\n",
    "    text_use_gpu=True,\n",
    "    text_use_small=False,\n",
+    "    text_model_path=semantic_path,\n",
    "    coarse_use_gpu=True,\n",
    "    coarse_use_small=False,\n",
+    "    coarse_model_path=coarse_path,\n",
    "    fine_use_gpu=True,\n",
    "    fine_use_small=False,\n",
+    "    fine_model_path=fine_path,\n",
    "    codec_use_gpu=True,\n",
    "    force_reload=False,\n",
    "    path=\"models\"\n",
-    ")"
+    ")\n",
+    "\n",
+    "if use_rvc:\n",
+    "    from rvc_infer import get_vc, vc_single\n",
+    "    get_vc(rvc_path, device, is_half)"
   ]
  },
  {
@@ -45,7 +64,28 @@
   "outputs": [],
   "source": [
    "# simple generation\n",
-    "audio_array = generate_audio(text_prompt, history_prompt=voice_name, text_temp=0.7, waveform_temp=0.7)"
+    "text_prompt = \"Hello, my name is Serpy. And, uh — and I like pizza. [laughs]\"\n",
+    "voice_name = \"speaker_0\" # use your custom voice name here if you have one\n",
+    "\n",
+    "filepath = \"output/audio.wav\"\n",
+    "audio_array = generate_audio(text_prompt, history_prompt=voice_name, text_temp=0.7, waveform_temp=0.7)\n",
+    "write_wav(filepath, SAMPLE_RATE, audio_array)\n",
+    "\n",
+    "if use_rvc:\n",
+    "    index_rate = 0.75\n",
+    "    f0up_key = -6\n",
+    "    filter_radius = 3\n",
+    "    rms_mix_rate = 0.25\n",
+    "    protect = 0.33\n",
+    "    resample_sr = SAMPLE_RATE\n",
+    "    f0method = \"harvest\" #harvest or pm\n",
+    "    try:\n",
+    "        audio_array = vc_single(0,filepath,f0up_key,None,f0method,index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
+    "    except:\n",
+    "        audio_array = vc_single(0,filepath,f0up_key,None,'pm',index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
+    "    write_wav(filepath, SAMPLE_RATE, audio_array)\n",
+    "\n",
+    "Audio(audio_array, rate=SAMPLE_RATE)"
   ]
  },
  {
@@ -55,6 +95,11 @@
   "outputs": [],
   "source": [
    "# generation with more control\n",
+    "text_prompt = \"Hello, my name is Serpy. And, uh — and I like pizza. [laughs]\"\n",
+    "voice_name = \"speaker_0\" # use your custom voice name here if you have one\n",
+    "\n",
+    "filepath = \"output/audio.wav\"\n",
+    "\n",
    "x_semantic = generate_text_semantic(\n",
    "    text_prompt,\n",
    "    history_prompt=voice_name,\n",
@@ -75,32 +120,26 @@
    "    history_prompt=voice_name,\n",
    "    temp=0.5,\n",
    ")\n",
-    "audio_array = codec_decode(x_fine_gen)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from IPython.display import Audio\n",
-    "# play audio\n",
+    "audio_array = codec_decode(x_fine_gen)\n",
+    "write_wav(filepath, SAMPLE_RATE, audio_array)\n",
+    "\n",
+    "if use_rvc:\n",
+    "    index_rate = 0.75\n",
+    "    f0up_key = -6\n",
+    "    filter_radius = 3\n",
+    "    rms_mix_rate = 0.25\n",
+    "    protect = 0.33\n",
+    "    resample_sr = SAMPLE_RATE\n",
+    "    f0method = \"harvest\" #harvest or pm\n",
+    "    try:\n",
+    "        audio_array = vc_single(0,filepath,f0up_key,None,f0method,index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
+    "    except:\n",
+    "        audio_array = vc_single(0,filepath,f0up_key,None,'pm',index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
+    "    write_wav(filepath, SAMPLE_RATE, audio_array)\n",
+    "\n",
    "Audio(audio_array, rate=SAMPLE_RATE)"
   ]
  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from scipy.io.wavfile import write as write_wav\n",
-    "# save audio\n",
-    "filepath = \"/output/audio.wav\" # change this to your desired output path\n",
-    "write_wav(filepath, SAMPLE_RATE, audio_array)"
-   ]
-  },
  {
   "cell_type": "code",
   "execution_count": null,
--- a/generate_chunked.ipynb
+++ b/generate_chunked.ipynb
@@ -6,9 +6,29 @@
   "metadata": {},
   "outputs": [],
   "source": [
+    "from IPython.display import Audio\n",
+    "from scipy.io.wavfile import write as write_wav\n",
+    "\n",
    "from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "semantic_path = \"semantic_output/pytorch_model.bin\" # set to None if you don't want to use finetuned semantic\n",
+    "coarse_path = \"coarse_output/pytorch_model.bin\" # set to None if you don't want to use finetuned coarse\n",
+    "fine_path = \"fine_output/pytorch_model.bin\" # set to None if you don't want to use finetuned fine\n",
+    "use_rvc = True # Set to False to use bark without RVC\n",
+    "rvc_name = 'mi-test'\n",
+    "rvc_path = f\"Retrieval-based-Voice-Conversion-WebUI/weights/{rvc_name}.pth\"\n",
+    "index_path = f\"Retrieval-based-Voice-Conversion-WebUI/logs/{rvc_name}/added_IVF256_Flat_nprobe_1_{rvc_name}_v2.index\" \n",
+    "device=\"cuda:0\"\n",
+    "is_half=True"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -139,6 +159,33 @@
    "# - `♪` for song lyrics"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# download and load all models\n",
+    "preload_models(\n",
+    "    text_use_gpu=True,\n",
+    "    text_use_small=False,\n",
+    "    text_model_path=semantic_path,\n",
+    "    coarse_use_gpu=True,\n",
+    "    coarse_use_small=False,\n",
+    "    coarse_model_path=coarse_path,\n",
+    "    fine_use_gpu=True,\n",
+    "    fine_use_small=False,\n",
+    "    fine_model_path=fine_path,\n",
+    "    codec_use_gpu=True,\n",
+    "    force_reload=False,\n",
+    "    path=\"models\"\n",
+    ")\n",
+    "\n",
+    "if use_rvc:\n",
+    "    from rvc_infer import get_vc, vc_single\n",
+    "    get_vc(rvc_path, device, is_half)"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -180,26 +227,6 @@
    "In conclusion, the human journey is one of discovery, driven by our innate curiosity and desire to understand the world around us. From the dawn of our species to the present day, we have continued to explore, learn, and adapt, pushing the boundaries of what is known and possible. As we continue to unravel the mysteries of the cosmos, our spirit.\"\"\""
   ]
  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# download and load all models\n",
-    "preload_models(\n",
-    "    text_use_gpu=True,\n",
-    "    text_use_small=False,\n",
-    "    coarse_use_gpu=True,\n",
-    "    coarse_use_small=False,\n",
-    "    fine_use_gpu=True,\n",
-    "    fine_use_small=False,\n",
-    "    codec_use_gpu=True,\n",
-    "    force_reload=False,\n",
-    "    path=\"models\"\n",
-    ")"
-   ]
-  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -215,7 +242,7 @@
    "import numpy as np\n",
    "\n",
    "# generation settings\n",
-    "voice_name = 'speaker_4'\n",
+    "voice_name = 'en_speaker_0'\n",
    "out_filepath = 'audio/audio.wav'\n",
    "\n",
    "semantic_temp = 0.7\n",
@@ -234,6 +261,15 @@
    "\n",
    "use_last_generation_as_history = True\n",
    "\n",
+    "if use_rvc:\n",
+    "    index_rate = 0.75\n",
+    "    f0up_key = -10\n",
+    "    filter_radius = 3\n",
+    "    rms_mix_rate = 0.25\n",
+    "    protect = 0.33\n",
+    "    resample_sr = SAMPLE_RATE\n",
+    "    f0method = \"harvest\" #harvest or pm\n",
+    "\n",
    "texts = split_and_recombine_text(text)\n",
    "\n",
    "all_parts = []\n",
@@ -263,6 +299,14 @@
    "            fine_prompt=full_generation['fine_prompt'],\n",
    "        )\n",
    "        voice_name = '_temp/history.npz'\n",
+    "    write_wav(out_filepath.replace('.wav', f'_{i}') + '.wav', SAMPLE_RATE, audio_array)\n",
+    "\n",
+    "    if use_rvc:\n",
+    "        try:\n",
+    "            audio_array = vc_single(0,out_filepath.replace('.wav', f'_{i}') + '.wav',f0up_key,None,f0method,index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
+    "        except:\n",
+    "            audio_array = vc_single(0,out_filepath.replace('.wav', f'_{i}') + '.wav',f0up_key,None,'pm',index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
+    "        write_wav(out_filepath.replace('.wav', f'_{i}') + '.wav', SAMPLE_RATE, audio_array)\n",
    "    all_parts.append(audio_array)\n",
    "\n",
    "audio_array = np.concatenate(all_parts, axis=-1)\n",
--- a/rvc_infer.py
+++ b/rvc_infer.py
@@ -0,0 +1,169 @@
+import os,sys,pdb,torch
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+import argparse
+import glob
+import sys
+import torch
+from multiprocessing import cpu_count
+import ffmpeg
+import numpy as np
+
+
+def load_audio(file, sr):
+    """Decode an audio file into a mono float32 waveform using the ffmpeg CLI.
+
+    Args:
+        file: Path to the audio file. Spaces, double quotes and newlines
+            around the path (typical of a copy-pasted path) are removed.
+        sr: Sample rate handed to ffmpeg as the output rate (``ar``).
+
+    Returns:
+        A 1-D ``np.float32`` array read from ffmpeg's raw ``f32le`` output.
+
+    Raises:
+        RuntimeError: If cleaning the path or running ffmpeg fails. The
+            original exception is chained as ``__cause__``.
+    """
+    try:
+        # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
+        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
+        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
+        # Strip any mix of spaces, quotes and newlines from both ends in a
+        # single pass so the result does not depend on their order.
+        file = file.strip(' "\n')
+        out, _ = (
+            ffmpeg.input(file, threads=0)
+            .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
+            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
+        )
+    except Exception as e:
+        # Chain the cause so the underlying ffmpeg error stays in the traceback.
+        raise RuntimeError(f"Failed to load audio: {e}") from e
+
+    return np.frombuffer(out, np.float32).flatten()
+
+
+class Config:
+    """Device and precision settings handed to the RVC ``VC`` pipeline.
+
+    After construction the instance exposes:
+      * ``device``  - device string; replaced by ``"mps"`` or ``"cpu"`` when
+        CUDA is not available.
+      * ``is_half`` - whether half precision is used; forced to ``False`` on
+        the GPUs matched in ``device_config`` and on the CPU fallback.
+      * ``n_cpu``   - CPU count reported by ``multiprocessing.cpu_count``.
+      * ``gpu_name`` / ``gpu_mem`` - filled in only on the CUDA path.
+      * ``x_pad``, ``x_query``, ``x_center``, ``x_max`` - window parameters
+        chosen from the precision and the detected GPU memory.
+    """
+
+    def __init__(self,device,is_half):
+        self.device = device
+        self.is_half = is_half
+        self.n_cpu = 0
+        self.gpu_name = None
+        self.gpu_mem = None
+        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
+
+    @staticmethod
+    def _replace_in_file(path, old, new):
+        """Replace ``old`` with ``new`` inside ``path``; skip missing files.
+
+        The paths rewritten by ``device_config`` are relative to the current
+        working directory. When this module is used from outside the RVC
+        checkout those files are absent, and skipping them keeps inference
+        from failing with ``FileNotFoundError``.
+        """
+        if not os.path.isfile(path):
+            return
+        with open(path, "r") as f:
+            text = f.read().replace(old, new)
+        with open(path, "w") as f:
+            f.write(text)
+
+    def device_config(self) -> tuple:
+        """Resolve device and precision and return the window parameters.
+
+        Side effects: may update ``device``, ``is_half``, ``gpu_name``,
+        ``gpu_mem`` and ``n_cpu``, and may rewrite RVC config files found in
+        the current working directory.
+
+        Returns:
+            The tuple ``(x_pad, x_query, x_center, x_max)``.
+        """
+        if torch.cuda.is_available():
+            i_device = int(self.device.split(":")[-1])
+            self.gpu_name = torch.cuda.get_device_name(i_device)
+            if (
+                ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
+                or "P40" in self.gpu_name.upper()
+                or "1060" in self.gpu_name
+                or "1070" in self.gpu_name
+                or "1080" in self.gpu_name
+            ):
+                # These cards are forced to single precision.
+                print("16系/10系显卡和P40强制单精度")
+                self.is_half = False
+                for config_file in ["32k.json", "40k.json", "48k.json"]:
+                    self._replace_in_file(f"configs/{config_file}", "true", "false")
+                self._replace_in_file(
+                    "trainset_preprocess_pipeline_print.py", "3.7", "3.0"
+                )
+            else:
+                self.gpu_name = None
+            # Total device memory in GiB; the +0.4 rounds values just under
+            # a whole number up before truncation.
+            self.gpu_mem = int(
+                torch.cuda.get_device_properties(i_device).total_memory
+                / 1024
+                / 1024
+                / 1024
+                + 0.4
+            )
+            if self.gpu_mem <= 4:
+                self._replace_in_file(
+                    "trainset_preprocess_pipeline_print.py", "3.7", "3.0"
+                )
+        elif torch.backends.mps.is_available():
+            print("没有发现支持的N卡, 使用MPS进行推理")
+            self.device = "mps"
+        else:
+            print("没有发现支持的N卡, 使用CPU进行推理")
+            self.device = "cpu"
+            # Run in float32 on the CPU fallback; the original forced half
+            # precision here, which is the wrong choice for CPU inference.
+            self.is_half = False
+
+        if self.n_cpu == 0:
+            self.n_cpu = cpu_count()
+
+        if self.is_half:
+            # Settings for roughly 6 GB of GPU memory.
+            x_pad = 3
+            x_query = 10
+            x_center = 60
+            x_max = 65
+        else:
+            # Settings for roughly 5 GB of GPU memory.
+            x_pad = 1
+            x_query = 6
+            x_center = 38
+            x_max = 41
+
+        # Small GPUs (<= 4 GiB) override both presets above.
+        if self.gpu_mem is not None and self.gpu_mem <= 4:
+            x_pad = 1
+            x_query = 5
+            x_center = 30
+            x_max = 32
+
+        return x_pad, x_query, x_center, x_max
+
+
+now_dir=os.getcwd()
+sys.path.append(now_dir)
+sys.path.append(os.path.join(now_dir,"Retrieval-based-Voice-Conversion-WebUI"))
+from vc_infer_pipeline import VC
+from lib.infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono, SynthesizerTrnMs768NSFsid, SynthesizerTrnMs768NSFsid_nono
+from fairseq import checkpoint_utils
+from scipy.io import wavfile
+
+hubert_model=None
+def load_hubert():
+    """Load ``hubert_base.pt`` into the module-level ``hubert_model``.
+
+    The checkpoint path is relative to the current working directory. The
+    model is moved to the module-level ``device``, cast to half or full
+    precision according to the module-level ``is_half``, and switched to
+    eval mode. Both globals are assigned by ``get_vc``, so that function
+    must have been called first.
+    """
+    global hubert_model
+    ensemble, _saved_cfg, _task = checkpoint_utils.load_model_ensemble_and_task(
+        ["hubert_base.pt"],
+        suffix="",
+    )
+    hubert_model = ensemble[0]
+    hubert_model = hubert_model.to(device)
+    hubert_model = hubert_model.half() if is_half else hubert_model.float()
+    hubert_model.eval()
+
+def vc_single(sid,input_audio,f0_up_key,f0_file,f0_method,file_index,index_rate,filter_radius=3,resample_sr=48000,rms_mix_rate=0.25, protect=0.33):
+    """Run voice conversion on one audio file with the model set up by ``get_vc``.
+
+    Args:
+        sid: Speaker id forwarded to ``vc.pipeline``.
+        input_audio: Path of the audio file to convert.
+        f0_up_key: Pitch shift; coerced to ``int`` before use.
+        f0_file: Optional f0 file forwarded to ``vc.pipeline``.
+        f0_method: Name of the pitch extraction method (the notebooks in
+            this repo pass ``"harvest"`` or ``"pm"``).
+        file_index: Path of the feature index file.
+        index_rate, filter_radius, resample_sr, rms_mix_rate, protect:
+            Forwarded unchanged to ``vc.pipeline``.
+
+    Returns:
+        Whatever ``vc.pipeline`` returns (the converted audio). When
+        ``input_audio`` is ``None`` the tuple
+        ``("You need to upload an audio", None)`` is returned instead.
+    """
+    global tgt_sr,net_g,vc,hubert_model
+    if input_audio is None:
+        return "You need to upload an audio", None
+    f0_up_key = int(f0_up_key)
+    # The input is always decoded at 16 kHz before entering the pipeline.
+    audio = load_audio(input_audio, 16000)
+    # Passed to the pipeline; presumably per-stage timing accumulators.
+    times = [0, 0, 0]
+    # HuBERT is loaded lazily on the first conversion.
+    if hubert_model is None:
+        load_hubert()
+    if_f0 = cpt.get("f0", 1)
+    # Same default as get_vc, so both agree when the checkpoint has no
+    # "version" entry.
+    version = cpt.get("version", "v2")
+    audio_opt = vc.pipeline(hubert_model,net_g,sid,audio,input_audio,times,f0_up_key,f0_method,file_index,index_rate,if_f0,filter_radius=filter_radius,tgt_sr=tgt_sr,resample_sr=resample_sr,rms_mix_rate=rms_mix_rate,version=version,protect=protect,f0_file=f0_file)
+    return audio_opt
+
+
+def get_vc(model_path, device_, is_half_):
+    """Load an RVC checkpoint and prepare the module-level inference state.
+
+    Sets the globals ``cpt``, ``tgt_sr``, ``net_g``, ``vc``, ``n_spk``,
+    ``device`` and ``is_half``, which ``vc_single`` and ``load_hubert`` read.
+
+    Args:
+        model_path: Path to the ``.pth`` checkpoint.
+        device_: Requested device string, e.g. ``"cuda:0"``.
+        is_half_: Whether half precision is requested.
+    """
+    global n_spk,tgt_sr,net_g,vc,cpt,device,is_half
+    config = Config(device_, is_half_)
+    # Config may override the request (forced fp32 on some GPUs, CPU/MPS
+    # fallback). Use the resolved values so the models end up on the same
+    # device and in the same precision as the VC pipeline built below.
+    device = config.device
+    is_half = config.is_half
+    print("loading pth %s"%model_path)
+    # NOTE(review): torch.load unpickles the file; only load checkpoints
+    # from trusted sources.
+    cpt = torch.load(model_path, map_location="cpu")
+    tgt_sr = cpt["config"][-1]
+    # Speaker count, taken from the speaker embedding table.
+    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
+    if_f0 = cpt.get("f0", 1)
+    version = cpt.get("version", "v2")
+    if if_f0 == 1:
+        if version == "v1":
+            net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half)
+        else:
+            net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=is_half)
+    else:
+        if version == "v1":
+            net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+        else:
+            net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+    # enc_q is deleted before loading weights; strict=False tolerates the
+    # resulting key mismatch.
+    del net_g.enc_q
+    # Original author's note: without this line the state is not fully cleaned up.
+    print(net_g.load_state_dict(cpt["weight"], strict=False))
+    net_g.eval().to(device)
+    if is_half:
+        net_g = net_g.half()
+    else:
+        net_g = net_g.float()
+    vc = VC(tgt_sr, config)
+    n_spk = cpt["config"][-3]
--- a/rvc_test.ipynb
+++ b/rvc_test.ipynb
@@ -0,0 +1,85 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from rvc_infer import get_vc, vc_single"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_path = \"Retrieval-based-Voice-Conversion-WebUI/weights/mi-test.pth\"\n",
+    "device=\"cuda:0\"\n",
+    "is_half=True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "get_vc(model_path, device, is_half)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "index_rate = 0.75\n",
+    "f0up_key = -6\n",
+    "filter_radius = 3\n",
+    "rms_mix_rate = 0.25\n",
+    "protect = 0.33\n",
+    "resample_sr = 48000\n",
+    "f0method = \"harvest\" #harvest or pm\n",
+    "input_path = \"output/audio.wav\"\n",
+    "index_path = \"Retrieval-based-Voice-Conversion-WebUI/logs/mi-test/added_IVF256_Flat_nprobe_1_mi-test_v2.index\"\n",
+    "\n",
+    "wav_opt = vc_single(0,input_path,f0up_key,None,f0method,index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from IPython.display import Audio\n",
+    "# play audio\n",
+    "Audio(wav_opt, rate=48000)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.8"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/test_models.ipynb
+++ b/test_models.ipynb
@@ -6,6 +6,9 @@
   "metadata": {},
   "outputs": [],
   "source": [
+    "from IPython.display import Audio\n",
+    "from scipy.io.wavfile import write as write_wav\n",
+    "\n",
    "from bark.api import generate_audio\n",
    "from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic"
   ]
@@ -16,9 +19,15 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "semantic_path = \"E:/Python/bark-with-voice-clone/semantic_output/pytorch_model.bin\"\n",
-    "coarse_path = \"E:/Python/bark-with-voice-clone/coarse_output/pytorch_model.bin\"\n",
-    "fine_path = \"E:/Python/bark-with-voice-clone/fine_output/pytorch_model.bin\""
+    "semantic_path = \"semantic_output/pytorch_model.bin\" # set to None if you don't want to use finetuned semantic\n",
+    "coarse_path = \"coarse_output/pytorch_model.bin\" # set to None if you don't want to use finetuned coarse\n",
+    "fine_path = \"fine_output/pytorch_model.bin\" # set to None if you don't want to use finetuned fine\n",
+    "use_rvc = True # Set to False to use bark without RVC\n",
+    "rvc_name = 'mi-test'\n",
+    "rvc_path = f\"Retrieval-based-Voice-Conversion-WebUI/weights/{rvc_name}.pth\"\n",
+    "index_path = f\"Retrieval-based-Voice-Conversion-WebUI/logs/{rvc_name}/added_IVF256_Flat_nprobe_1_{rvc_name}_v2.index\"\n",
+    "device=\"cuda:0\"\n",
+    "is_half=True"
   ]
  },
  {
@@ -40,7 +49,11 @@
    "    codec_use_gpu=True,\n",
    "    force_reload=False,\n",
    "    path=\"models\"\n",
-    ")"
+    ")\n",
+    "\n",
+    "if use_rvc:\n",
+    "    from rvc_infer import get_vc, vc_single\n",
+    "    get_vc(rvc_path, device, is_half)"
   ]
  },
  {
@@ -50,31 +63,26 @@
   "outputs": [],
   "source": [
    "# simple generation\n",
-    "text_prompt = \"I am Joe Biden... and this is the finetuned semantic, coarse and fine model! [laughs] A lot better than the original!\"\n",
-    "audio_array = generate_audio(text_prompt, history_prompt=None, text_temp=0.7, waveform_temp=0.7)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from IPython.display import Audio\n",
-    "# play audio\n",
-    "Audio(audio_array, rate=SAMPLE_RATE)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from scipy.io.wavfile import write as write_wav\n",
-    "# save audio\n",
+    "text_prompt = \"I am Joe Biden... and this is the finetuned semantic, coarse and fine model! A lot better than the original!\"\n",
    "filepath = \"output/audio.wav\" # change this to your desired output path\n",
-    "write_wav(filepath, SAMPLE_RATE, audio_array)"
+    "audio_array = generate_audio(text_prompt, history_prompt=None, text_temp=0.7, waveform_temp=0.7)\n",
+    "write_wav(filepath, SAMPLE_RATE, audio_array)\n",
+    "\n",
+    "if use_rvc:\n",
+    "    index_rate = 0.75\n",
+    "    f0up_key = -6\n",
+    "    filter_radius = 3\n",
+    "    rms_mix_rate = 0.25\n",
+    "    protect = 0.33\n",
+    "    resample_sr = SAMPLE_RATE\n",
+    "    f0method = \"harvest\" #harvest or pm\n",
+    "    try:\n",
+    "        audio_array = vc_single(0,filepath,f0up_key,None,f0method,index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
+    "    except:\n",
+    "        audio_array = vc_single(0,filepath,f0up_key,None,'pm',index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
+    "    write_wav(filepath, SAMPLE_RATE, audio_array)\n",
+    "\n",
+    "Audio(audio_array, rate=SAMPLE_RATE)"
   ]
  },
  {
@@ -122,7 +130,8 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "text_prompt = \"I am Joe Biden... and this is the finetuned semantic, coarse and fine model! [laughs] A lot better than the original!\"\n",
+    "text_prompt = \"I am Joe Biden... and this is the finetuned semantic, coarse and fine model! A lot better than the original!\"\n",
+    "filepath = \"output/audio.wav\" # change this to your desired output path\n",
    "\n",
    "audio_array = generate_with_settings(\n",
    "    text_prompt,\n",
@@ -131,16 +140,283 @@
    "    semantic_top_p=0.99,\n",
    "    coarse_temp=0.7,\n",
    "    coarse_top_k=50,\n",
-    "    coarse_top_p=0.99,\n",
+    "    coarse_top_p=0.95,\n",
    "    fine_temp=0.5,\n",
-    "    voice_name=None,\n",
-    "    use_semantic_history_prompt=True,\n",
+    "    voice_name=\"datasets/joe_biden_state_of_union/tokens/257.npz\",\n",
+    "    use_semantic_history_prompt=False,\n",
    "    use_coarse_history_prompt=True,\n",
    "    use_fine_history_prompt=True,\n",
    "    output_full=False\n",
    ")\n",
    "\n",
+    "write_wav(filepath, SAMPLE_RATE, audio_array)\n",
+    "\n",
+    "if use_rvc:\n",
+    "    index_rate = 0.75\n",
+    "    f0up_key = -6\n",
+    "    filter_radius = 3\n",
+    "    rms_mix_rate = 0.25\n",
+    "    protect = 0.33\n",
+    "    resample_sr = SAMPLE_RATE\n",
+    "    f0method = \"harvest\" #harvest or pm\n",
+    "    try:\n",
+    "        audio_array = vc_single(0,filepath,f0up_key,None,f0method,index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
+    "    except:\n",
+    "        audio_array = vc_single(0,filepath,f0up_key,None,'pm',index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
+    "    write_wav(filepath, SAMPLE_RATE, audio_array)\n",
+    "\n",
+    "Audio(audio_array, rate=SAMPLE_RATE)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "def split_and_recombine_text(text, desired_length=100, max_length=150):\n",
+    "    # from https://github.com/neonbjb/tortoise-tts\n",
+    "    \"\"\"Split text into chunks of a desired length, trying to keep sentences intact.\"\"\"\n",
+    "    # normalize text, remove redundant whitespace and convert non-ascii quotes to ascii\n",
+    "    text = re.sub(r\"\\n\\n+\", \"\\n\", text)\n",
+    "    text = re.sub(r\"\\s+\", \" \", text)\n",
+    "    text = re.sub(r\"[“”]\", '\"', text)\n",
+    "\n",
+    "    rv = []\n",
+    "    in_quote = False\n",
+    "    current = \"\"\n",
+    "    split_pos = []\n",
+    "    pos = -1\n",
+    "    end_pos = len(text) - 1\n",
+    "\n",
+    "    def seek(delta):\n",
+    "        nonlocal pos, in_quote, current\n",
+    "        is_neg = delta < 0\n",
+    "        for _ in range(abs(delta)):\n",
+    "            if is_neg:\n",
+    "                pos -= 1\n",
+    "                current = current[:-1]\n",
+    "            else:\n",
+    "                pos += 1\n",
+    "                current += text[pos]\n",
+    "            if text[pos] == '\"':\n",
+    "                in_quote = not in_quote\n",
+    "        return text[pos]\n",
+    "\n",
+    "    def peek(delta):\n",
+    "        p = pos + delta\n",
+    "        return text[p] if p < end_pos and p >= 0 else \"\"\n",
+    "\n",
+    "    def commit():\n",
+    "        nonlocal rv, current, split_pos\n",
+    "        rv.append(current)\n",
+    "        current = \"\"\n",
+    "        split_pos = []\n",
+    "\n",
+    "    while pos < end_pos:\n",
+    "        c = seek(1)\n",
+    "        # do we need to force a split?\n",
+    "        if len(current) >= max_length:\n",
+    "            if len(split_pos) > 0 and len(current) > (desired_length / 2):\n",
+    "                # we have at least one sentence and we are over half the desired length, seek back to the last split\n",
+    "                d = pos - split_pos[-1]\n",
+    "                seek(-d)\n",
+    "            else:\n",
+    "                # no full sentences, seek back until we are not in the middle of a word and split there\n",
+    "                while c not in \"!?.\\n \" and pos > 0 and len(current) > desired_length:\n",
+    "                    c = seek(-1)\n",
+    "            commit()\n",
+    "        # check for sentence boundaries\n",
+    "        elif not in_quote and (c in \"!?\\n\" or (c == \".\" and peek(1) in \"\\n \")):\n",
+    "            # seek forward if we have consecutive boundary markers but still within the max length\n",
+    "            while (\n",
+    "                pos < len(text) - 1 and len(current) < max_length and peek(1) in \"!?.\"\n",
+    "            ):\n",
+    "                c = seek(1)\n",
+    "            split_pos.append(pos)\n",
+    "            if len(current) >= desired_length:\n",
+    "                commit()\n",
+    "        # treat end of quote as a boundary if it's followed by a space or newline\n",
+    "        elif in_quote and peek(1) == '\"' and peek(2) in \"\\n \":\n",
+    "            seek(2)\n",
+    "            split_pos.append(pos)\n",
+    "    rv.append(current)\n",
+    "\n",
+    "    # clean up, remove lines with only whitespace or punctuation\n",
+    "    rv = [s.strip() for s in rv]\n",
+    "    rv = [s for s in rv if len(s) > 0 and not re.match(r\"^[\\s\\.,;:!?]*$\", s)]\n",
+    "\n",
+    "    return rv\n",
+    "\n",
+    "def generate_with_settings(text_prompt, semantic_temp=0.7, semantic_top_k=50, semantic_top_p=0.95, coarse_temp=0.7, coarse_top_k=50, coarse_top_p=0.95, fine_temp=0.5, voice_name=None, use_semantic_history_prompt=True, use_coarse_history_prompt=True, use_fine_history_prompt=True, output_full=False):\n",
+    "    # generation with more control\n",
+    "    x_semantic = generate_text_semantic(\n",
+    "        text_prompt,\n",
+    "        history_prompt=voice_name if use_semantic_history_prompt else None,\n",
+    "        temp=semantic_temp,\n",
+    "        top_k=semantic_top_k,\n",
+    "        top_p=semantic_top_p,\n",
+    "    )\n",
+    "\n",
+    "    x_coarse_gen = generate_coarse(\n",
+    "        x_semantic,\n",
+    "        history_prompt=voice_name if use_coarse_history_prompt else None,\n",
+    "        temp=coarse_temp,\n",
+    "        top_k=coarse_top_k,\n",
+    "        top_p=coarse_top_p,\n",
+    "    )\n",
+    "    x_fine_gen = generate_fine(\n",
+    "        x_coarse_gen,\n",
+    "        history_prompt=voice_name if use_fine_history_prompt else None,\n",
+    "        temp=fine_temp,\n",
+    "    )\n",
+    "\n",
+    "    if output_full:\n",
+    "        full_generation = {\n",
+    "            'semantic_prompt': x_semantic,\n",
+    "            'coarse_prompt': x_coarse_gen,\n",
+    "            'fine_prompt': x_fine_gen,\n",
+    "        }\n",
+    "        return full_generation, codec_decode(x_fine_gen)\n",
+    "    return codec_decode(x_fine_gen)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text = \"\"\"The Uncharted Land of Discovery: A Journey Through Time and Space\n",
+    "[clears throat]\n",
+    "Chapter 1: The Dawn of Curiosity\n",
+    "[takes breath]\n",
+    "Since the dawn of humankind, our species has been driven by a powerful force: curiosity. It is an innate, unquenchable desire to explore, understand, and unravel the mysteries of the world around us. This primal urge has led us on countless adventures, pushing us to the farthest reaches of our planet and beyond.\n",
+    "\n",
+    "Early humans, huddled around a flickering fire, gazed up at the night sky and wondered what those twinkling lights were. They had no idea that their curiosity would eventually propel us into the vast, uncharted realm of space. As time progressed, our ancestors began to explore their surroundings, venturing beyond their caves and settlements, driven by the need to discover what lay beyond the horizon.\n",
+    "\n",
+    "Chapter 2: The Age of Exploration\n",
+    "\n",
+    "The Age of Exploration marked a turning point in human history, as brave souls took to the seas in search of new lands, wealth, and knowledge. Pioneers like Christopher Columbus, Vasco da Gama, and Ferdinand Magellan set sail on perilous voyages, pushing the boundaries of what was known and understood.\n",
+    "[clears throat]\n",
+    "These intrepid explorers discovered new continents, mapped out previously unknown territories, and encountered diverse cultures. They also established trade routes, allowing for the exchange of goods, ideas, and innovations between distant societies. The Age of Exploration was not without its dark moments, however, as conquest, colonization, and exploitation often went hand in hand with discovery.\n",
+    "[clears throat]\n",
+    "Chapter 3: The Scientific Revolution\n",
+    "[laughs]\n",
+    "The Scientific Revolution was a period of profound change, as humanity began to question long-held beliefs and seek empirical evidence. Pioneers like Galileo Galilei, Isaac Newton, and Johannes Kepler sought to understand the natural world through observation, experimentation, and reason.\n",
+    "[sighs]\n",
+    "Their discoveries laid the foundation for modern science, transforming the way we view the universe and our place within it. New technologies, such as the telescope and the microscope, allowed us to peer deeper into the cosmos and the microscopic world, further expanding our understanding of reality.\n",
+    "[gasps]\n",
+    "Chapter 4: The Information Age\n",
+    "\n",
+    "The Information Age, sometimes referred to as the Digital Age, has revolutionized the way we communicate, learn, and access knowledge. With the advent of the internet and personal computers, information that was once reserved for the privileged few is now available to the masses.\n",
+    "...\n",
+    "This democratization of knowledge has led to an explosion of innovation, as ideas and information are shared across borders and cultures at lightning speed. The Information Age has also brought new challenges, as the rapid pace of technological advancements threatens to outpace our ability to adapt and raises questions about the ethical implications of our increasingly interconnected world.\n",
+    "[laughter]\n",
+    "Chapter 5: The Final Frontier\n",
+    "[clears throat]\n",
+    "As our knowledge of the universe expands, so too does our desire to explore the cosmos. Space exploration has come a long way since the first successful satellite, Sputnik, was launched in 1957. We have landed humans on the moon, sent probes to the far reaches of our solar system, and even glimpsed distant galaxies through powerful telescopes.\n",
+    "\n",
+    "The future of space exploration is filled with possibilities, from establishing colonies on Mars to the search for extraterrestrial life. As we venture further into the unknown, we continue to be driven by the same curiosity that has propelled us throughout history, always seeking to uncover the secrets of the universe and our place within it.\n",
+    "...\n",
+    "In conclusion, the human journey is one of discovery, driven by our innate curiosity and desire to understand the world around us. From the dawn of our species to the present day, we have continued to explore, learn, and adapt, pushing the boundaries of what is known and possible. As we continue to unravel the mysteries of the cosmos, our spirit.\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Chunk the text into smaller pieces then combine the generated audio\n",
+    "from time import time\n",
+    "from tqdm.auto import tqdm\n",
    "from IPython.display import Audio\n",
+    "from scipy.io.wavfile import write as write_wav\n",
+    "import os\n",
+    "import numpy as np\n",
+    "\n",
+    "# generation settings\n",
+    "voice_name = \"datasets/joe_biden_state_of_union/tokens/257.npz\"\n",
+    "out_filepath = 'audio/audio.wav'\n",
+    "\n",
+    "semantic_temp = 0.7\n",
+    "semantic_top_k = 100\n",
+    "semantic_top_p = 0.99\n",
+    "\n",
+    "coarse_temp = 0.7\n",
+    "coarse_top_k = 100\n",
+    "coarse_top_p = 0.95\n",
+    "\n",
+    "fine_temp = 0.7\n",
+    "\n",
+    "use_semantic_history_prompt = True\n",
+    "use_coarse_history_prompt = True\n",
+    "use_fine_history_prompt = True\n",
+    "\n",
+    "use_last_generation_as_history = False\n",
+    "\n",
+    "if use_rvc:\n",
+    "    index_rate = 0.75\n",
+    "    f0up_key = -6\n",
+    "    filter_radius = 3\n",
+    "    rms_mix_rate = 0.25\n",
+    "    protect = 0.33\n",
+    "    resample_sr = SAMPLE_RATE\n",
+    "    f0method = \"harvest\" #harvest or pm\n",
+    "\n",
+    "texts = split_and_recombine_text(text)\n",
+    "\n",
+    "all_parts = []\n",
+    "for i, text in tqdm(enumerate(texts), total=len(texts)):\n",
+    "    full_generation, audio_array = generate_with_settings(\n",
+    "        text,\n",
+    "        semantic_temp=semantic_temp,\n",
+    "        semantic_top_k=semantic_top_k,\n",
+    "        semantic_top_p=semantic_top_p,\n",
+    "        coarse_temp=coarse_temp,\n",
+    "        coarse_top_k=coarse_top_k,\n",
+    "        coarse_top_p=coarse_top_p,\n",
+    "        fine_temp=fine_temp,\n",
+    "        voice_name=voice_name,\n",
+    "        use_semantic_history_prompt=use_semantic_history_prompt,\n",
+    "        use_coarse_history_prompt=use_coarse_history_prompt,\n",
+    "        use_fine_history_prompt=use_fine_history_prompt,\n",
+    "        output_full=True\n",
+    "    )\n",
+    "    if use_last_generation_as_history:\n",
+    "        # save to npz\n",
+    "        os.makedirs('_temp', exist_ok=True)\n",
+    "        np.savez_compressed(\n",
+    "            '_temp/history.npz',\n",
+    "            semantic_prompt=full_generation['semantic_prompt'],\n",
+    "            coarse_prompt=full_generation['coarse_prompt'],\n",
+    "            fine_prompt=full_generation['fine_prompt'],\n",
+    "        )\n",
+    "        voice_name = '_temp/history.npz'\n",
+    "    write_wav(out_filepath.replace('.wav', f'_{i}') + '.wav', SAMPLE_RATE, audio_array)\n",
+    "\n",
+    "    if use_rvc:\n",
+    "        try:\n",
+    "            audio_array = vc_single(0,out_filepath.replace('.wav', f'_{i}') + '.wav',f0up_key,None,f0method,index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
+    "        except:\n",
+    "            audio_array = vc_single(0,out_filepath.replace('.wav', f'_{i}') + '.wav',f0up_key,None,'pm',index_path,index_rate, filter_radius=filter_radius, resample_sr=resample_sr, rms_mix_rate=rms_mix_rate, protect=protect)\n",
+    "        write_wav(out_filepath.replace('.wav', f'_{i}') + '.wav', SAMPLE_RATE, audio_array)\n",
+    "    all_parts.append(audio_array)\n",
+    "\n",
+    "audio_array = np.concatenate(all_parts, axis=-1)\n",
+    "\n",
+    "# save audio\n",
+    "write_wav(out_filepath, SAMPLE_RATE, audio_array)\n",
+    "\n",
    "# play audio\n",
    "Audio(audio_array, rate=SAMPLE_RATE)"
   ]
@@ -150,12 +426,7 @@
   "execution_count": null,
   "metadata": {},
   "outputs": [],
-   "source": [
-    "from scipy.io.wavfile import write as write_wav\n",
-    "# save audio\n",
-    "filepath = \"output/audio.wav\" # change this to your desired output path\n",
-    "write_wav(filepath, SAMPLE_RATE, audio_array)"
-   ]
+   "source": []
  }
 ],
 "metadata": {
--- a/train_coarse.ipynb
+++ b/train_coarse.ipynb
@@ -57,7 +57,7 @@
   "source": [
    "train_batch_size = 8\n",
    "eval_batch_size = 8\n",
-    "grad_accum = 1\n",
+    "grad_accum = 2\n",
    "ckpt_path = 'models/coarse_2.pt'\n",
    "model_type = \"coarse\"\n",
    "dataset_path = 'datasets/joe_biden_state_of_union/'\n",
@@ -81,7 +81,7 @@
    "lora_scaling = 1\n",
    "lora_dropout = 0.1\n",
    "lora_module_name = 'transformer.h'\n",
-    "optimize_lora_params_only = True\n",
+    "optimize_lora_params_only = False\n",
    "\n",
    "learning_rate = 1e-4\n",
    "scale_lr = False\n",
@@ -95,12 +95,12 @@
    "keep_in_fp32_modules = ['lm_head']\n",
    "\n",
    "lr_scheduler_type = 'linear'\n",
-    "lr_warmup_steps = 200\n",
-    "num_train_epochs = 20\n",
+    "lr_warmup_steps = 60\n",
+    "num_train_epochs = 5\n",
    "max_train_steps = None\n",
    "max_grad_norm = 1.0\n",
    "\n",
-    "semantic_cross_entropy_loss_weight = 0\n",
+    "semantic_cross_entropy_loss_weight = 0.0\n",
    "\n",
    "seed = 741"
   ]
@@ -286,7 +286,7 @@
    "\n",
    "\n",
    "def load_filepaths_and_text(filename, split=\"|\"):\n",
-    "    with open(filename, encoding='utf-8') as f:\n",
+    "    with open(filename, encoding='utf-8', errors='ignore') as f:\n",
    "        filepaths_and_text = [line.strip().split(split) for line in f]\n",
    "        base = os.path.dirname(filename)\n",
    "        for j in range(len(filepaths_and_text)):\n",
--- a/train_fine.ipynb
+++ b/train_fine.ipynb
@@ -10,56 +10,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "WARNING:root:WARNING: Could not find module 'C:\\Users\\labou\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\xformers\\_C.pyd' (or one of its dependencies). Try using the full path with constructor syntax.\n",
-      "Need to compile C++ extensions to get sparse attention suport. Please run python setup.py build develop\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Could not find module 'C:\\Users\\labou\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\xformers\\_C.pyd' (or one of its dependencies). Try using the full path with constructor syntax.\n",
-      "\n",
-      "===================================BUG REPORT===================================\n",
-      "Welcome to bitsandbytes. For bug reports, please run\n",
-      "\n",
-      "python -m bitsandbytes\n",
-      "\n",
-      " and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
-      "================================================================================\n",
-      "bin c:\\Users\\labou\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\bitsandbytes\\libbitsandbytes_cuda116.dll\n",
-      "function 'cadam32bit_grad_fp32' not found\n",
-      "CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching in backup paths...\n",
-      "CUDA SETUP: WARNING! libcuda.so not found! Do you have a CUDA driver installed? If you are on a cluster, make sure you are on a CUDA machine!\n",
-      "CUDA SETUP: Loading binary c:\\Users\\labou\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\bitsandbytes\\libbitsandbytes_cuda116.dll...\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "c:\\Users\\labou\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\bitsandbytes\\cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n",
-      "  warn(\"The installed version of bitsandbytes was compiled without GPU support. \"\n",
-      "c:\\Users\\labou\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\bitsandbytes\\cuda_setup\\main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {WindowsPath('vs/workbench/api/node/extensionHostProcess')}\n",
-      "  warn(msg)\n",
-      "c:\\Users\\labou\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\bitsandbytes\\cuda_setup\\main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {WindowsPath('module'), WindowsPath('/matplotlib_inline.backend_inline')}\n",
-      "  warn(msg)\n",
-      "c:\\Users\\labou\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\bitsandbytes\\cuda_setup\\main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {WindowsPath('/usr/local/cuda/lib64')}\n",
-      "  warn(msg)\n",
-      "c:\\Users\\labou\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\bitsandbytes\\cuda_setup\\main.py:149: UserWarning: WARNING: No libcudart.so found! Install CUDA or the cudatoolkit package (anaconda)!\n",
-      "  warn(msg)\n",
-      "c:\\Users\\labou\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\bitsandbytes\\cuda_setup\\main.py:149: UserWarning: WARNING: No GPU detected! Check your CUDA paths. Proceeding to load CPU-only library...\n",
-      "  warn(msg)\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
@@ -98,13 +51,13 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_batch_size = 8\n",
    "eval_batch_size = 8\n",
-    "grad_accum = 1\n",
+    "grad_accum = 2\n",
    "ckpt_path = 'models/fine_2.pt'\n",
    "model_type = \"fine\"\n",
    "dataset_path = 'datasets/joe_biden_state_of_union/'\n",
@@ -128,7 +81,7 @@
    "lora_scaling = 1\n",
    "lora_dropout = 0.1\n",
    "lora_module_name = 'transformer.h'\n",
-    "optimize_lora_params_only = True\n",
+    "optimize_lora_params_only = False\n",
    "\n",
    "learning_rate = 1e-4\n",
    "scale_lr = False\n",
@@ -142,13 +95,11 @@
    "keep_in_fp32_modules = ['lm_head']\n",
    "\n",
    "lr_scheduler_type = 'linear'\n",
-    "lr_warmup_steps = 100\n",
+    "lr_warmup_steps = 60\n",
    "num_train_epochs = 5\n",
    "max_train_steps = None\n",
    "max_grad_norm = 1.0\n",
    "\n",
-    "semantic_cross_entropy_loss_weight = 0\n",
-    "\n",
    "seed = 741"
   ]
  },
@@ -162,18 +113,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "c:\\Users\\labou\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\accelerate\\accelerator.py:258: FutureWarning: `logging_dir` is deprecated and will be removed in version 0.18.0 of 🤗 Accelerate. Use `project_dir` instead.\n",
-      "  warnings.warn(\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "CONTEXT_WINDOW_SIZE = 1024\n",
    "\n",
@@ -342,7 +284,7 @@
    "\n",
    "\n",
    "def load_filepaths_and_text(filename, split=\"|\"):\n",
-    "    with open(filename, encoding='utf-8') as f:\n",
+    "    with open(filename, encoding='utf-8', errors='ignore') as f:\n",
    "        filepaths_and_text = [line.strip().split(split) for line in f]\n",
    "        base = os.path.dirname(filename)\n",
    "        for j in range(len(filepaths_and_text)):\n",
@@ -414,7 +356,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -529,24 +471,16 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loaded fine model with 302090240 params, val_loss=2.0786.\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "model = _load_model(ckpt_path, device, use_small=False, model_type=model_type)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -570,7 +504,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -626,7 +560,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -659,7 +593,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -678,7 +612,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -726,7 +660,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -744,67 +678,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\n",
-      "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mfrancislabounty\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "wandb version 0.15.4 is available!  To upgrade, please run:\n",
-       " $ pip install wandb --upgrade"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "Tracking run with wandb version 0.13.6"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "Run data is saved locally in <code>e:\\Python\\bark-with-voice-clone\\wandb\\run-20230629_202416-290ebk11</code>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "Syncing run <strong><a href=\"https://wandb.ai/francislabounty/bark_coarse/runs/290ebk11\" target=\"_blank\">fresh-pyramid-26</a></strong> to <a href=\"https://wandb.ai/francislabounty/bark_coarse\" target=\"_blank\">Weights & Biases</a> (<a href=\"https://wandb.me/run\" target=\"_blank\">docs</a>)<br/>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
   "source": [
    "# We need to recalculate our total training steps as the size of the training dataloader may have changed.\n",
    "num_update_steps_per_epoch = math.ceil(len(train_dataloader) / grad_accum)\n",
@@ -850,17 +726,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Validation Loss: 30.702054630626332 over 82 samples and 11 batches.\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "if accelerator.is_main_process:\n",
    "    model.eval()\n",
@@ -883,7 +751,7 @@
    "            loss_7 = criterion(logits_7.view(-1, model.config.output_vocab_size), fine_targets_7.view(-1))\n",
    "            loss_8 = criterion(logits_8.view(-1, model.config.output_vocab_size), fine_targets_8.view(-1))\n",
    "\n",
-    "            loss = loss_7 + loss_8\n",
+    "            loss = (loss_7 + loss_8) / 2\n",
    "            validation_loss += loss.item()\n",
    "            num_batches += 1\n",
    "            num_samples += val_batch['fine_tokens'].size(0)\n",
@@ -903,91 +771,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "3761107b0c094d2db6532410a582408c",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/205 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "Waiting for W&B process to finish... <strong style=\"color:green\">(success).</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "45d15ef5bc1e4729aaf827aa3380823d",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\\r'), FloatProgress(value=1.0, max…"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "<style>\n",
-       "    table.wandb td:nth-child(1) { padding: 0 10px; text-align: left ; width: auto;} td:nth-child(2) {text-align: left ; width: 100%}\n",
-       "    .wandb-row { display: flex; flex-direction: row; flex-wrap: wrap; justify-content: flex-start; width: 100% }\n",
-       "    .wandb-col { display: flex; flex-direction: column; flex-basis: 100%; flex: 1; padding: 10px; }\n",
-       "    </style>\n",
-       "<div class=\"wandb-row\"><div class=\"wandb-col\"><h3>Run history:</h3><br/><table class=\"wandb\"><tr><td>loss</td><td>█▆█▅▅▇▆▆▆▇▆▆▅▄▃▃▄▂▃▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁</td></tr><tr><td>lr</td><td>▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▆▇▇███▇▇▇▆▆▆▅▅▅▄▄▄▃▃▂▂▂▁▁</td></tr></table><br/></div><div class=\"wandb-col\"><h3>Run summary:</h3><br/><table class=\"wandb\"><tr><td>loss</td><td>3.18231</td></tr><tr><td>lr</td><td>0.0</td></tr></table><br/></div></div>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "Synced <strong style=\"color:#cdcd00\">fresh-pyramid-26</strong>: <a href=\"https://wandb.ai/francislabounty/bark_coarse/runs/290ebk11\" target=\"_blank\">https://wandb.ai/francislabounty/bark_coarse/runs/290ebk11</a><br/>Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "Find logs at: <code>.\\wandb\\run-20230629_202416-290ebk11\\logs</code>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
   "source": [
    "# Only show the progress bar once on each machine.\n",
    "progress_bar = tqdm(range(global_step, max_train_steps), disable=not accelerator.is_local_main_process)\n",
@@ -1016,7 +802,7 @@
    "            loss_7 = criterion(logits_7.view(-1, model.config.output_vocab_size), fine_targets_7.view(-1))\n",
    "            loss_8 = criterion(logits_8.view(-1, model.config.output_vocab_size), fine_targets_8.view(-1))\n",
    "\n",
-    "            loss = loss_7 + loss_8\n",
+    "            loss = (loss_7 + loss_8) / 2\n",
    "\n",
    "            accelerator.backward(loss)\n",
    "            if accelerator.sync_gradients:\n",
@@ -1072,17 +858,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Validation Loss: 3.041703635996038 over 82 samples and 11 batches.\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "if accelerator.is_main_process:\n",
    "    model.eval()\n",
@@ -1105,7 +883,7 @@
    "            loss_7 = criterion(logits_7.view(-1, model.config.output_vocab_size), fine_targets_7.view(-1))\n",
    "            loss_8 = criterion(logits_8.view(-1, model.config.output_vocab_size), fine_targets_8.view(-1))\n",
    "\n",
-    "            loss = loss_7 + loss_8\n",
+    "            loss = (loss_7 + loss_8) / 2\n",
    "            validation_loss += loss.item()\n",
    "            num_batches += 1\n",
    "            num_samples += val_batch['fine_tokens'].size(0)\n",
--- a/train_semantic.ipynb
+++ b/train_semantic.ipynb
@@ -57,7 +57,7 @@
   "source": [
    "train_batch_size = 8\n",
    "eval_batch_size = 8\n",
-    "grad_accum = 1\n",
+    "grad_accum = 2\n",
    "ckpt_path = 'models/text_2.pt'\n",
    "model_type = \"text\"\n",
    "dataset_path = 'datasets/joe_biden_state_of_union/'\n",
@@ -78,10 +78,10 @@
    "quant_type = 'nf4'\n",
    "\n",
    "lora_dim = 64\n",
-    "lora_scaling = 32\n",
+    "lora_scaling = 1\n",
    "lora_dropout = 0.1\n",
    "lora_module_name = 'transformer.h'\n",
-    "optimize_lora_params_only = True\n",
+    "optimize_lora_params_only = False\n",
    "\n",
    "learning_rate = 1e-4\n",
    "scale_lr = False\n",
@@ -95,8 +95,8 @@
    "keep_in_fp32_modules = ['lm_head']\n",
    "\n",
    "lr_scheduler_type = 'linear'\n",
-    "lr_warmup_steps = 200\n",
-    "num_train_epochs = 20\n",
+    "lr_warmup_steps = 60\n",
+    "num_train_epochs = 5\n",
    "max_train_steps = None\n",
    "max_grad_norm = 1.0\n",
    "\n",
@@ -266,7 +266,7 @@
    "\n",
    "\n",
    "def load_filepaths_and_text(filename, split=\"|\"):\n",
-    "    with open(filename, encoding='utf-8') as f:\n",
+    "    with open(filename, encoding='utf-8', errors='ignore') as f:\n",
    "        filepaths_and_text = [line.strip().split(split) for line in f]\n",
    "        base = os.path.dirname(filename)\n",
    "        for j in range(len(filepaths_and_text)):\n",
@@ -628,7 +628,7 @@
    "    collate_fn=TtsCollater(),\n",
    ")\n",
    "\n",
-    "criterion = torch.nn.CrossEntropyLoss(ignore_index=SEMANTIC_PAD_TOKEN)\n",
+    "criterion = torch.nn.CrossEntropyLoss() #ignore_index=SEMANTIC_PAD_TOKEN)\n",
    "\n",
    "# Scheduler and math around the number of training steps.\n",
    "overrode_max_train_steps = False\n",