new TTS model, better prompt

2026-04-03 09:46:45 +02:00 · 2024-04-21 11:10:14 -05:00
parent 13e52470c3
commit ce39ca89c1
5 changed files with 39 additions and 48 deletions
--- a/inference_tts.ipynb
+++ b/inference_tts.ipynb
@@ -71,7 +71,7 @@
    "# load model, encodec, and phn2num\n",
    "# # load model, tokenizer, and other necessary files\n",
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
-    "voicecraft_name=\"giga330M.pth\" # or gigaHalfLibri330M_TTSEnhanced_max16s.pth, giga830M.pth\n",
+    "voicecraft_name=\"830M_TTSEnhanced.pth\" # or giga330M.pth, gigaHalfLibri330M_TTSEnhanced_max16s.pth, giga830M.pth\n",
    "\n",
    "# the new way of loading the model, with huggingface, recommended\n",
    "from models import voicecraft\n",
@@ -111,8 +111,8 @@
    "# Prepare your audio\n",
    "# point to the original audio whose speech you want to clone\n",
    "# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file\n",
-    "orig_audio = \"./demo/84_121550_000074_000000.wav\"\n",
-    "orig_transcript = \"But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,\"\n",
+    "orig_audio = \"./demo/5895_34622_000026_000002.wav\"\n",
+    "orig_transcript = \"Gwynplaine had, besides, for his work and for his feats of strength, round his neck and over his shoulders, an esclavine of leather.\"\n",
    "\n",
    "# move the audio and transcript to temp folder\n",
    "temp_folder = \"./demo/temp\"\n",
@@ -143,8 +143,8 @@
   "outputs": [],
   "source": [
    "# take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt\n",
-    "cut_off_sec = 3.01 # NOTE: according to forced-alignment file demo/temp/mfa_alignments/84_121550_000074_000000.csv, the word \"common\" stop as 3.01 sec, this should be different for different audio\n",
-    "target_transcript = \"But when I had approached so near to them The common I cannot believe that the same model can also do text to speech synthesis as well!\"\n",
+    "cut_off_sec = 3.6 # NOTE: according to the forced-alignment file demo/temp/mfa_alignments/5895_34622_000026_000002.csv, the word \"strength\" stops at 3.561 sec, so we use the first 3.6 sec as the prompt. This value should be different for different audio\n",
+    "target_transcript = \"Gwynplaine had, besides, for his work and for his feats of strength, I cannot believe that the same model can also do text to speech synthesis too!\"\n",
    "# NOTE: 3 sec of reference is generally enough for high quality voice cloning, but longer is generally better, try e.g. 3~6 sec.\n",
    "audio_fn = f\"{temp_folder}/{filename}.wav\"\n",
    "info = torchaudio.info(audio_fn)\n",
@@ -165,7 +165,7 @@
    "\n",
    "# NOTE adjust the below three arguments if the generation is not as good\n",
    "stop_repetition = 3 # NOTE if the model generate long silence, reduce the stop_repetition to 3, 2 or even 1\n",
-    "sample_batch_size = 5 # NOTE: if the if there are long silence or unnaturally strecthed words, increase sample_batch_size to 5 or higher. What this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest. So if the speech rate of the generated is too fast change it to a smaller number.\n",
+    "sample_batch_size = 3 # NOTE: if there are long silences or unnaturally stretched words, increase sample_batch_size to 4 or higher. The model will generate sample_batch_size samples for the same input and pick the shortest one. So if the speech rate of the generated audio is too fast, change it to a smaller number.\n",
    "seed = 1 # change seed if you are still unhappy with the result\n",
    "\n",
    "def seed_everything(seed):\n",