mirror of
https://github.com/jasonppy/VoiceCraft.git
synced 2026-04-03 09:46:45 +02:00
new TTS model, better prompt
This commit is contained in:
@@ -71,7 +71,7 @@
|
||||
"# load model, encodec, and phn2num\n",
|
||||
"# # load model, tokenizer, and other necessary files\n",
|
||||
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
|
||||
"voicecraft_name=\"giga330M.pth\" # or gigaHalfLibri330M_TTSEnhanced_max16s.pth, giga830M.pth\n",
|
||||
"voicecraft_name=\"830M_TTSEnhanced.pth\" # or giga330M.pth, gigaHalfLibri330M_TTSEnhanced_max16s.pth, giga830M.pth\n",
|
||||
"\n",
|
||||
"# the new way of loading the model, with huggingface, recommended\n",
|
||||
"from models import voicecraft\n",
|
||||
@@ -111,8 +111,8 @@
|
||||
"# Prepare your audio\n",
|
||||
"# point to the original audio whose speech you want to clone\n",
|
||||
"# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file\n",
|
||||
"orig_audio = \"./demo/84_121550_000074_000000.wav\"\n",
|
||||
"orig_transcript = \"But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,\"\n",
|
||||
"orig_audio = \"./demo/5895_34622_000026_000002.wav\"\n",
|
||||
"orig_transcript = \"Gwynplaine had, besides, for his work and for his feats of strength, round his neck and over his shoulders, an esclavine of leather.\"\n",
|
||||
"\n",
|
||||
"# move the audio and transcript to temp folder\n",
|
||||
"temp_folder = \"./demo/temp\"\n",
|
||||
@@ -143,8 +143,8 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# take a look at demo/temp/mfa_alignments, decide which part of the audio to use as prompt\n",
|
||||
"cut_off_sec = 3.01 # NOTE: according to forced-alignment file demo/temp/mfa_alignments/84_121550_000074_000000.csv, the word \"common\" stop as 3.01 sec, this should be different for different audio\n",
|
||||
"target_transcript = \"But when I had approached so near to them The common I cannot believe that the same model can also do text to speech synthesis as well!\"\n",
|
||||
"cut_off_sec = 3.6 # NOTE: according to the forced-alignment file demo/temp/mfa_alignments/5895_34622_000026_000002.csv, the word \"strength\" stops at 3.561 sec, so we use the first 3.6 sec as the prompt. This value will be different for different audio\n",
|
||||
"target_transcript = \"Gwynplaine had, besides, for his work and for his feats of strength, I cannot believe that the same model can also do text to speech synthesis too!\"\n",
|
||||
"# NOTE: 3 sec of reference is generally enough for high quality voice cloning, but longer is generally better, try e.g. 3~6 sec.\n",
|
||||
"audio_fn = f\"{temp_folder}/{filename}.wav\"\n",
|
||||
"info = torchaudio.info(audio_fn)\n",
|
||||
@@ -165,7 +165,7 @@
|
||||
"\n",
|
||||
"# NOTE adjust the below three arguments if the generation is not as good\n",
|
||||
"stop_repetition = 3 # NOTE: if the model generates long silences, reduce stop_repetition to 3, 2 or even 1\n",
|
||||
"sample_batch_size = 5 # NOTE: if the if there are long silence or unnaturally strecthed words, increase sample_batch_size to 5 or higher. What this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest. So if the speech rate of the generated is too fast change it to a smaller number.\n",
|
||||
"sample_batch_size = 3 # NOTE: if there are long silences or unnaturally stretched words, increase sample_batch_size to 4 or higher. The model will then generate sample_batch_size samples of the same audio and pick the shortest one. So if the speech rate of the generated audio is too fast, change it to a smaller number.\n",
|
||||
"seed = 1 # change seed if you are still unhappy with the result\n",
|
||||
"\n",
|
||||
"def seed_everything(seed):\n",
|
||||
|
||||
Reference in New Issue
Block a user