{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Pin the process to a single GPU: order devices by PCI bus ID, then expose only device 7.\n",
"import os\n",
"os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" \n",
"os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"7\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# import libs\n",
"import torch\n",
"import torchaudio\n",
"\n",
"# project-local tokenizers: TextTokenizer phonemizes text, AudioTokenizer wraps the neural codec\n",
"from data.tokenizer import (\n",
" AudioTokenizer,\n",
" TextTokenizer,\n",
")\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Setting up corpus information\u001b[33m...\u001b[0m \n",
"\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Loading corpus from source files\u001b[33m...\u001b[0m \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/100 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
"\u001b[?25h"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Found \u001b[1;36m1\u001b[0m speaker across \u001b[1;36m1\u001b[0m file, average number of utterances per \n",
"\u001b[2;36m \u001b[0m speaker: \u001b[1;36m1.0\u001b[0m \n",
"\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Initializing multiprocessing jobs\u001b[33m...\u001b[0m \n",
"\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Normalizing text\u001b[33m...\u001b[0m \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
"\u001b[?25h"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Creating corpus split for feature generation\u001b[33m...\u001b[0m \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/2 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
"\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Generating MFCCs\u001b[33m...\u001b[0m \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
"\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Calculating CMVN\u001b[33m...\u001b[0m \n",
"\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Generating final features\u001b[33m...\u001b[0m \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
"\u001b[?25h"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Creating corpus split with features\u001b[33m...\u001b[0m \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
"\u001b[?25h"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Compiling training graphs\u001b[33m...\u001b[0m \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
"\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Performing first-pass alignment\u001b[33m...\u001b[0m \n",
"\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Generating alignments\u001b[33m...\u001b[0m \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
"\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Calculating fMLLR for speaker adaptation\u001b[33m...\u001b[0m \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
"\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Performing second-pass alignment\u001b[33m...\u001b[0m \n",
"\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Generating alignments\u001b[33m...\u001b[0m \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
"\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Collecting phone and word alignments from alignment lattices\u001b[33m...\u001b[0m \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
"\u001b[?25h"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[2;36m \u001b[0m\u001b[33mWARNING \u001b[0m Alignment analysis not available without using postgresql \n",
"\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Exporting alignment TextGrids to demo/temp/mfa_alignments\u001b[33m...\u001b[0m \n",
"\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Finished exporting TextGrids to demo/temp/mfa_alignments! \n",
"\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Done! Everything took \u001b[1;36m40.634\u001b[0m seconds \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
"\u001b[?25h"
]
}
],
"source": [
"# hyperparameters for inference\n",
"left_margin = 0.08 # not used for TTS, only for speech editing\n",
"right_margin = 0.08 # not used for TTS, only for speech editing\n",
"codec_audio_sr = 16000\n",
"codec_sr = 50\n",
"top_k = 0\n",
"top_p = 0.8\n",
"temperature = 1\n",
"kvcache = 1\n",
"silence_tokens=[1388,1898,131]\n",
"# adjust the below three arguments if the generation is not as good\n",
"seed = 1 # random seed magic; NOTE(review): seed is only referenced in the commented-out save paths below and is never applied to torch -- confirm whether seeding is intended\n",
"stop_repetition = 3 # if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1\n",
"sample_batch_size = 4 # if there are long silence or unnaturally stretched words, increase sample_batch_size to 2, 3 or even 4\n",
"# what this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest\n",
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"\n",
"# point to the original file or record the file\n",
"# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file\n",
"orig_audio = \"./demo/84_121550_000074_000000.wav\"\n",
"orig_transcript = \"But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,\"\n",
"\n",
"# move the audio and transcript to temp folder\n",
"temp_folder = \"./demo/temp\"\n",
"os.makedirs(temp_folder, exist_ok=True)\n",
"os.system(f\"cp {orig_audio} {temp_folder}\") # NOTE: paths containing spaces or shell metacharacters will break this shell call\n",
"# basename of the audio file without extension; used to name the transcript and alignment files\n",
"filename = os.path.splitext(orig_audio.split(\"/\")[-1])[0]\n",
"with open(f\"{temp_folder}/{filename}.txt\", \"w\") as f:\n",
" f.write(orig_transcript)\n",
"# run MFA to get the alignment\n",
"align_temp = f\"{temp_folder}/mfa_alignments\"\n",
"os.makedirs(align_temp, exist_ok=True)\n",
"os.system(f\"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp}\")\n",
"# if the above fails, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue\n",
"# os.system(f\"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000\")\n",
"# the three files the next cell consumes; all named after the source audio's basename\n",
"audio_fn = f\"{temp_folder}/{filename}.wav\"\n",
"transcript_fn = f\"{temp_folder}/{filename}.txt\"\n",
"align_fn = f\"{align_temp}/{filename}.csv\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"concatenate prompt and generated:\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"generated:\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt\n",
"cut_off_sec = 3.01 # NOTE: according to the forced-alignment file, the word \"common\" stops at 3.01 sec; this will be different for different audio\n",
"target_transcript = \"But when I had approached so near to them The common I cannot believe that the same model can also do text to speech synthesis as well!\"\n",
"info = torchaudio.info(audio_fn)\n",
"audio_dur = info.num_frames / info.sample_rate\n",
"\n",
"assert cut_off_sec < audio_dur, f\"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}\"\n",
"prompt_end_frame = int(cut_off_sec * info.sample_rate)\n",
"\n",
"\n",
"# # load model, tokenizer, and other necessary files\n",
"from models import voicecraft\n",
"voicecraft_name=\"giga830M.pth\"\n",
"ckpt_fn =f\"./pretrained_models/{voicecraft_name}\"\n",
"encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n",
"# download the VoiceCraft checkpoint and Encodec weights on first run\n",
"if not os.path.exists(ckpt_fn):\n",
" os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}\\?download\\=true\")\n",
" os.system(f\"mv {voicecraft_name}\\?download\\=true ./pretrained_models/{voicecraft_name}\")\n",
"if not os.path.exists(encodec_fn):\n",
" os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th\")\n",
" os.system(f\"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th\")\n",
"\n",
"# the checkpoint dict carries the model config, weights, and the phoneme-to-id map\n",
"ckpt = torch.load(ckpt_fn, map_location=\"cpu\")\n",
"model = voicecraft.VoiceCraft(ckpt[\"config\"])\n",
"model.load_state_dict(ckpt[\"model\"])\n",
"model.to(device)\n",
"model.eval()\n",
"\n",
"phn2num = ckpt['phn2num']\n",
"\n",
"text_tokenizer = TextTokenizer(backend=\"espeak\")\n",
"audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu\n",
"\n",
"# run the model to get the output\n",
"decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, \"codec_audio_sr\": codec_audio_sr, \"codec_sr\": codec_sr, \"silence_tokens\": silence_tokens, \"sample_batch_size\": sample_batch_size}\n",
"from inference_tts_scale import inference_one_sample\n",
"concated_audio, gen_audio = inference_one_sample(model, ckpt[\"config\"], phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, device, decode_config, prompt_end_frame)\n",
" \n",
"# save segments for comparison\n",
"concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()\n",
"# logging.info(f\"length of the resynthesize orig audio: {orig_audio.shape}\")\n",
"\n",
"\n",
"# display the audio\n",
"from IPython.display import Audio\n",
"print(\"concatenate prompt and generated:\")\n",
"display(Audio(concated_audio, rate=codec_audio_sr))\n",
"\n",
"print(\"generated:\")\n",
"display(Audio(gen_audio, rate=codec_audio_sr))\n",
"\n",
"# # save the audio\n",
"# # output_dir\n",
"# output_dir = \"/home/pyp/VoiceCraft/demo/generated_tts\"\n",
"# os.makedirs(output_dir, exist_ok=True)\n",
"# seg_save_fn_gen = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav\"\n",
"# seg_save_fn_concat = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav\" \n",
"\n",
"# torchaudio.save(seg_save_fn_gen, gen_audio, codec_audio_sr)\n",
"# torchaudio.save(seg_save_fn_concat, concated_audio, codec_audio_sr)\n",
"\n",
"# if you get error importing T5 in transformers\n",
"# try \n",
"# pip uninstall Pillow\n",
"# pip install Pillow\n",
"# you might get warnings like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "voicecraft",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
}