wl-zhao
2023-12-27 10:50:43 +08:00
parent 6516dc40b5
commit 592a9cc429
3 changed files with 155 additions and 18 deletions

View File

@@ -10,10 +10,29 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "b7f043ee",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/zwl/anaconda3/envs/openvoice/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Importing the dtw module. When using in academic works please cite:\n",
" T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.\n",
" J. Stat. Soft., doi:10.18637/jss.v031.i07.\n",
"\n"
]
}
],
"source": [
"import os\n",
"import torch\n",
@@ -31,10 +50,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"id": "aacad912",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded checkpoint 'checkpoints/base_speakers/EN/checkpoint.pth'\n",
"missing/unexpected keys: [] []\n",
"Loaded checkpoint 'checkpoints/converter/checkpoint.pth'\n",
"missing/unexpected keys: [] []\n"
]
}
],
"source": [
"ckpt_base = 'checkpoints/base_speakers/EN'\n",
"ckpt_converter = 'checkpoints/converter'\n",
@@ -70,7 +100,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"id": "63ff6273",
"metadata": {},
"outputs": [],
@@ -88,13 +118,13 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"id": "55105eae",
"metadata": {},
"outputs": [],
"source": [
"reference_speaker = 'resources/example_reference.mp3'\n",
"target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter)"
"target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir='processed', vad=True)"
]
},
{
@@ -107,15 +137,28 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"id": "73dc1259",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" > Text splitted to sentences.\n",
"This audio is generated by OpenVoice.\n",
" > ===========================\n",
"ðɪs ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs.\n",
" length:45\n",
" length:45\n"
]
}
],
"source": [
"save_path = f'{output_dir}/output_en_default.wav'\n",
"\n",
"# Run the base speaker tts\n",
"text = \"This audio is generated by open voice.\"\n",
"text = \"This audio is generated by OpenVoice.\"\n",
"src_path = f'{output_dir}/tmp.wav'\n",
"base_speaker_tts.tts(text, src_path, speaker='default', language='English', speed=1.0)\n",
"\n",
@@ -139,16 +182,29 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"id": "fd022d38",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" > Text splitted to sentences.\n",
"This audio is generated by OpenVoice with a half-performance model.\n",
" > ===========================\n",
"ðɪs ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs wɪθ ə half-peɹfoɹmance* ˈmɑdəɫ.\n",
" length:76\n",
" length:75\n"
]
}
],
"source": [
"source_se = torch.load(f'{ckpt_base}/en_style_se.pth').to(device)\n",
"save_path = f'{output_dir}/output_whispering.wav'\n",
"\n",
"# Run the base speaker tts\n",
"text = \"This audio is generated by open voice with a half-performance model.\"\n",
"text = \"This audio is generated by OpenVoice with a half-performance model.\"\n",
"src_path = f'{output_dir}/tmp.wav'\n",
"base_speaker_tts.tts(text, src_path, speaker='whispering', language='English', speed=0.9)\n",
"\n",
@@ -172,10 +228,47 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"id": "a71d1387",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Building prefix dict from the default dictionary ...\n",
"Loading model from cache /tmp/jieba.cache\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded checkpoint 'checkpoints/base_speakers/ZH/checkpoint.pth'\n",
"missing/unexpected keys: [] []\n",
" > Text splitted to sentences.\n",
"今天天气真好, 我们一起出去吃饭吧.\n",
" > ===========================\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading model cost 0.808 seconds.\n",
"Prefix dict has been built successfully.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"tʃ⁼in→tʰjɛn→tʰjɛn→tʃʰi↓ ts`⁼ən→ xɑʊ↓↑, wo↓↑mən i↓tʃʰi↓↑ ts`ʰu→tʃʰɥ↓ ts`ʰɹ`→fan↓ p⁼a.\n",
" length:85\n",
" length:85\n"
]
}
],
"source": [
"\n",
"ckpt_base = 'checkpoints/base_speakers/ZH'\n",

View File

@@ -6,5 +6,6 @@ numpy==1.22.0
eng_to_ipa==0.0.2
inflect==7.0.0
unidecode==1.3.7
whisper-timestamped==1.14.2
openai
python-dotenv
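
The new whisper-timestamped==1.14.2 pin backs the VAD-based splitting added to se_extractor.py below. A quick sketch of the two helpers it supplies, using the same arguments as split_audio_vad; the mp3 path is the example reference file from the notebook and is only illustrative:

from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments

audio = get_audio_tensor('resources/example_reference.mp3')  # audio tensor at 16 kHz
segments = get_vad_segments(
    audio,
    output_sample=True,        # boundaries in samples rather than seconds
    min_speech_duration=0.1,
    min_silence_duration=1,
    method="silero",
)
print([(seg["start"], seg["end"]) for seg in segments])  # list of sample-index pairs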

View File

@@ -4,11 +4,12 @@ import torch
from glob import glob
from pydub import AudioSegment
from faster_whisper import WhisperModel
from whisper_timestamped.transcribe import get_audio_tensor, remove_non_speech, get_vad_segments
model_size = "medium"
# Run on GPU with FP16
model = None
def split_audio(audio_path, target_dir='processed'):
def split_audio_whisper(audio_path, target_dir='processed'):
global model
if model is None:
model = WhisperModel(model_size, device="cuda", compute_type="float16")
@@ -67,7 +68,47 @@ def split_audio(audio_path, target_dir='processed'):
return wavs_folder
def get_se(audio_path, vc_model, target_dir='processed'):
def split_audio_vad(audio_path, target_dir, split_seconds=10):
SAMPLE_RATE = 16000
audio_vad = get_audio_tensor(audio_path)
segments = get_vad_segments(
audio_vad,
output_sample=True,
min_speech_duration=0.1,
min_silence_duration=1,
method="silero",
)
segments = [(seg["start"], seg["end"]) for seg in segments]
segments = [(float(s)/SAMPLE_RATE, float(e)/SAMPLE_RATE) for s,e in segments]
print(segments)
audio_active = AudioSegment.silent(duration=0)
audio = AudioSegment.from_file(audio_path)
for start_time, end_time in segments:
audio_active += audio[int( start_time * 1000) : int(end_time * 1000)]
audio_dur = audio_active.duration_seconds
print(f'after vad: dur = {audio_dur}')
audio_name = os.path.basename(audio_path).rsplit('.', 1)[0]
target_folder = os.path.join(target_dir, audio_name)
wavs_folder = os.path.join(target_folder, 'wavs')
os.makedirs(wavs_folder, exist_ok=True)
start_time = 0.
count = 0
while start_time < audio_dur:
end_time = min(start_time + split_seconds, audio_dur)
output_file = f"{wavs_folder}/{audio_name}_seg{count}.wav"
audio_seg = audio_active[int(start_time * 1000): int(end_time * 1000)]
audio_seg.export(output_file, format='wav')
start_time = end_time
count += 1
return wavs_folder
def get_se(audio_path, vc_model, target_dir='processed', vad=True):
device = vc_model.device
audio_name = os.path.basename(audio_path).rsplit('.', 1)[0]
@@ -78,8 +119,10 @@ def get_se(audio_path, vc_model, target_dir='processed'):
return se, audio_name
if os.path.isdir(audio_path):
wavs_folder = audio_path
elif vad:
wavs_folder = split_audio_vad(audio_path, target_dir)
else:
wavs_folder = split_audio(audio_path, target_dir)
wavs_folder = split_audio_whisper(audio_path, target_dir)
audio_segs = glob(f'{wavs_folder}/*.wav')
if len(audio_segs) == 0:
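
For reference, a self-contained sketch of the arithmetic split_audio_vad performs: sample indices from the VAD are converted to seconds, the kept speech is treated as one concatenated stream, and that stream is cut into fixed-length chunks. The VAD output here is made up so the snippet runs without any audio file or model; the real function slices a pydub AudioSegment at these boundaries and exports each chunk as a wav:

SAMPLE_RATE = 16000
split_seconds = 10

# Pretend get_vad_segments returned these (start, end) pairs in samples.
segments = [(8000, 40000), (72000, 240000)]

# Convert sample indices to seconds, as split_audio_vad does.
segments = [(s / SAMPLE_RATE, e / SAMPLE_RATE) for s, e in segments]
print(segments)                                # [(0.5, 2.5), (4.5, 15.0)]

# Duration of the concatenated "active" audio kept by the VAD.
audio_dur = sum(e - s for s, e in segments)    # 2.0 + 10.5 = 12.5 seconds

# Fixed-length chunking of that active audio, mirroring the while-loop above.
start_time, count = 0.0, 0
while start_time < audio_dur:
    end_time = min(start_time + split_seconds, audio_dur)
    print(f"seg{count}: {start_time:.1f}s to {end_time:.1f}s")
    start_time, count = end_time, count + 1
# seg0: 0.0s to 10.0s
# seg1: 10.0s to 12.5s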