Mirror of https://github.com/myshell-ai/OpenVoice.git (synced 2025-12-16 08:27:48 +01:00)
add vad
demo_part1.ipynb
@@ -10,10 +10,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "b7f043ee",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/zwl/anaconda3/envs/openvoice/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      " from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Importing the dtw module. When using in academic works please cite:\n",
+      " T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.\n",
+      " J. Stat. Soft., doi:10.18637/jss.v031.i07.\n",
+      "\n"
+     ]
+    }
+   ],
    "source": [
     "import os\n",
     "import torch\n",
@@ -31,10 +50,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "aacad912",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded checkpoint 'checkpoints/base_speakers/EN/checkpoint.pth'\n",
+      "missing/unexpected keys: [] []\n",
+      "Loaded checkpoint 'checkpoints/converter/checkpoint.pth'\n",
+      "missing/unexpected keys: [] []\n"
+     ]
+    }
+   ],
    "source": [
     "ckpt_base = 'checkpoints/base_speakers/EN'\n",
     "ckpt_converter = 'checkpoints/converter'\n",
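For orientation, the cell whose output changes here is the checkpoint-loading cell; its body is not shown in this hunk. A minimal sketch of what such a cell plausibly contains, assuming the BaseSpeakerTTS and ToneColorConverter classes from the repo's top-level api module (the checkpoint paths are the ones visible in the diff):

import torch
from api import BaseSpeakerTTS, ToneColorConverter  # assumed import; api.py sits at the repo root in this era

ckpt_base = 'checkpoints/base_speakers/EN'
ckpt_converter = 'checkpoints/converter'
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
output_dir = 'outputs'

# load_ckpt prints the "Loaded checkpoint ... missing/unexpected keys" lines
# captured in the new cell output above.
base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)
base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')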
@@ -70,7 +100,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "id": "63ff6273",
    "metadata": {},
    "outputs": [],
@@ -88,13 +118,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "id": "55105eae",
    "metadata": {},
    "outputs": [],
    "source": [
     "reference_speaker = 'resources/example_reference.mp3'\n",
-    "target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter)"
+    "target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir='processed', vad=True)"
    ]
   },
   {
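The one functional notebook change is this call: the reference speaker embedding is now extracted through voice activity detection. A before/after sketch of the call, assuming se_extractor.py is importable from the notebook's working directory (its new signature appears in the se_extractor.py hunks below):

import se_extractor  # the module patched later in this commit

reference_speaker = 'resources/example_reference.mp3'

# before: whisper-based splitting, default target_dir
# target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter)

# after: silero-VAD splitting, explicit output directory
target_se, audio_name = se_extractor.get_se(
    reference_speaker,
    tone_color_converter,
    target_dir='processed',  # segments land under processed/<name>/wavs
    vad=True,                # routes through the new split_audio_vad
)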
@@ -107,15 +137,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
    "id": "73dc1259",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      " > Text splitted to sentences.\n",
+      "This audio is generated by OpenVoice.\n",
+      " > ===========================\n",
+      "ðɪs ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs.\n",
+      " length:45\n",
+      " length:45\n"
+     ]
+    }
+   ],
    "source": [
     "save_path = f'{output_dir}/output_en_default.wav'\n",
     "\n",
     "# Run the base speaker tts\n",
-    "text = \"This audio is generated by open voice.\"\n",
+    "text = \"This audio is generated by OpenVoice.\"\n",
     "src_path = f'{output_dir}/tmp.wav'\n",
     "base_speaker_tts.tts(text, src_path, speaker='default', language='English', speed=1.0)\n",
     "\n",
@@ -139,16 +182,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
    "id": "fd022d38",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      " > Text splitted to sentences.\n",
+      "This audio is generated by OpenVoice with a half-performance model.\n",
+      " > ===========================\n",
+      "ðɪs ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs wɪθ ə half-peɹfoɹmance* ˈmɑdəɫ.\n",
+      " length:76\n",
+      " length:75\n"
+     ]
+    }
+   ],
    "source": [
     "source_se = torch.load(f'{ckpt_base}/en_style_se.pth').to(device)\n",
     "save_path = f'{output_dir}/output_whispering.wav'\n",
     "\n",
     "# Run the base speaker tts\n",
-    "text = \"This audio is generated by open voice with a half-performance model.\"\n",
+    "text = \"This audio is generated by OpenVoice with a half-performance model.\"\n",
     "src_path = f'{output_dir}/tmp.wav'\n",
     "base_speaker_tts.tts(text, src_path, speaker='whispering', language='English', speed=0.9)\n",
     "\n",
@@ -172,10 +228,47 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "id": "a71d1387",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Building prefix dict from the default dictionary ...\n",
+      "Loading model from cache /tmp/jieba.cache\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded checkpoint 'checkpoints/base_speakers/ZH/checkpoint.pth'\n",
+      "missing/unexpected keys: [] []\n",
+      " > Text splitted to sentences.\n",
+      "今天天气真好, 我们一起出去吃饭吧.\n",
+      " > ===========================\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading model cost 0.808 seconds.\n",
+      "Prefix dict has been built successfully.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tʃ⁼in→tʰjɛn→tʰjɛn→tʃʰi↓ ts`⁼ən→ xɑʊ↓↑, wo↓↑mən i↓tʃʰi↓↑ ts`ʰu→tʃʰɥ↓ ts`ʰɹ`→fan↓ p⁼a.\n",
+      " length:85\n",
+      " length:85\n"
+     ]
+    }
+   ],
    "source": [
     "\n",
     "ckpt_base = 'checkpoints/base_speakers/ZH'\n",
requirements.txt
@@ -6,5 +6,6 @@ numpy==1.22.0
 eng_to_ipa==0.0.2
 inflect==7.0.0
 unidecode==1.3.7
+whisper-timestamped==1.14.2
 openai
 python-dotenv
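The new whisper-timestamped pin supplies the VAD helpers that se_extractor.py imports below. A hypothetical post-install sanity check, using exactly the names the next hunk imports:

# Hypothetical check after `pip install -r requirements.txt`:
from whisper_timestamped.transcribe import get_audio_tensor, remove_non_speech, get_vad_segments
print('vad helpers available')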
se_extractor.py
@@ -4,11 +4,12 @@ import torch
 from glob import glob
 from pydub import AudioSegment
 from faster_whisper import WhisperModel
+from whisper_timestamped.transcribe import get_audio_tensor, remove_non_speech, get_vad_segments

 model_size = "medium"
 # Run on GPU with FP16
 model = None
-def split_audio(audio_path, target_dir='processed'):
+def split_audio_whisper(audio_path, target_dir='processed'):
     global model
     if model is None:
         model = WhisperModel(model_size, device="cuda", compute_type="float16")
@@ -67,7 +68,47 @@ def split_audio(audio_path, target_dir='processed'):
     return wavs_folder


-def get_se(audio_path, vc_model, target_dir='processed'):
+def split_audio_vad(audio_path, target_dir, split_seconds=10):
+    SAMPLE_RATE = 16000
+    audio_vad = get_audio_tensor(audio_path)
+    segments = get_vad_segments(
+        audio_vad,
+        output_sample=True,
+        min_speech_duration=0.1,
+        min_silence_duration=1,
+        method="silero",
+    )
+    segments = [(seg["start"], seg["end"]) for seg in segments]
+    segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s, e in segments]
+    print(segments)
+    audio_active = AudioSegment.silent(duration=0)
+    audio = AudioSegment.from_file(audio_path)
+
+    for start_time, end_time in segments:
+        audio_active += audio[int(start_time * 1000): int(end_time * 1000)]
+
+    audio_dur = audio_active.duration_seconds
+    print(f'after vad: dur = {audio_dur}')
+    audio_name = os.path.basename(audio_path).rsplit('.', 1)[0]
+    target_folder = os.path.join(target_dir, audio_name)
+    wavs_folder = os.path.join(target_folder, 'wavs')
+    os.makedirs(wavs_folder, exist_ok=True)
+    start_time = 0.
+    count = 0
+    while start_time < audio_dur:
+        end_time = min(start_time + split_seconds, audio_dur)
+        output_file = f"{wavs_folder}/{audio_name}_seg{count}.wav"
+        audio_seg = audio_active[int(start_time * 1000): int(end_time * 1000)]
+        audio_seg.export(output_file, format='wav')
+        start_time = end_time
+        count += 1
+    return wavs_folder
+
+
+
+
+
+def get_se(audio_path, vc_model, target_dir='processed', vad=True):
     device = vc_model.device

     audio_name = os.path.basename(audio_path).rsplit('.', 1)[0]
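A note on the arithmetic in split_audio_vad: get_vad_segments(..., output_sample=True) returns boundaries in samples at 16 kHz, the division by SAMPLE_RATE converts them to seconds, and pydub slices in milliseconds, hence the * 1000 factors. A short standalone usage sketch, assuming se_extractor.py is importable and the example file from the demo exists:

from se_extractor import split_audio_vad  # the function added above

# e.g. a VAD boundary pair (8000, 40000) becomes 8000/16000 = 0.5 s
# to 40000/16000 = 2.5 s, i.e. audio[500:2500] in pydub milliseconds;
# the concatenated speech is then re-cut into fixed 10-second wavs.
wavs_folder = split_audio_vad('resources/example_reference.mp3', target_dir='processed')
print(wavs_folder)  # processed/example_reference/wavs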
@@ -78,8 +119,10 @@ def get_se(audio_path, vc_model, target_dir='processed'):
         return se, audio_name
     if os.path.isdir(audio_path):
         wavs_folder = audio_path
+    elif vad:
+        wavs_folder = split_audio_vad(audio_path, target_dir)
     else:
-        wavs_folder = split_audio(audio_path, target_dir)
+        wavs_folder = split_audio_whisper(audio_path, target_dir)

     audio_segs = glob(f'{wavs_folder}/*.wav')
     if len(audio_segs) == 0:
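Because vad defaults to True, existing callers of get_se silently switch to the VAD path, while the whisper-based splitter (renamed split_audio_whisper) stays reachable. A hedged example of opting back out, reusing the notebook objects from above:

# Opting back into the old whisper-based splitting:
target_se, audio_name = se_extractor.get_se(
    'resources/example_reference.mp3',
    tone_color_converter,
    target_dir='processed',
    vad=False,  # falls through to split_audio_whisper (the renamed split_audio)
)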