Mirror of https://github.com/myshell-ai/OpenVoice.git (synced 2025-12-16 08:27:48 +01:00)
add vad
demo_part1.ipynb
@@ -10,10 +10,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "b7f043ee",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/zwl/anaconda3/envs/openvoice/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      " from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Importing the dtw module. When using in academic works please cite:\n",
+      " T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.\n",
+      " J. Stat. Soft., doi:10.18637/jss.v031.i07.\n",
+      "\n"
+     ]
+    }
+   ],
    "source": [
     "import os\n",
     "import torch\n",
@@ -31,10 +50,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "aacad912",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded checkpoint 'checkpoints/base_speakers/EN/checkpoint.pth'\n",
+      "missing/unexpected keys: [] []\n",
+      "Loaded checkpoint 'checkpoints/converter/checkpoint.pth'\n",
+      "missing/unexpected keys: [] []\n"
+     ]
+    }
+   ],
    "source": [
     "ckpt_base = 'checkpoints/base_speakers/EN'\n",
     "ckpt_converter = 'checkpoints/converter'\n",
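For orientation, the cell whose output changes here is the checkpoint-loading cell; its body is not shown in this hunk. A minimal sketch of what such a cell plausibly contains, assuming the BaseSpeakerTTS and ToneColorConverter classes from the repo's top-level api module (the checkpoint paths are the ones visible in the diff):

import torch
from api import BaseSpeakerTTS, ToneColorConverter  # assumed import; api.py sits at the repo root in this era

ckpt_base = 'checkpoints/base_speakers/EN'
ckpt_converter = 'checkpoints/converter'
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
output_dir = 'outputs'

# load_ckpt prints the "Loaded checkpoint ... missing/unexpected keys" lines
# captured in the new cell output above.
base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)
base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')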
@@ -70,7 +100,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "id": "63ff6273",
    "metadata": {},
    "outputs": [],
@@ -88,13 +118,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "id": "55105eae",
    "metadata": {},
    "outputs": [],
    "source": [
     "reference_speaker = 'resources/example_reference.mp3'\n",
-    "target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter)"
+    "target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir='processed', vad=True)"
    ]
   },
   {
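The one functional notebook change is this call: the reference speaker embedding is now extracted through voice activity detection. A before/after sketch of the call, assuming se_extractor.py is importable from the notebook's working directory (its new signature appears in the se_extractor.py hunks below):

import se_extractor  # the module patched later in this commit

reference_speaker = 'resources/example_reference.mp3'

# before: whisper-based splitting, default target_dir
# target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter)

# after: silero-VAD splitting, explicit output directory
target_se, audio_name = se_extractor.get_se(
    reference_speaker,
    tone_color_converter,
    target_dir='processed',  # segments land under processed/<name>/wavs
    vad=True,                # routes through the new split_audio_vad
)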
@@ -107,15 +137,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
    "id": "73dc1259",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      " > Text splitted to sentences.\n",
+      "This audio is generated by OpenVoice.\n",
+      " > ===========================\n",
+      "ðɪs ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs.\n",
+      " length:45\n",
+      " length:45\n"
+     ]
+    }
+   ],
    "source": [
     "save_path = f'{output_dir}/output_en_default.wav'\n",
     "\n",
     "# Run the base speaker tts\n",
-    "text = \"This audio is generated by open voice.\"\n",
+    "text = \"This audio is generated by OpenVoice.\"\n",
     "src_path = f'{output_dir}/tmp.wav'\n",
     "base_speaker_tts.tts(text, src_path, speaker='default', language='English', speed=1.0)\n",
     "\n",
@@ -139,16 +182,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
    "id": "fd022d38",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      " > Text splitted to sentences.\n",
+      "This audio is generated by OpenVoice with a half-performance model.\n",
+      " > ===========================\n",
+      "ðɪs ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs wɪθ ə half-peɹfoɹmance* ˈmɑdəɫ.\n",
+      " length:76\n",
+      " length:75\n"
+     ]
+    }
+   ],
    "source": [
     "source_se = torch.load(f'{ckpt_base}/en_style_se.pth').to(device)\n",
     "save_path = f'{output_dir}/output_whispering.wav'\n",
     "\n",
     "# Run the base speaker tts\n",
-    "text = \"This audio is generated by open voice with a half-performance model.\"\n",
+    "text = \"This audio is generated by OpenVoice with a half-performance model.\"\n",
     "src_path = f'{output_dir}/tmp.wav'\n",
     "base_speaker_tts.tts(text, src_path, speaker='whispering', language='English', speed=0.9)\n",
     "\n",
@@ -172,10 +228,47 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "id": "a71d1387",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Building prefix dict from the default dictionary ...\n",
+      "Loading model from cache /tmp/jieba.cache\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded checkpoint 'checkpoints/base_speakers/ZH/checkpoint.pth'\n",
+      "missing/unexpected keys: [] []\n",
+      " > Text splitted to sentences.\n",
+      "今天天气真好, 我们一起出去吃饭吧.\n",
+      " > ===========================\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading model cost 0.808 seconds.\n",
+      "Prefix dict has been built successfully.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tʃ⁼in→tʰjɛn→tʰjɛn→tʃʰi↓ ts`⁼ən→ xɑʊ↓↑, wo↓↑mən i↓tʃʰi↓↑ ts`ʰu→tʃʰɥ↓ ts`ʰɹ`→fan↓ p⁼a.\n",
+      " length:85\n",
+      " length:85\n"
+     ]
+    }
+   ],
    "source": [
     "\n",
     "ckpt_base = 'checkpoints/base_speakers/ZH'\n",
requirements.txt
@@ -6,5 +6,6 @@ numpy==1.22.0
 eng_to_ipa==0.0.2
 inflect==7.0.0
 unidecode==1.3.7
+whisper-timestamped==1.14.2
 openai
 python-dotenv
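The new whisper-timestamped pin supplies the VAD helpers that se_extractor.py imports below. A hypothetical post-install sanity check, using exactly the names the next hunk imports:

# Hypothetical check after `pip install -r requirements.txt`:
from whisper_timestamped.transcribe import get_audio_tensor, remove_non_speech, get_vad_segments
print('vad helpers available')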
se_extractor.py
@@ -4,11 +4,12 @@ import torch
 from glob import glob
 from pydub import AudioSegment
 from faster_whisper import WhisperModel
+from whisper_timestamped.transcribe import get_audio_tensor, remove_non_speech, get_vad_segments

 model_size = "medium"
 # Run on GPU with FP16
 model = None
-def split_audio(audio_path, target_dir='processed'):
+def split_audio_whisper(audio_path, target_dir='processed'):
     global model
     if model is None:
         model = WhisperModel(model_size, device="cuda", compute_type="float16")
@@ -67,7 +68,47 @@ def split_audio(audio_path, target_dir='processed'):
     return wavs_folder


-def get_se(audio_path, vc_model, target_dir='processed'):
+def split_audio_vad(audio_path, target_dir, split_seconds=10):
+    SAMPLE_RATE = 16000
+    audio_vad = get_audio_tensor(audio_path)
+    segments = get_vad_segments(
+        audio_vad,
+        output_sample=True,
+        min_speech_duration=0.1,
+        min_silence_duration=1,
+        method="silero",
+    )
+    segments = [(seg["start"], seg["end"]) for seg in segments]
+    segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s, e in segments]
+    print(segments)
+    audio_active = AudioSegment.silent(duration=0)
+    audio = AudioSegment.from_file(audio_path)
+
+    for start_time, end_time in segments:
+        audio_active += audio[int(start_time * 1000): int(end_time * 1000)]
+
+    audio_dur = audio_active.duration_seconds
+    print(f'after vad: dur = {audio_dur}')
+    audio_name = os.path.basename(audio_path).rsplit('.', 1)[0]
+    target_folder = os.path.join(target_dir, audio_name)
+    wavs_folder = os.path.join(target_folder, 'wavs')
+    os.makedirs(wavs_folder, exist_ok=True)
+    start_time = 0.
+    count = 0
+    while start_time < audio_dur:
+        end_time = min(start_time + split_seconds, audio_dur)
+        output_file = f"{wavs_folder}/{audio_name}_seg{count}.wav"
+        audio_seg = audio_active[int(start_time * 1000): int(end_time * 1000)]
+        audio_seg.export(output_file, format='wav')
+        start_time = end_time
+        count += 1
+    return wavs_folder
+
+
+
+
+
+def get_se(audio_path, vc_model, target_dir='processed', vad=True):
     device = vc_model.device

     audio_name = os.path.basename(audio_path).rsplit('.', 1)[0]
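A note on the arithmetic in split_audio_vad: get_vad_segments(..., output_sample=True) returns boundaries in samples at 16 kHz, the division by SAMPLE_RATE converts them to seconds, and pydub slices in milliseconds, hence the * 1000 factors. A short standalone usage sketch, assuming se_extractor.py is importable and the example file from the demo exists:

from se_extractor import split_audio_vad  # the function added above

# e.g. a VAD boundary pair (8000, 40000) becomes 8000/16000 = 0.5 s
# to 40000/16000 = 2.5 s, i.e. audio[500:2500] in pydub milliseconds;
# the concatenated speech is then re-cut into fixed 10-second wavs.
wavs_folder = split_audio_vad('resources/example_reference.mp3', target_dir='processed')
print(wavs_folder)  # processed/example_reference/wavs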
@@ -78,8 +119,10 @@ def get_se(audio_path, vc_model, target_dir='processed'):
         return se, audio_name
     if os.path.isdir(audio_path):
         wavs_folder = audio_path
+    elif vad:
+        wavs_folder = split_audio_vad(audio_path, target_dir)
     else:
-        wavs_folder = split_audio(audio_path, target_dir)
+        wavs_folder = split_audio_whisper(audio_path, target_dir)

     audio_segs = glob(f'{wavs_folder}/*.wav')
     if len(audio_segs) == 0:
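Because vad defaults to True, existing callers of get_se silently switch to the VAD path, while the whisper-based splitter (renamed split_audio_whisper) stays reachable. A hedged example of opting back out, reusing the notebook objects from above:

# Opting back into the old whisper-based splitting:
target_se, audio_name = se_extractor.get_se(
    'resources/example_reference.mp3',
    tone_color_converter,
    target_dir='processed',
    vad=False,  # falls through to split_audio_whisper (the renamed split_audio)
)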