From 72b0b67994ebb128ba7866e9e69e7ac7080fe265 Mon Sep 17 00:00:00 2001
From: Mangio621 <cole.p.mangio@gmail.com>
Date: Thu, 25 May 2023 21:06:37 +1000
Subject: [PATCH] Added hybrid f0 on training (f0 feature extraction).
 EXPERIMENTAL. Also added dio to the hybrid method options. Also fixed normal
 dio inference including a median filter.

---
 README.md            |   7 ++-
 extract_f0_print.py  | 135 +++++++++++++++++++++++++++++++++++++++++++
 vc_infer_pipeline.py |  27 ++++++++-
 3 files changed, 167 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 709a2c0..ccbec18 100644
--- a/README.md
+++ b/README.md
@@ -54,6 +54,7 @@ Special thanks to discord user @kalomaze#2983 for creating a temporary colab not
 + Added CLI functionality
   + added --is_cli flag on infer-web.py to use the CLI system.
 + f0 hybrid (median) estimation method by calculating nanmedian for a specified array of f0 methods to get the best of all worlds for all specified f0 methods. Only for CLI right now. Soon to be implemented into GUI 🌟
++ f0 hybrid (median) estimation method for f0 feature extraction (training). VERY EXPERIMENTAL AND PROBABLY EXTREMELY BUGGY. Feature extraction with the hybrid method will take MUCH longer.
 
 ## This repository has the following features too:
 + Reduce tone leakage by replacing source feature to training-set feature using top1 retrieval;
@@ -88,9 +89,13 @@ hybrid[pm+harvest+crepe]
 
 Many f0 methods may be used. But are to be split with a delimiter of the '+' character. Keep in mind that inference will take much longer as we are calculating f0 X more times.
 
-# About this fork's crepe training: 
+# About this fork's additional f0 training features
+## Crepe f0 feature extraction
 Crepe training is still incredibly instable and there's been report of a memory leak. This will be fixed in the future, however it works quite well on paperspace machines. Please note that crepe training adds a little bit of difference against a harvest trained model. Crepe sounds clearer on some parts, but sounds more robotic on some parts too. Both I would say are equally good to train with, but I still think crepe on INFERENCE is not only quicker, but more pitch stable (especially with vocal layers). Right now, its quite stable to train with a harvest model and infer it with crepe. If you are training with crepe however (f0 feature extraction), please make sure your datasets are as dry as possible to reduce artifacts and unwanted harmonics as I assume the crepe pitch estimation latches on to reverb more.
 
+## Hybrid f0 feature extraction
+Only for the CLI (not implemented in the GUI yet). Usage is basically the same as described in this readme's section on f0 hybrid inference: instead of passing "harvest" as the f0 method argument on the f0 feature extraction page, you would pass "hybrid[harvest+dio+pm+crepe]", for example. The f0 nanmedian hybrid method takes a very long time during feature extraction, so please be patient if you choose to use it.
+
 ## If you get CUDA issues with crepe training, or pm and harvest etc.
 This is due to the number of processes (n_p) being too high. Make sure to cut the number of threads down. Please lower the value of the "Number of CPU Threads to use" slider on the feature extraction GUI.  
 
diff --git a/extract_f0_print.py b/extract_f0_print.py
index 9053235..fee0ab8 100644
--- a/extract_f0_print.py
+++ b/extract_f0_print.py
@@ -9,6 +9,7 @@ import numpy as np, logging
 import torchcrepe # Fork Feature. Crepe algo for training and preprocess
 import torch
 from torch import Tensor # Fork Feature. Used for pitch prediction for torch crepe.
+import scipy.signal as signal # Fork Feature hybrid inference
 
 logging.getLogger("numba").setLevel(logging.WARNING)
 from multiprocessing import Process
@@ -40,6 +41,126 @@ class FeatureInput(object):
         self.f0_min = 50.0
         self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
         self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
+    
+    # EXPERIMENTAL. PROBABLY BUGGY
+    def get_f0_hybrid_computation(
+        self,
+        methods_str,
+        x,
+        f0_min,
+        f0_max,
+        p_len,
+        crepe_hop_length,
+        time_step,
+    ):
+        """Return the per-frame nanmedian of several f0 estimates (EXPERIMENTAL).
+
+        methods_str looks like "hybrid[harvest+dio+pm+crepe]"; the names inside
+        the brackets are split on "+" and must be pm, crepe, harvest or dio.
+        x is the audio array; it is cast to float32 and normalised here.
+        f0_min, f0_max and time_step (milliseconds) configure "pm"; p_len is the
+        frame count "pm" is padded to and "crepe" is resampled to, and
+        crepe_hop_length is the hop length handed to torchcrepe.predict.
+        """
+        spec = methods_str.split("hybrid")[1]
+        methods = spec.replace("[", "").replace("]", "").split("+")
+        f0_computation_stack = []
+
+        print("Calculating f0 pitch estimations for methods: %s" % str(methods))
+        x = x.astype(np.float32)
+        x /= np.quantile(np.abs(x), 0.999)
+        # Get f0 calculations for all methods specified
+        for method in methods:
+            if method == "pm":
+                # Sample rate is self.fs, matching every other estimator below.
+                f0 = (
+                    parselmouth.Sound(x, self.fs)
+                    .to_pitch_ac(
+                        time_step=time_step / 1000,
+                        voicing_threshold=0.6,
+                        pitch_floor=f0_min,
+                        pitch_ceiling=f0_max,
+                    )
+                    .selected_array["frequency"]
+                )
+                pad_size = (p_len - len(f0) + 1) // 2
+                if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+                    f0 = np.pad(
+                        f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
+                    )
+            elif method == "crepe":
+                print("Performing crepe pitch extraction. (EXPERIMENTAL)")
+                print("CREPE PITCH EXTRACTION HOP LENGTH: " + str(crepe_hop_length))
+                torch_device_index = 0
+                if torch.cuda.is_available():
+                    torch_device = torch.device(f"cuda:{torch_device_index % torch.cuda.device_count()}")
+                elif torch.backends.mps.is_available():
+                    torch_device = torch.device("mps")
+                else:
+                    torch_device = torch.device("cpu")
+                audio = torch.from_numpy(x).to(torch_device, copy=True)
+                audio = torch.unsqueeze(audio, dim=0)
+                if audio.ndim == 2 and audio.shape[0] > 1:
+                    audio = torch.mean(audio, dim=0, keepdim=True).detach()
+                audio = audio.detach()
+                print(
+                    "Initiating f0 Crepe Feature Extraction with an extraction_crepe_hop_length of: " +
+                    str(crepe_hop_length)
+                )
+                pitch: Tensor = torchcrepe.predict(
+                    audio,
+                    self.fs,
+                    crepe_hop_length,
+                    self.f0_min,
+                    self.f0_max,
+                    "full",
+                    batch_size=crepe_hop_length * 2,
+                    device=torch_device,
+                    pad=True,
+                )
+                p_len = p_len or x.shape[0] // crepe_hop_length
+                # Values below 0.001 become NaN, then the track is resampled to p_len frames.
+                source = np.array(pitch.squeeze(0).cpu().float().numpy())
+                source[source < 0.001] = np.nan
+                target = np.interp(
+                    np.arange(0, len(source) * p_len, len(source)) / p_len,
+                    np.arange(0, len(source)),
+                    source,
+                )
+                f0 = np.nan_to_num(target)
+            elif method == "harvest":
+                f0, t = pyworld.harvest(
+                    x.astype(np.double),
+                    fs=self.fs,
+                    f0_ceil=self.f0_max,
+                    f0_floor=self.f0_min,
+                    frame_period=1000 * self.hop / self.fs,
+                )
+                f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
+                f0 = signal.medfilt(f0, 3)
+                f0 = f0[1:]
+            elif method == "dio":
+                f0, t = pyworld.dio(
+                    x.astype(np.double),
+                    fs=self.fs,
+                    f0_ceil=self.f0_max,
+                    f0_floor=self.f0_min,
+                    frame_period=1000 * self.hop / self.fs,
+                )
+                f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
+                f0 = signal.medfilt(f0, 3)
+                f0 = f0[1:]
+            else:
+                raise ValueError("Unsupported f0 method in hybrid spec: %s" % method)
+            f0_computation_stack.append(f0)
+
+        for fc in f0_computation_stack:
+            print(len(fc))
+        print("Calculating hybrid median f0 from the stack of: %s" % str(methods))
+        # A single estimate is returned as-is; several are combined by nanmedian.
+        if len(f0_computation_stack) == 1:
+            return f0_computation_stack[0]
+        return np.nanmedian(f0_computation_stack, axis=0)
 
     def compute_f0(self, path, f0_method, crepe_hop_length):
         x = load_audio(path, self.fs)
@@ -123,6 +244,20 @@ class FeatureInput(object):
                 source
             )
             f0 = np.nan_to_num(target)
+        elif "hybrid" in f0_method: # EXPERIMENTAL
+            # Perform hybrid median pitch estimation
+            time_step = 160 / 16000 * 1000
+            f0 = self.get_f0_hybrid_computation(
+                f0_method, 
+                x,
+                self.f0_min,
+                self.f0_max,
+                p_len,
+                crepe_hop_length,
+                time_step
+            )
+        # Mangio-RVC-Fork Feature: Add hybrid f0 inference to feature extraction. EXPERIMENTAL...
+
         return f0
 
     def coarse_f0(self, f0):
diff --git a/vc_infer_pipeline.py b/vc_infer_pipeline.py
index ddd80bb..9306ae0 100644
--- a/vc_infer_pipeline.py
+++ b/vc_infer_pipeline.py
@@ -178,6 +178,17 @@ class VC(object):
                 if filter_radius > 2:
                     f0 = signal.medfilt(f0, 3)
                 f0 = f0[1:] # Get rid of first frame.
+            elif method == "dio": # Potentially buggy?
+                f0, t = pyworld.dio(
+                    x.astype(np.double),
+                    fs=self.sr,
+                    f0_ceil=f0_max,
+                    f0_floor=f0_min,
+                    frame_period=10
+                )
+                f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
+                f0 = signal.medfilt(f0, 3)
+                f0 = f0[1:]
             #elif method == "pyin": Not Working just yet
             #    f0 = self.get_f0_pyin_computation(x, f0_min, f0_max)
             # Push method to the stack
@@ -187,7 +198,11 @@ class VC(object):
             print(len(fc))
 
         print("Calculating hybrid median f0 from the stack of: %s" % str(methods))
-        f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0)
+        # A single estimate is returned as-is; several are combined by nanmedian.
+        if len(f0_computation_stack) == 1:
+            f0_median_hybrid = f0_computation_stack[0]
+        else:
+            f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0)
         return f0_median_hybrid
 
     def get_f0(
@@ -228,6 +243,16 @@ class VC(object):
             f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
             if filter_radius > 2:
                 f0 = signal.medfilt(f0, 3)
+        elif f0_method == "dio": # Potentially Buggy?
+            f0, t = pyworld.dio(
+                x.astype(np.double),
+                fs=self.sr,
+                f0_ceil=f0_max,
+                f0_floor=f0_min,
+                frame_period=10
+            )
+            f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
+            f0 = signal.medfilt(f0, 3)
         elif f0_method == "crepe": # Fork Feature: Adding a new f0 algorithm called crepe
             f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length)
         elif f0_method == "crepe-tiny": # Fork Feature add crepe-tiny model