diff --git a/.gitignore b/.gitignore index 82c8a74..630c32e 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ __pycache__ *.pyd hubert_base.pt /logs +.venv diff --git a/LICENSE b/LICENSE index 8af94bd..4bb30b3 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,7 @@ MIT License Copyright (c) 2023 liujing04 +Copyright (c) 2023 源文雨 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -18,4 +19,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +SOFTWARE. diff --git a/MDXNet.py b/MDXNet.py index 6e996ac..19164b2 100644 --- a/MDXNet.py +++ b/MDXNet.py @@ -1,11 +1,9 @@ import soundfile as sf -import torch, pdb, time, argparse, os, warnings, sys, librosa +import torch, pdb, os, warnings, librosa import numpy as np import onnxruntime as ort -from scipy.io.wavfile import write from tqdm import tqdm import torch -import torch.nn as nn dim_c = 4 diff --git a/使用需遵守的协议-LICENSE.txt b/MIT协议暨相关引用库协议 similarity index 51% rename from 使用需遵守的协议-LICENSE.txt rename to MIT协议暨相关引用库协议 index db2094b..dbb6c6d 100644 --- a/使用需遵守的协议-LICENSE.txt +++ b/MIT协议暨相关引用库协议 @@ -1,50 +1,45 @@ -MIT License - -Copyright (c) 2023 liujing04 -Copyright (c) 2023 源文雨 - - 本软件及其相关代码以MIT协议开源,作者不对软件具备任何控制力,使用软件者、传播软件导出的声音者自负全责。 - 如不认可该条款,则不能使用或引用软件包内任何代码和文件。 - -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- -特此授予任何获得本软件和相关文档文件(以下简称“软件”)副本的人免费使用、复制、修改、合并、出版、分发、再授权和/或销售本软件的权利,以及授予本软件所提供的人使用本软件的权利,但须符合以下条件: -上述版权声明和本许可声明应包含在软件的所有副本或实质部分中。 -软件是“按原样”提供的,没有任何明示或暗示的保证,包括但不限于适销性、适用于特定目的和不侵权的保证。在任何情况下,作者或版权持有人均不承担因软件或软件的使用或其他交易而产生、产生或与之相关的任何索赔、损害赔偿或其他责任,无论是在合同诉讼、侵权诉讼还是其他诉讼中。 - -相关引用库协议如下: -################# -ContentVec -https://github.com/auspicious3000/contentvec/blob/main/LICENSE -MIT License -################# -VITS -https://github.com/jaywalnut310/vits/blob/main/LICENSE -MIT License -################# -HIFIGAN -https://github.com/jik876/hifi-gan/blob/master/LICENSE -MIT License -################# -gradio -https://github.com/gradio-app/gradio/blob/main/LICENSE -Apache License 2.0 -################# -ffmpeg -https://github.com/FFmpeg/FFmpeg/blob/master/COPYING.LGPLv3 -https://github.com/BtbN/FFmpeg-Builds/releases/download/autobuild-2021-02-28-12-32/ffmpeg-n4.3.2-160-gfbb9368226-win64-lgpl-4.3.zip -LPGLv3 License -MIT License -################# -ultimatevocalremovergui -https://github.com/Anjok07/ultimatevocalremovergui/blob/master/LICENSE -https://github.com/yang123qwe/vocal_separation_by_uvr5 -MIT License -################# -audio-slicer -https://github.com/openvpi/audio-slicer/blob/main/LICENSE -MIT License +本软件及其相关代码以MIT协议开源,作者不对软件具备任何控制力,使用软件者、传播软件导出的声音者自负全责。 +如不认可该条款,则不能使用或引用软件包内任何代码和文件。 + +特此授予任何获得本软件和相关文档文件(以下简称“软件”)副本的人免费使用、复制、修改、合并、出版、分发、再授权和/或销售本软件的权利,以及授予本软件所提供的人使用本软件的权利,但须符合以下条件: +上述版权声明和本许可声明应包含在软件的所有副本或实质部分中。 +软件是“按原样”提供的,没有任何明示或暗示的保证,包括但不限于适销性、适用于特定目的和不侵权的保证。在任何情况下,作者或版权持有人均不承担因软件或软件的使用或其他交易而产生、产生或与之相关的任何索赔、损害赔偿或其他责任,无论是在合同诉讼、侵权诉讼还是其他诉讼中。 + + +The LICENCEs for related libraries are as follows. +相关引用库协议如下: + +ContentVec +https://github.com/auspicious3000/contentvec/blob/main/LICENSE +MIT License + +VITS +https://github.com/jaywalnut310/vits/blob/main/LICENSE +MIT License + +HIFIGAN +https://github.com/jik876/hifi-gan/blob/master/LICENSE +MIT License + +gradio +https://github.com/gradio-app/gradio/blob/main/LICENSE +Apache License 2.0 + +ffmpeg +https://github.com/FFmpeg/FFmpeg/blob/master/COPYING.LGPLv3 +https://github.com/BtbN/FFmpeg-Builds/releases/download/autobuild-2021-02-28-12-32/ffmpeg-n4.3.2-160-gfbb9368226-win64-lgpl-4.3.zip +LPGLv3 License +MIT License + +ultimatevocalremovergui +https://github.com/Anjok07/ultimatevocalremovergui/blob/master/LICENSE +https://github.com/yang123qwe/vocal_separation_by_uvr5 +MIT License + +audio-slicer +https://github.com/openvpi/audio-slicer/blob/main/LICENSE +MIT License + +PySimpleGUI +https://github.com/PySimpleGUI/PySimpleGUI/blob/master/license.txt +LPGLv3 License diff --git a/Retrieval_based_Voice_Conversion_WebUI.ipynb b/Retrieval_based_Voice_Conversion_WebUI.ipynb index d1c95b2..4890daf 100644 --- a/Retrieval_based_Voice_Conversion_WebUI.ipynb +++ b/Retrieval_based_Voice_Conversion_WebUI.ipynb @@ -20,7 +20,7 @@ { "cell_type": "markdown", "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/liujing04/Retrieval-based-Voice-Conversion-WebUI/blob/main/Retrieval_based_Voice_Conversion_WebUI.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/Retrieval_based_Voice_Conversion_WebUI.ipynb)" ], "metadata": { "id": "ZFFCx5J80SGa" diff --git a/app.py b/app.py index 8688973..d8264b8 100644 --- a/app.py +++ b/app.py @@ -1,4 +1,3 @@ -import io import os import torch @@ -6,14 +5,12 @@ import torch import gradio as gr import librosa 
import numpy as np -import soundfile import logging from fairseq import checkpoint_utils -from my_utils import load_audio from vc_infer_pipeline import VC import traceback from config import Config -from infer_pack.models import ( +from lib.infer_pack.models import ( SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono, SynthesizerTrnMs768NSFsid, diff --git a/config.py b/config.py index 6d8873a..2f64e13 100644 --- a/config.py +++ b/config.py @@ -1,9 +1,10 @@ import argparse +import sys import torch from multiprocessing import cpu_count -def config_file_change_fp32(): +def use_fp32_config(): for config_file in ["32k.json", "40k.json", "48k.json"]: with open(f"configs/{config_file}", "r") as f: strr = f.read().replace("true", "false") @@ -36,11 +37,10 @@ class Config: @staticmethod def arg_parse() -> tuple: + exe = sys.executable or "python" parser = argparse.ArgumentParser() parser.add_argument("--port", type=int, default=7865, help="Listen port") - parser.add_argument( - "--pycmd", type=str, default="python", help="Python command" - ) + parser.add_argument("--pycmd", type=str, default=exe, help="Python command") parser.add_argument("--colab", action="store_true", help="Launch in colab") parser.add_argument( "--noparallel", action="store_true", help="Disable parallel processing" @@ -70,6 +70,18 @@ class Config: cmd_opts.is_cli, ) + # has_mps is only available in nightly pytorch (for now) and MasOS 12.3+. + # check `getattr` and try it for compatibility + @staticmethod + def has_mps() -> bool: + if not torch.backends.mps.is_available(): + return False + try: + torch.zeros(1).to(torch.device("mps")) + return True + except Exception: + return False + def device_config(self) -> tuple: if torch.cuda.is_available(): i_device = int(self.device.split(":")[-1]) @@ -81,11 +93,11 @@ class Config: or "1070" in self.gpu_name or "1080" in self.gpu_name ): - print("16系/10系显卡和P40强制单精度") + print("Found GPU", self.gpu_name, ", force to fp32") self.is_half = False - config_file_change_fp32() + use_fp32_config() else: - self.gpu_name = None + print("Found GPU", self.gpu_name) self.gpu_mem = int( torch.cuda.get_device_properties(i_device).total_memory / 1024 @@ -98,16 +110,16 @@ class Config: strr = f.read().replace("3.7", "3.0") with open("trainset_preprocess_pipeline_print.py", "w") as f: f.write(strr) - elif torch.backends.mps.is_available(): - print("没有发现支持的N卡, 使用MPS进行推理") + elif self.has_mps(): + print("No supported Nvidia GPU found, use MPS instead") self.device = "mps" self.is_half = False - config_file_change_fp32() + use_fp32_config() else: - print("没有发现支持的N卡, 使用CPU进行推理") + print("No supported Nvidia GPU found, use CPU instead") self.device = "cpu" self.is_half = False - config_file_change_fp32() + use_fp32_config() if self.n_cpu == 0: self.n_cpu = cpu_count() diff --git a/configs/32k.json b/configs/32k.json index d5f16d6..400b6be 100644 --- a/configs/32k.json +++ b/configs/32k.json @@ -7,7 +7,7 @@ "betas": [0.8, 0.99], "eps": 1e-9, "batch_size": 4, - "fp16_run": true, + "fp16_run": false, "lr_decay": 0.999875, "segment_size": 12800, "init_lr_ratio": 1, diff --git a/configs/40k.json b/configs/40k.json index 4ffc87b..cb30b8b 100644 --- a/configs/40k.json +++ b/configs/40k.json @@ -7,7 +7,7 @@ "betas": [0.8, 0.99], "eps": 1e-9, "batch_size": 4, - "fp16_run": true, + "fp16_run": false, "lr_decay": 0.999875, "segment_size": 12800, "init_lr_ratio": 1, diff --git a/configs/48k.json b/configs/48k.json index 2d0e05b..6875991 100644 --- a/configs/48k.json +++ b/configs/48k.json @@ -7,7 +7,7 @@ "betas": 
[0.8, 0.99], "eps": 1e-9, "batch_size": 4, - "fp16_run": true, + "fp16_run": false, "lr_decay": 0.999875, "segment_size": 11520, "init_lr_ratio": 1, diff --git a/Changelog_CN.md b/docs/Changelog_CN.md similarity index 98% rename from Changelog_CN.md rename to docs/Changelog_CN.md index 42a71ee..eb67ba5 100644 --- a/Changelog_CN.md +++ b/docs/Changelog_CN.md @@ -29,7 +29,7 @@ todolist: - 废弃32k模型的训练 ### 20230513更新 -- 清除一键包内部老版本runtime内残留的infer_pack和uvr5_pack +- 清除一键包内部老版本runtime内残留的lib.infer_pack和uvr5_pack - 修复训练集预处理伪多进程的bug - 增加harvest识别音高可选通过中值滤波削弱哑音现象,可调整中值滤波半径 - 导出音频增加后处理重采样 diff --git a/Changelog_EN.md b/docs/Changelog_EN.md similarity index 99% rename from Changelog_EN.md rename to docs/Changelog_EN.md index 8e2a5d1..20fc84c 100644 --- a/Changelog_EN.md +++ b/docs/Changelog_EN.md @@ -27,7 +27,7 @@ todolist: - v1 32k model training is no more supported ### 2023-05-13 -- Clear the redundant codes in the old version of runtime in the one-click-package: infer_pack and uvr5_pack +- Clear the redundant codes in the old version of runtime in the one-click-package: lib.infer_pack and uvr5_pack - Fix pseudo multiprocessing bug in training set preprocessing - Adding median filtering radius adjustment for harvest pitch recognize algorithm - Support post processing resampling for exporting audio diff --git a/Changelog_KO.md b/docs/Changelog_KO.md similarity index 99% rename from Changelog_KO.md rename to docs/Changelog_KO.md index 37e0891..52da1df 100644 --- a/Changelog_KO.md +++ b/docs/Changelog_KO.md @@ -33,7 +33,7 @@ ### 2023년 5월 13일 업데이트 -- 원클릭 패키지의 이전 버전 런타임 내, 불필요한 코드(infer_pack 및 uvr5_pack) 제거. +- 원클릭 패키지의 이전 버전 런타임 내, 불필요한 코드(lib.infer_pack 및 uvr5_pack) 제거. - 훈련 세트 전처리의 유사 다중 처리 버그 수정. - Harvest 피치 인식 알고리즘에 대한 중위수 필터링 반경 조정 추가. - 오디오 내보낼 때, 후처리 리샘플링 지원. diff --git a/docs/README.en.md b/docs/README.en.md index 211571d..40b357e 100644 --- a/docs/README.en.md +++ b/docs/README.en.md @@ -3,12 +3,12 @@

Retrieval-based-Voice-Conversion-WebUI

An easy-to-use Voice Conversion framework based on VITS.

-[![madewithlove](https://forthebadge.com/images/badges/built-with-love.svg)](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI)
+[![madewithlove](https://forthebadge.com/images/badges/built-with-love.svg)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)
-[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/liujing04/Retrieval-based-Voice-Conversion-WebUI/blob/main/Retrieval_based_Voice_Conversion_WebUI.ipynb) -[![Licence](https://img.shields.io/github/license/liujing04/Retrieval-based-Voice-Conversion-WebUI?style=for-the-badge)](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI/blob/main/%E4%BD%BF%E7%94%A8%E9%9C%80%E9%81%B5%E5%AE%88%E7%9A%84%E5%8D%8F%E8%AE%AE-LICENSE.txt) +[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/Retrieval_based_Voice_Conversion_WebUI.ipynb) +[![Licence](https://img.shields.io/github/license/RVC-Project/Retrieval-based-Voice-Conversion-WebUI?style=for-the-badge)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/LICENSE) [![Huggingface](https://img.shields.io/badge/🤗%20-Spaces-yellow.svg?style=for-the-badge)](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/) [![Discord](https://img.shields.io/badge/RVC%20Developers-Discord-7289DA?style=for-the-badge&logo=discord&logoColor=white)](https://discord.gg/HcsmBBGyVk) @@ -16,7 +16,7 @@ An easy-to-use Voice Conversion framework based on VITS.

------ -[**Changelog**](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI/blob/main/Changelog_CN.md) | [**FAQ (Frequently Asked Questions)**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/wiki/FAQ-(Frequently-Asked-Questions)) +[**Changelog**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/docs/Changelog_EN.md) | [**FAQ (Frequently Asked Questions)**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/wiki/FAQ-(Frequently-Asked-Questions)) [**English**](./README.en.md) | [**中文简体**](../README.md) | [**日本語**](./README.ja.md) | [**한국어**](./README.ko.md) ([**韓國語**](./README.ko.han.md)) @@ -50,7 +50,7 @@ The following commands need to be executed in the environment of Python version # Reference: https://pytorch.org/get-started/locally/ pip install torch torchvision torchaudio -#For Windows + Nvidia Ampere Architecture(RTX30xx), you need to specify the cuda version corresponding to pytorch according to the experience of https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI/issues/21 +#For Windows + Nvidia Ampere Architecture(RTX30xx), you need to specify the cuda version corresponding to pytorch according to the experience of https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/issues/21 #pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117 # Install the Poetry dependency management tool, skip if installed @@ -104,7 +104,7 @@ There's also a tutorial on RVC in Chinese and you can check it out if needed. + [audio-slicer](https://github.com/openvpi/audio-slicer) ## Thanks to all contributors for their efforts - - + + diff --git a/docs/README.ja.md b/docs/README.ja.md index cf47bd5..26ce3af 100644 --- a/docs/README.ja.md +++ b/docs/README.ja.md @@ -3,12 +3,12 @@

Retrieval-based-Voice-Conversion-WebUI

VITSに基づく使いやすい音声変換(voice changer)framework

-[![madewithlove](https://forthebadge.com/images/badges/built-with-love.svg)](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI)
+[![madewithlove](https://forthebadge.com/images/badges/built-with-love.svg)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)
-[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/liujing04/Retrieval-based-Voice-Conversion-WebUI/blob/main/Retrieval_based_Voice_Conversion_WebUI.ipynb) -[![Licence](https://img.shields.io/github/license/liujing04/Retrieval-based-Voice-Conversion-WebUI?style=for-the-badge)](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI/blob/main/%E4%BD%BF%E7%94%A8%E9%9C%80%E9%81%B5%E5%AE%88%E7%9A%84%E5%8D%8F%E8%AE%AE-LICENSE.txt) +[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/Retrieval_based_Voice_Conversion_WebUI.ipynb) +[![Licence](https://img.shields.io/github/license/RVC-Project/Retrieval-based-Voice-Conversion-WebUI?style=for-the-badge)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/LICENSE) [![Huggingface](https://img.shields.io/badge/🤗%20-Spaces-yellow.svg?style=for-the-badge)](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/) [![Discord](https://img.shields.io/badge/RVC%20Developers-Discord-7289DA?style=for-the-badge&logo=discord&logoColor=white)](https://discord.gg/HcsmBBGyVk) @@ -17,7 +17,7 @@ VITSに基づく使いやすい音声変換(voice changer)framework

------ -[**更新日誌**](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI/blob/main/Changelog_CN.md) +[**更新日誌**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/docs/Changelog_CN.md) [**English**](./README.en.md) | [**中文简体**](../README.md) | [**日本語**](./README.ja.md) | [**한국어**](./README.ko.md) ([**韓國語**](./README.ko.han.md)) @@ -99,6 +99,6 @@ Windowsをお使いの方は、直接`RVC-beta.7z`をダウンロード後に展 + [audio-slicer](https://github.com/openvpi/audio-slicer) ## 貢献者(contributor)の皆様の尽力に感謝します - - + + diff --git a/docs/README.ko.han.md b/docs/README.ko.han.md index 2b6bbff..cac9d70 100644 --- a/docs/README.ko.han.md +++ b/docs/README.ko.han.md @@ -3,12 +3,12 @@

Retrieval-based-Voice-Conversion-WebUI

VITS基盤의 簡單하고使用하기 쉬운音聲變換틀

-[![madewithlove](https://forthebadge.com/images/badges/built-with-love.svg)](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI)
+[![madewithlove](https://forthebadge.com/images/badges/built-with-love.svg)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)
-[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/liujing04/Retrieval-based-Voice-Conversion-WebUI/blob/main/Retrieval_based_Voice_Conversion_WebUI.ipynb) -[![Licence](https://img.shields.io/github/license/liujing04/Retrieval-based-Voice-Conversion-WebUI?style=for-the-badge)](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI/blob/main/%E4%BD%BF%E7%94%A8%E9%9C%80%E9%81%B5%E5%AE%88%E7%9A%84%E5%8D%8F%E8%AE%AE-LICENSE.txt) +[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/Retrieval_based_Voice_Conversion_WebUI.ipynb) +[![Licence](https://img.shields.io/github/license/RVC-Project/Retrieval-based-Voice-Conversion-WebUI?style=for-the-badge)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/LICENSE) [![Huggingface](https://img.shields.io/badge/🤗%20-Spaces-yellow.svg?style=for-the-badge)](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/) [![Discord](https://img.shields.io/badge/RVC%20Developers-Discord-7289DA?style=for-the-badge&logo=discord&logoColor=white)](https://discord.gg/HcsmBBGyVk) @@ -16,7 +16,7 @@ VITS基盤의 簡單하고使用하기 쉬운音聲變換틀

------ -[**更新日誌**](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI/blob/main/Changelog_CN.md) +[**更新日誌**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/docs/Changelog_KO.md) [**English**](./README.en.md) | [**中文简体**](../README.md) | [**日本語**](./README.ja.md) | [**한국어**](./README.ko.md) ([**韓國語**](./README.ko.han.md)) @@ -94,7 +94,7 @@ Windows를 使用하는境遇 `RVC-beta.7z`를 다운로드 및 壓縮解除하 + [audio-slicer](https://github.com/openvpi/audio-slicer) ## 모든寄與者분들의勞力에感謝드립니다 - - + + diff --git a/docs/README.ko.md b/docs/README.ko.md index 80897ef..abea8e6 100644 --- a/docs/README.ko.md +++ b/docs/README.ko.md @@ -3,12 +3,12 @@

Retrieval-based-Voice-Conversion-WebUI

VITS 기반의 간단하고 사용하기 쉬운 음성 변환 프레임워크.

-[![madewithlove](https://forthebadge.com/images/badges/built-with-love.svg)](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI) +[![madewithlove](https://forthebadge.com/images/badges/built-with-love.svg)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)
-[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/liujing04/Retrieval-based-Voice-Conversion-WebUI/blob/main/Retrieval_based_Voice_Conversion_WebUI.ipynb) -[![Licence](https://img.shields.io/github/license/liujing04/Retrieval-based-Voice-Conversion-WebUI?style=for-the-badge)](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI/blob/main/%E4%BD%BF%E7%94%A8%E9%9C%80%E9%81%B5%E5%AE%88%E7%9A%84%E5%8D%8F%E8%AE%AE-LICENSE.txt) +[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/Retrieval_based_Voice_Conversion_WebUI.ipynb) +[![Licence](https://img.shields.io/github/license/RVC-Project/Retrieval-based-Voice-Conversion-WebUI?style=for-the-badge)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/LICENSE) [![Huggingface](https://img.shields.io/badge/🤗%20-Spaces-yellow.svg?style=for-the-badge)](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/) [![Discord](https://img.shields.io/badge/RVC%20Developers-Discord-7289DA?style=for-the-badge&logo=discord&logoColor=white)](https://discord.gg/HcsmBBGyVk) @@ -17,7 +17,7 @@ VITS 기반의 간단하고 사용하기 쉬운 음성 변환 프레임워크. - + + diff --git a/environment_dml.yaml b/environment_dml.yaml new file mode 100644 index 0000000..0fb3f22 --- /dev/null +++ b/environment_dml.yaml @@ -0,0 +1,186 @@ +name: pydml +channels: + - pytorch + - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main + - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/ + - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ + - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/ + - defaults + - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/fastai/ + - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch/ + - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/ +dependencies: + - abseil-cpp=20211102.0=hd77b12b_0 + - absl-py=1.3.0=py310haa95532_0 + - aiohttp=3.8.3=py310h2bbff1b_0 + - aiosignal=1.2.0=pyhd3eb1b0_0 + - async-timeout=4.0.2=py310haa95532_0 + - attrs=22.1.0=py310haa95532_0 + - blas=1.0=mkl + - blinker=1.4=py310haa95532_0 + - bottleneck=1.3.5=py310h9128911_0 + - brotli=1.0.9=h2bbff1b_7 + - brotli-bin=1.0.9=h2bbff1b_7 + - brotlipy=0.7.0=py310h2bbff1b_1002 + - bzip2=1.0.8=he774522_0 + - c-ares=1.19.0=h2bbff1b_0 + - ca-certificates=2023.05.30=haa95532_0 + - cachetools=4.2.2=pyhd3eb1b0_0 + - certifi=2023.5.7=py310haa95532_0 + - cffi=1.15.1=py310h2bbff1b_3 + - charset-normalizer=2.0.4=pyhd3eb1b0_0 + - click=8.0.4=py310haa95532_0 + - colorama=0.4.6=py310haa95532_0 + - contourpy=1.0.5=py310h59b6b97_0 + - cryptography=39.0.1=py310h21b164f_0 + - cycler=0.11.0=pyhd3eb1b0_0 + - fonttools=4.25.0=pyhd3eb1b0_0 + - freetype=2.12.1=ha860e81_0 + - frozenlist=1.3.3=py310h2bbff1b_0 + - giflib=5.2.1=h8cc25b3_3 + - glib=2.69.1=h5dc1a3c_2 + - google-auth=2.6.0=pyhd3eb1b0_0 + - google-auth-oauthlib=0.4.4=pyhd3eb1b0_0 + - grpc-cpp=1.48.2=hf108199_0 + - grpcio=1.48.2=py310hf108199_0 + - gst-plugins-base=1.18.5=h9e645db_0 + - gstreamer=1.18.5=hd78058f_0 + - icu=58.2=ha925a31_3 + - idna=3.4=py310haa95532_0 + - intel-openmp=2023.1.0=h59b6b97_46319 + - jpeg=9e=h2bbff1b_1 + - kiwisolver=1.4.4=py310hd77b12b_0 + - krb5=1.19.4=h5b6d351_0 + - lerc=3.0=hd77b12b_0 + - libbrotlicommon=1.0.9=h2bbff1b_7 + - libbrotlidec=1.0.9=h2bbff1b_7 + - libbrotlienc=1.0.9=h2bbff1b_7 + - 
libclang=14.0.6=default_hb5a9fac_1 + - libclang13=14.0.6=default_h8e68704_1 + - libdeflate=1.17=h2bbff1b_0 + - libffi=3.4.4=hd77b12b_0 + - libiconv=1.16=h2bbff1b_2 + - libogg=1.3.5=h2bbff1b_1 + - libpng=1.6.39=h8cc25b3_0 + - libprotobuf=3.20.3=h23ce68f_0 + - libtiff=4.5.0=h6c2663c_2 + - libuv=1.44.2=h2bbff1b_0 + - libvorbis=1.3.7=he774522_0 + - libwebp=1.2.4=hbc33d0d_1 + - libwebp-base=1.2.4=h2bbff1b_1 + - libxml2=2.10.3=h0ad7f3c_0 + - libxslt=1.1.37=h2bbff1b_0 + - lz4-c=1.9.4=h2bbff1b_0 + - markdown=3.4.1=py310haa95532_0 + - markupsafe=2.1.1=py310h2bbff1b_0 + - matplotlib=3.7.1=py310haa95532_1 + - matplotlib-base=3.7.1=py310h4ed8f06_1 + - mkl=2023.1.0=h8bd8f75_46356 + - mkl-service=2.4.0=py310h2bbff1b_1 + - mkl_fft=1.3.6=py310h4ed8f06_1 + - mkl_random=1.2.2=py310h4ed8f06_1 + - multidict=6.0.2=py310h2bbff1b_0 + - munkres=1.1.4=py_0 + - numexpr=2.8.4=py310h2cd9be0_1 + - numpy=1.24.3=py310h055cbcc_1 + - numpy-base=1.24.3=py310h65a83cf_1 + - oauthlib=3.2.2=py310haa95532_0 + - openssl=1.1.1t=h2bbff1b_0 + - packaging=23.0=py310haa95532_0 + - pandas=1.5.3=py310h4ed8f06_0 + - pcre=8.45=hd77b12b_0 + - pillow=9.4.0=py310hd77b12b_0 + - pip=23.0.1=py310haa95532_0 + - ply=3.11=py310haa95532_0 + - protobuf=3.20.3=py310hd77b12b_0 + - pyasn1=0.4.8=pyhd3eb1b0_0 + - pyasn1-modules=0.2.8=py_0 + - pycparser=2.21=pyhd3eb1b0_0 + - pyjwt=2.4.0=py310haa95532_0 + - pyopenssl=23.0.0=py310haa95532_0 + - pyparsing=3.0.9=py310haa95532_0 + - pyqt=5.15.7=py310hd77b12b_0 + - pyqt5-sip=12.11.0=py310hd77b12b_0 + - pysocks=1.7.1=py310haa95532_0 + - python=3.10.11=h966fe2a_2 + - python-dateutil=2.8.2=pyhd3eb1b0_0 + - pytorch-mutex=1.0=cpu + - pytz=2022.7=py310haa95532_0 + - pyyaml=6.0=py310h2bbff1b_1 + - qt-main=5.15.2=he8e5bd7_8 + - qt-webengine=5.15.9=hb9a9bb5_5 + - qtwebkit=5.212=h2bbfb41_5 + - re2=2022.04.01=hd77b12b_0 + - requests=2.29.0=py310haa95532_0 + - requests-oauthlib=1.3.0=py_0 + - rsa=4.7.2=pyhd3eb1b0_1 + - setuptools=67.8.0=py310haa95532_0 + - sip=6.6.2=py310hd77b12b_0 + - six=1.16.0=pyhd3eb1b0_1 + - sqlite=3.41.2=h2bbff1b_0 + - tbb=2021.8.0=h59b6b97_0 + - tensorboard=2.10.0=py310haa95532_0 + - tensorboard-data-server=0.6.1=py310haa95532_0 + - tensorboard-plugin-wit=1.8.1=py310haa95532_0 + - tk=8.6.12=h2bbff1b_0 + - toml=0.10.2=pyhd3eb1b0_0 + - tornado=6.2=py310h2bbff1b_0 + - tqdm=4.65.0=py310h9909e9c_0 + - typing_extensions=4.5.0=py310haa95532_0 + - tzdata=2023c=h04d1e81_0 + - urllib3=1.26.16=py310haa95532_0 + - vc=14.2=h21ff451_1 + - vs2015_runtime=14.27.29016=h5e58377_2 + - werkzeug=2.2.3=py310haa95532_0 + - wheel=0.38.4=py310haa95532_0 + - win_inet_pton=1.1.0=py310haa95532_0 + - xz=5.4.2=h8cc25b3_0 + - yaml=0.2.5=he774522_0 + - yarl=1.8.1=py310h2bbff1b_0 + - zlib=1.2.13=h8cc25b3_0 + - zstd=1.5.5=hd43e919_0 + - pip: + - antlr4-python3-runtime==4.8 + - appdirs==1.4.4 + - audioread==3.0.0 + - bitarray==2.7.4 + - cython==0.29.35 + - decorator==5.1.1 + - fairseq==0.12.2 + - faiss-cpu==1.7.4 + - filelock==3.12.0 + - hydra-core==1.0.7 + - jinja2==3.1.2 + - joblib==1.2.0 + - lazy-loader==0.2 + - librosa==0.10.0.post2 + - llvmlite==0.40.0 + - lxml==4.9.2 + - mpmath==1.3.0 + - msgpack==1.0.5 + - networkx==3.1 + - noisereduce==2.0.1 + - numba==0.57.0 + - omegaconf==2.0.6 + - opencv-python==4.7.0.72 + - pooch==1.6.0 + - portalocker==2.7.0 + - pysimplegui==4.60.5 + - pywin32==306 + - pyworld==0.3.3 + - regex==2023.5.5 + - sacrebleu==2.3.1 + - scikit-learn==1.2.2 + - scipy==1.10.1 + - sounddevice==0.4.6 + - soundfile==0.12.1 + - soxr==0.3.5 + - sympy==1.12 + - tabulate==0.9.0 + - threadpoolctl==3.1.0 + - torch==2.0.0 + 
- torch-directml==0.2.0.dev230426 + - torchaudio==2.0.1 + - torchvision==0.15.1 + - wget==3.2 +prefix: D:\ProgramData\anaconda3_\envs\pydml diff --git a/extract_f0_print.py b/extract_f0_print.py index ffb02be..e175e5e 100644 --- a/extract_f0_print.py +++ b/extract_f0_print.py @@ -4,7 +4,6 @@ now_dir = os.getcwd() sys.path.append(now_dir) from my_utils import load_audio import pyworld -from scipy.io import wavfile import numpy as np, logging import torchcrepe # Fork Feature. Crepe algo for training and preprocess import torch diff --git a/extract_feature_print.py b/extract_feature_print.py index cfc6e75..09d87c0 100644 --- a/extract_feature_print.py +++ b/extract_feature_print.py @@ -1,9 +1,12 @@ import os, sys, traceback +os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" +os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0" + # device=sys.argv[1] n_part = int(sys.argv[2]) i_part = int(sys.argv[3]) -if len(sys.argv) == 5: +if len(sys.argv) == 6: exp_dir = sys.argv[4] version = sys.argv[5] else: @@ -17,14 +20,11 @@ import soundfile as sf import numpy as np from fairseq import checkpoint_utils -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - +device = "cpu" if torch.cuda.is_available(): device = "cuda" elif torch.backends.mps.is_available(): device = "mps" -else: - device = "cpu" f = open("%s/extract_f0_feature.log" % exp_dir, "a+") diff --git a/extract_locale.py b/extract_locale.py index c42bda5..0f0ff82 100644 --- a/extract_locale.py +++ b/extract_locale.py @@ -22,8 +22,11 @@ def process(fn: str): print("processing infer-web.py") process("infer-web.py") -print("processing gui.py") -process("gui.py") +print("processing gui_v0.py") +process("gui_v0.py") + +print("processing gui_v1.py") +process("gui_v1.py") # Save as a JSON file with open("./i18n/zh_CN.json", "w", encoding="utf-8") as f: diff --git a/go-realtime-gui.bat b/go-realtime-gui.bat index ed07321..8c08290 100644 --- a/go-realtime-gui.bat +++ b/go-realtime-gui.bat @@ -1,2 +1,2 @@ -runtime\python.exe gui.py +runtime\python.exe gui_v1.py pause diff --git a/gui.py b/gui_v0.py similarity index 97% rename from gui.py rename to gui_v0.py index fbe59b5..2bd2e75 100644 --- a/gui.py +++ b/gui_v0.py @@ -1,15 +1,3 @@ -""" -0416后的更新: - 引入config中half - 重建npy而不用填写 - v2支持 - 无f0模型支持 - 修复 - - int16: - 增加无索引支持 - f0算法改harvest(怎么看就只有这个会影响CPU占用),但是不这么改效果不好 -""" import os, sys, traceback, re import json @@ -31,7 +19,7 @@ import scipy.signal as signal import torchcrepe # import matplotlib.pyplot as plt -from infer_pack.models import ( +from lib.infer_pack.models import ( SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono, SynthesizerTrnMs768NSFsid, @@ -302,7 +290,12 @@ class GUI: self.launcher() def load(self): - input_devices, output_devices, _, _ = self.get_devices() + ( + input_devices, + output_devices, + input_devices_indices, + output_devices_indices, + ) = self.get_devices() try: with open("values1.json", "r") as j: data = json.load(j) @@ -310,10 +303,14 @@ class GUI: # Injecting f0_method into the json data with open("values1.json", "w") as j: data = { - "pth_path": " ", - "index_path": " ", - "sg_input_device": input_devices[sd.default.device[0]], - "sg_output_device": output_devices[sd.default.device[1]], + "pth_path": "", + "index_path": "", + "sg_input_device": input_devices[ + input_devices_indices.index(sd.default.device[0]) + ], + "sg_output_device": output_devices[ + output_devices_indices.index(sd.default.device[1]) + ], "threhold": "-45", "pitch": "0", "index_rate": "0", @@ -349,7 +346,7 @@ class GUI: 
sg.FileBrowse( i18n("Hubert模型"), initial_folder=os.path.join(os.getcwd()), - file_types=((". pt"),), + file_types=(("pt files", "*.pt"),), ), ], [ @@ -360,7 +357,7 @@ class GUI: sg.FileBrowse( i18n("选择.pth文件"), initial_folder=os.path.join(os.getcwd(), "weights"), - file_types=((". pth"),), + file_types=(("weight files", "*.pth"),), ), ], [ @@ -371,7 +368,7 @@ class GUI: sg.FileBrowse( i18n("选择.index文件"), initial_folder=os.path.join(os.getcwd(), "logs"), - file_types=((". index"),), + file_types=(("index files", "*.index"),), ), ], [ @@ -383,7 +380,7 @@ class GUI: sg.FileBrowse( i18n("选择.npy文件"), initial_folder=os.path.join(os.getcwd(), "logs"), - file_types=((". npy"),), + file_types=(("feature files", "*.npy"),), ), ], ], @@ -639,6 +636,7 @@ class GUI: 接受音频输入 """ with sd.Stream( + channels=2, callback=self.audio_callback, blocksize=self.block_frame, samplerate=self.config.samplerate, diff --git a/gui_v1.py b/gui_v1.py new file mode 100644 index 0000000..07ff3c9 --- /dev/null +++ b/gui_v1.py @@ -0,0 +1,637 @@ +import os, sys + +if sys.platform == "darwin": + os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" + +now_dir = os.getcwd() +sys.path.append(now_dir) +import multiprocessing + + +class Harvest(multiprocessing.Process): + def __init__(self, inp_q, opt_q): + multiprocessing.Process.__init__(self) + self.inp_q = inp_q + self.opt_q = opt_q + + def run(self): + import numpy as np, pyworld + + while 1: + idx, x, res_f0, n_cpu, ts = self.inp_q.get() + f0, t = pyworld.harvest( + x.astype(np.double), + fs=16000, + f0_ceil=1100, + f0_floor=50, + frame_period=10, + ) + res_f0[idx] = f0 + if len(res_f0.keys()) >= n_cpu: + self.opt_q.put(ts) + + +if __name__ == "__main__": + from multiprocessing import Queue + from queue import Empty + import numpy as np + import multiprocessing + import traceback, re + import json + import PySimpleGUI as sg + import sounddevice as sd + import noisereduce as nr + from multiprocessing import cpu_count + import librosa, torch, time, threading + import torch.nn.functional as F + import torchaudio.transforms as tat + from i18n import I18nAuto + + i18n = I18nAuto() + device = torch.device( + "cuda" + if torch.cuda.is_available() + else ("mps" if torch.backends.mps.is_available() else "cpu") + ) + current_dir = os.getcwd() + inp_q = Queue() + opt_q = Queue() + n_cpu = min(cpu_count(), 8) + for _ in range(n_cpu): + Harvest(inp_q, opt_q).start() + from rvc_for_realtime import RVC + + class GUIConfig: + def __init__(self) -> None: + self.pth_path: str = "" + self.index_path: str = "" + self.pitch: int = 12 + self.samplerate: int = 40000 + self.block_time: float = 1.0 # s + self.buffer_num: int = 1 + self.threhold: int = -30 + self.crossfade_time: float = 0.08 + self.extra_time: float = 0.04 + self.I_noise_reduce = False + self.O_noise_reduce = False + self.index_rate = 0.3 + self.n_cpu = min(n_cpu, 8) + self.f0method = "harvest" + + class GUI: + def __init__(self) -> None: + self.config = GUIConfig() + self.flag_vc = False + + self.launcher() + + def load(self): + input_devices, output_devices, _, _ = self.get_devices() + try: + with open("values1.json", "r") as j: + data = json.load(j) + data["pm"] = data["f0method"] == "pm" + data["harvest"] = data["f0method"] == "harvest" + data["crepe"] = data["f0method"] == "crepe" + data["rmvpe"] = data["f0method"] == "rmvpe" + except: + with open("values1.json", "w") as j: + data = { + "pth_path": " ", + "index_path": " ", + "sg_input_device": input_devices[sd.default.device[0]], + "sg_output_device": 
output_devices[sd.default.device[1]], + "threhold": "-45", + "pitch": "0", + "index_rate": "0", + "block_time": "1", + "crossfade_length": "0.04", + "extra_time": "1", + "f0method": "rmvpe", + } + return data + + def launcher(self): + data = self.load() + sg.theme("LightBlue3") + input_devices, output_devices, _, _ = self.get_devices() + layout = [ + [ + sg.Frame( + title=i18n("加载模型"), + layout=[ + [ + sg.Input( + default_text=data.get("pth_path", ""), + key="pth_path", + ), + sg.FileBrowse( + i18n("选择.pth文件"), + initial_folder=os.path.join(os.getcwd(), "weights"), + file_types=((". pth"),), + ), + ], + [ + sg.Input( + default_text=data.get("index_path", ""), + key="index_path", + ), + sg.FileBrowse( + i18n("选择.index文件"), + initial_folder=os.path.join(os.getcwd(), "logs"), + file_types=((". index"),), + ), + ], + ], + ) + ], + [ + sg.Frame( + layout=[ + [ + sg.Text(i18n("输入设备")), + sg.Combo( + input_devices, + key="sg_input_device", + default_value=data.get("sg_input_device", ""), + ), + ], + [ + sg.Text(i18n("输出设备")), + sg.Combo( + output_devices, + key="sg_output_device", + default_value=data.get("sg_output_device", ""), + ), + ], + ], + title=i18n("音频设备(请使用同种类驱动)"), + ) + ], + [ + sg.Frame( + layout=[ + [ + sg.Text(i18n("响应阈值")), + sg.Slider( + range=(-60, 0), + key="threhold", + resolution=1, + orientation="h", + default_value=data.get("threhold", ""), + ), + ], + [ + sg.Text(i18n("音调设置")), + sg.Slider( + range=(-24, 24), + key="pitch", + resolution=1, + orientation="h", + default_value=data.get("pitch", ""), + ), + ], + [ + sg.Text(i18n("Index Rate")), + sg.Slider( + range=(0.0, 1.0), + key="index_rate", + resolution=0.01, + orientation="h", + default_value=data.get("index_rate", ""), + ), + ], + [ + sg.Text(i18n("音高算法")), + sg.Radio( + "pm", + "f0method", + key="pm", + default=data.get("pm", "") == True, + ), + sg.Radio( + "harvest", + "f0method", + key="harvest", + default=data.get("harvest", "") == True, + ), + sg.Radio( + "crepe", + "f0method", + key="crepe", + default=data.get("crepe", "") == True, + ), + sg.Radio( + "rmvpe", + "f0method", + key="rmvpe", + default=data.get("rmvpe", "") == True, + ), + ], + ], + title=i18n("常规设置"), + ), + sg.Frame( + layout=[ + [ + sg.Text(i18n("采样长度")), + sg.Slider( + range=(0.12, 2.4), + key="block_time", + resolution=0.03, + orientation="h", + default_value=data.get("block_time", ""), + ), + ], + [ + sg.Text(i18n("harvest进程数")), + sg.Slider( + range=(1, n_cpu), + key="n_cpu", + resolution=1, + orientation="h", + default_value=data.get( + "n_cpu", min(self.config.n_cpu, n_cpu) + ), + ), + ], + [ + sg.Text(i18n("淡入淡出长度")), + sg.Slider( + range=(0.01, 0.15), + key="crossfade_length", + resolution=0.01, + orientation="h", + default_value=data.get("crossfade_length", ""), + ), + ], + [ + sg.Text(i18n("额外推理时长")), + sg.Slider( + range=(0.05, 3.00), + key="extra_time", + resolution=0.01, + orientation="h", + default_value=data.get("extra_time", ""), + ), + ], + [ + sg.Checkbox(i18n("输入降噪"), key="I_noise_reduce"), + sg.Checkbox(i18n("输出降噪"), key="O_noise_reduce"), + ], + ], + title=i18n("性能设置"), + ), + ], + [ + sg.Button(i18n("开始音频转换"), key="start_vc"), + sg.Button(i18n("停止音频转换"), key="stop_vc"), + sg.Text(i18n("推理时间(ms):")), + sg.Text("0", key="infer_time"), + ], + ] + self.window = sg.Window("RVC - GUI", layout=layout) + self.event_handler() + + def event_handler(self): + while True: + event, values = self.window.read() + if event == sg.WINDOW_CLOSED: + self.flag_vc = False + exit() + if event == "start_vc" and self.flag_vc == False: + if 
self.set_values(values) == True: + print("using_cuda:" + str(torch.cuda.is_available())) + self.start_vc() + settings = { + "pth_path": values["pth_path"], + "index_path": values["index_path"], + "sg_input_device": values["sg_input_device"], + "sg_output_device": values["sg_output_device"], + "threhold": values["threhold"], + "pitch": values["pitch"], + "index_rate": values["index_rate"], + "block_time": values["block_time"], + "crossfade_length": values["crossfade_length"], + "extra_time": values["extra_time"], + "n_cpu": values["n_cpu"], + "f0method": ["pm", "harvest", "crepe", "rmvpe"][ + [ + values["pm"], + values["harvest"], + values["crepe"], + values["rmvpe"], + ].index(True) + ], + } + with open("values1.json", "w") as j: + json.dump(settings, j) + if event == "stop_vc" and self.flag_vc == True: + self.flag_vc = False + + def set_values(self, values): + if len(values["pth_path"].strip()) == 0: + sg.popup(i18n("请选择pth文件")) + return False + if len(values["index_path"].strip()) == 0: + sg.popup(i18n("请选择index文件")) + return False + pattern = re.compile("[^\x00-\x7F]+") + if pattern.findall(values["pth_path"]): + sg.popup(i18n("pth文件路径不可包含中文")) + return False + if pattern.findall(values["index_path"]): + sg.popup(i18n("index文件路径不可包含中文")) + return False + self.set_devices(values["sg_input_device"], values["sg_output_device"]) + self.config.pth_path = values["pth_path"] + self.config.index_path = values["index_path"] + self.config.threhold = values["threhold"] + self.config.pitch = values["pitch"] + self.config.block_time = values["block_time"] + self.config.crossfade_time = values["crossfade_length"] + self.config.extra_time = values["extra_time"] + self.config.I_noise_reduce = values["I_noise_reduce"] + self.config.O_noise_reduce = values["O_noise_reduce"] + self.config.index_rate = values["index_rate"] + self.config.n_cpu = values["n_cpu"] + self.config.f0method = ["pm", "harvest", "crepe", "rmvpe"][ + [ + values["pm"], + values["harvest"], + values["crepe"], + values["rmvpe"], + ].index(True) + ] + return True + + def start_vc(self): + torch.cuda.empty_cache() + self.flag_vc = True + self.rvc = RVC( + self.config.pitch, + self.config.pth_path, + self.config.index_path, + self.config.index_rate, + self.config.n_cpu, + inp_q, + opt_q, + device, + ) + self.config.samplerate = self.rvc.tgt_sr + self.config.crossfade_time = min( + self.config.crossfade_time, self.config.block_time + ) + self.block_frame = int(self.config.block_time * self.config.samplerate) + self.crossfade_frame = int( + self.config.crossfade_time * self.config.samplerate + ) + self.sola_search_frame = int(0.01 * self.config.samplerate) + self.extra_frame = int(self.config.extra_time * self.config.samplerate) + self.zc = self.rvc.tgt_sr // 100 + self.input_wav: np.ndarray = np.zeros( + int( + np.ceil( + ( + self.extra_frame + + self.crossfade_frame + + self.sola_search_frame + + self.block_frame + ) + / self.zc + ) + * self.zc + ), + dtype="float32", + ) + self.output_wav_cache: torch.Tensor = torch.zeros( + int( + np.ceil( + ( + self.extra_frame + + self.crossfade_frame + + self.sola_search_frame + + self.block_frame + ) + / self.zc + ) + * self.zc + ), + device=device, + dtype=torch.float32, + ) + self.pitch: np.ndarray = np.zeros( + self.input_wav.shape[0] // self.zc, + dtype="int32", + ) + self.pitchf: np.ndarray = np.zeros( + self.input_wav.shape[0] // self.zc, + dtype="float64", + ) + self.output_wav: torch.Tensor = torch.zeros( + self.block_frame, device=device, dtype=torch.float32 + ) + self.sola_buffer: 
torch.Tensor = torch.zeros( + self.crossfade_frame, device=device, dtype=torch.float32 + ) + self.fade_in_window: torch.Tensor = torch.linspace( + 0.0, 1.0, steps=self.crossfade_frame, device=device, dtype=torch.float32 + ) + self.fade_out_window: torch.Tensor = 1 - self.fade_in_window + self.resampler = tat.Resample( + orig_freq=self.config.samplerate, new_freq=16000, dtype=torch.float32 + ).to(device) + thread_vc = threading.Thread(target=self.soundinput) + thread_vc.start() + + def soundinput(self): + """ + 接受音频输入 + """ + channels = 1 if sys.platform == "darwin" else 2 + with sd.Stream( + channels=channels, + callback=self.audio_callback, + blocksize=self.block_frame, + samplerate=self.config.samplerate, + dtype="float32", + ): + while self.flag_vc: + time.sleep(self.config.block_time) + print("Audio block passed.") + print("ENDing VC") + + def audio_callback( + self, indata: np.ndarray, outdata: np.ndarray, frames, times, status + ): + """ + 音频处理 + """ + start_time = time.perf_counter() + indata = librosa.to_mono(indata.T) + if self.config.I_noise_reduce: + indata[:] = nr.reduce_noise(y=indata, sr=self.config.samplerate) + """noise gate""" + frame_length = 2048 + hop_length = 1024 + rms = librosa.feature.rms( + y=indata, frame_length=frame_length, hop_length=hop_length + ) + if self.config.threhold > -60: + db_threhold = ( + librosa.amplitude_to_db(rms, ref=1.0)[0] < self.config.threhold + ) + for i in range(db_threhold.shape[0]): + if db_threhold[i]: + indata[i * hop_length : (i + 1) * hop_length] = 0 + self.input_wav[:] = np.append(self.input_wav[self.block_frame :], indata) + # infer + inp = torch.from_numpy(self.input_wav).to(device) + ##0 + res1 = self.resampler(inp) + ###55% + rate1 = self.block_frame / ( + self.extra_frame + + self.crossfade_frame + + self.sola_search_frame + + self.block_frame + ) + rate2 = ( + self.crossfade_frame + self.sola_search_frame + self.block_frame + ) / ( + self.extra_frame + + self.crossfade_frame + + self.sola_search_frame + + self.block_frame + ) + res2 = self.rvc.infer( + res1, + res1[-self.block_frame :].cpu().numpy(), + rate1, + rate2, + self.pitch, + self.pitchf, + self.config.f0method, + ) + self.output_wav_cache[-res2.shape[0] :] = res2 + infer_wav = self.output_wav_cache[ + -self.crossfade_frame - self.sola_search_frame - self.block_frame : + ] + # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC + cor_nom = F.conv1d( + infer_wav[None, None, : self.crossfade_frame + self.sola_search_frame], + self.sola_buffer[None, None, :], + ) + cor_den = torch.sqrt( + F.conv1d( + infer_wav[ + None, None, : self.crossfade_frame + self.sola_search_frame + ] + ** 2, + torch.ones(1, 1, self.crossfade_frame, device=device), + ) + + 1e-8 + ) + if sys.platform == "darwin": + cor_nom = cor_nom.cpu() + cor_den = cor_den.cpu() + sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0]) + print("sola offset: " + str(int(sola_offset))) + self.output_wav[:] = infer_wav[sola_offset : sola_offset + self.block_frame] + self.output_wav[: self.crossfade_frame] *= self.fade_in_window + self.output_wav[: self.crossfade_frame] += self.sola_buffer[:] + # crossfade + if sola_offset < self.sola_search_frame: + self.sola_buffer[:] = ( + infer_wav[ + -self.sola_search_frame + - self.crossfade_frame + + sola_offset : -self.sola_search_frame + + sola_offset + ] + * self.fade_out_window + ) + else: + self.sola_buffer[:] = ( + infer_wav[-self.crossfade_frame :] * self.fade_out_window + ) + if self.config.O_noise_reduce: + if sys.platform == "darwin": + noise_reduced_signal = 
nr.reduce_noise( + y=self.output_wav[:].cpu().numpy(), sr=self.config.samplerate + ) + outdata[:] = noise_reduced_signal[:, np.newaxis] + else: + outdata[:] = np.tile( + nr.reduce_noise( + y=self.output_wav[:].cpu().numpy(), + sr=self.config.samplerate, + ), + (2, 1), + ).T + else: + if sys.platform == "darwin": + outdata[:] = self.output_wav[:].cpu().numpy()[:, np.newaxis] + else: + outdata[:] = self.output_wav[:].repeat(2, 1).t().cpu().numpy() + total_time = time.perf_counter() - start_time + self.window["infer_time"].update(int(total_time * 1000)) + print("infer time:" + str(total_time)) + + def get_devices(self, update: bool = True): + """获取设备列表""" + if update: + sd._terminate() + sd._initialize() + devices = sd.query_devices() + hostapis = sd.query_hostapis() + for hostapi in hostapis: + for device_idx in hostapi["devices"]: + devices[device_idx]["hostapi_name"] = hostapi["name"] + input_devices = [ + f"{d['name']} ({d['hostapi_name']})" + for d in devices + if d["max_input_channels"] > 0 + ] + output_devices = [ + f"{d['name']} ({d['hostapi_name']})" + for d in devices + if d["max_output_channels"] > 0 + ] + input_devices_indices = [ + d["index"] if "index" in d else d["name"] + for d in devices + if d["max_input_channels"] > 0 + ] + output_devices_indices = [ + d["index"] if "index" in d else d["name"] + for d in devices + if d["max_output_channels"] > 0 + ] + return ( + input_devices, + output_devices, + input_devices_indices, + output_devices_indices, + ) + + def set_devices(self, input_device, output_device): + """设置输出设备""" + ( + input_devices, + output_devices, + input_device_indices, + output_device_indices, + ) = self.get_devices() + sd.default.device[0] = input_device_indices[ + input_devices.index(input_device) + ] + sd.default.device[1] = output_device_indices[ + output_devices.index(output_device) + ] + print("input device:" + str(sd.default.device[0]) + ":" + str(input_device)) + print( + "output device:" + str(sd.default.device[1]) + ":" + str(output_device) + ) + + gui = GUI() diff --git a/guidml.py b/guidml.py new file mode 100644 index 0000000..aadf22d --- /dev/null +++ b/guidml.py @@ -0,0 +1,710 @@ +""" +0416后的更新: + 引入config中half + 重建npy而不用填写 + v2支持 + 无f0模型支持 + 修复 + + int16: + 增加无索引支持 + f0算法改harvest(怎么看就只有这个会影响CPU占用),但是不这么改效果不好 +""" +import os, sys, traceback, re + +import json + +now_dir = os.getcwd() +sys.path.append(now_dir) +from config import Config + +Config = Config() + +import torch_directml +import PySimpleGUI as sg +import sounddevice as sd +import noisereduce as nr +import numpy as np +from fairseq import checkpoint_utils +import librosa, torch, pyworld, faiss, time, threading +import torch.nn.functional as F +import torchaudio.transforms as tat +import scipy.signal as signal + + +# import matplotlib.pyplot as plt +from lib.infer_pack.models import ( + SynthesizerTrnMs256NSFsid, + SynthesizerTrnMs256NSFsid_nono, + SynthesizerTrnMs768NSFsid, + SynthesizerTrnMs768NSFsid_nono, +) +from i18n import I18nAuto + +i18n = I18nAuto() +device = torch_directml.device(torch_directml.default_device()) +current_dir = os.getcwd() + + +class RVC: + def __init__( + self, key, hubert_path, pth_path, index_path, npy_path, index_rate + ) -> None: + """ + 初始化 + """ + try: + self.f0_up_key = key + self.time_step = 160 / 16000 * 1000 + self.f0_min = 50 + self.f0_max = 1100 + self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) + self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) + self.sr = 16000 + self.window = 160 + if index_rate != 0: + self.index = 
faiss.read_index(index_path) + # self.big_npy = np.load(npy_path) + self.big_npy = self.index.reconstruct_n(0, self.index.ntotal) + print("index search enabled") + self.index_rate = index_rate + model_path = hubert_path + print("load model(s) from {}".format(model_path)) + models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( + [model_path], + suffix="", + ) + self.model = models[0] + self.model = self.model.to(device) + if Config.is_half: + self.model = self.model.half() + else: + self.model = self.model.float() + self.model.eval() + cpt = torch.load(pth_path, map_location="cpu") + self.tgt_sr = cpt["config"][-1] + cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk + self.if_f0 = cpt.get("f0", 1) + self.version = cpt.get("version", "v1") + if self.version == "v1": + if self.if_f0 == 1: + self.net_g = SynthesizerTrnMs256NSFsid( + *cpt["config"], is_half=Config.is_half + ) + else: + self.net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) + elif self.version == "v2": + if self.if_f0 == 1: + self.net_g = SynthesizerTrnMs768NSFsid( + *cpt["config"], is_half=Config.is_half + ) + else: + self.net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) + del self.net_g.enc_q + print(self.net_g.load_state_dict(cpt["weight"], strict=False)) + self.net_g.eval().to(device) + if Config.is_half: + self.net_g = self.net_g.half() + else: + self.net_g = self.net_g.float() + except: + print(traceback.format_exc()) + + def get_f0(self, x, f0_up_key, inp_f0=None): + x_pad = 1 + f0_min = 50 + f0_max = 1100 + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + f0, t = pyworld.harvest( + x.astype(np.double), + fs=self.sr, + f0_ceil=f0_max, + f0_floor=f0_min, + frame_period=10, + ) + f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr) + f0 = signal.medfilt(f0, 3) + f0 *= pow(2, f0_up_key / 12) + # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) + tf0 = self.sr // self.window # 每秒f0点数 + if inp_f0 is not None: + delta_t = np.round( + (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1 + ).astype("int16") + replace_f0 = np.interp( + list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1] + ) + shape = f0[x_pad * tf0 : x_pad * tf0 + len(replace_f0)].shape[0] + f0[x_pad * tf0 : x_pad * tf0 + len(replace_f0)] = replace_f0[:shape] + # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) + f0bak = f0.copy() + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( + f0_mel_max - f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + f0_coarse = np.rint(f0_mel).astype(np.int) + return f0_coarse, f0bak # 1-0 + + def infer(self, feats: torch.Tensor) -> np.ndarray: + """ + 推理函数 + """ + audio = feats.clone().cpu().numpy() + assert feats.dim() == 1, feats.dim() + feats = feats.view(1, -1) + padding_mask = torch.BoolTensor(feats.shape).fill_(False) + if Config.is_half: + feats = feats.half() + else: + feats = feats.float() + inputs = { + "source": feats.to(device), + "padding_mask": padding_mask.to(device), + "output_layer": 9 if self.version == "v1" else 12, + } + torch.cuda.synchronize() + with torch.no_grad(): + logits = self.model.extract_features(**inputs) + feats = ( + self.model.final_proj(logits[0]) if self.version == "v1" else logits[0] + ) + + ####索引优化 + try: + if ( + hasattr(self, "index") + and hasattr(self, "big_npy") + and self.index_rate != 0 + ): + npy = feats[0].cpu().numpy().astype("float32") + 
score, ix = self.index.search(npy, k=8) + weight = np.square(1 / score) + weight /= weight.sum(axis=1, keepdims=True) + npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) + if Config.is_half: + npy = npy.astype("float16") + feats = ( + torch.from_numpy(npy).unsqueeze(0).to(device) * self.index_rate + + (1 - self.index_rate) * feats + ) + else: + print("index search FAIL or disabled") + except: + traceback.print_exc() + print("index search FAIL") + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) + torch.cuda.synchronize() + print(feats.shape) + if self.if_f0 == 1: + pitch, pitchf = self.get_f0(audio, self.f0_up_key) + p_len = min(feats.shape[1], 13000, pitch.shape[0]) # 太大了爆显存 + else: + pitch, pitchf = None, None + p_len = min(feats.shape[1], 13000) # 太大了爆显存 + torch.cuda.synchronize() + # print(feats.shape,pitch.shape) + feats = feats[:, :p_len, :] + if self.if_f0 == 1: + pitch = pitch[:p_len] + pitchf = pitchf[:p_len] + pitch = torch.LongTensor(pitch).unsqueeze(0).to(device) + pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device) + p_len = torch.LongTensor([p_len]).to(device) + ii = 0 # sid + sid = torch.LongTensor([ii]).to(device) + with torch.no_grad(): + if self.if_f0 == 1: + infered_audio = ( + self.net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] + .data.cpu() + .float() + ) + else: + infered_audio = ( + self.net_g.infer(feats, p_len, sid)[0][0, 0].data.cpu().float() + ) + torch.cuda.synchronize() + return infered_audio + + +class GUIConfig: + def __init__(self) -> None: + self.hubert_path: str = "" + self.pth_path: str = "" + self.index_path: str = "" + self.npy_path: str = "" + self.pitch: int = 12 + self.samplerate: int = 44100 + self.block_time: float = 1.0 # s + self.buffer_num: int = 1 + self.threhold: int = -30 + self.crossfade_time: float = 0.08 + self.extra_time: float = 0.04 + self.I_noise_reduce = False + self.O_noise_reduce = False + self.index_rate = 0.3 + + +class GUI: + def __init__(self) -> None: + self.config = GUIConfig() + self.flag_vc = False + + self.launcher() + + def load(self): + ( + input_devices, + output_devices, + input_devices_indices, + output_devices_indices, + ) = self.get_devices() + try: + with open("values1.json", "r") as j: + data = json.load(j) + except: + with open("values1.json", "w") as j: + data = { + "pth_path": "", + "index_path": "", + "sg_input_device": input_devices[ + input_devices_indices.index(sd.default.device[0]) + ], + "sg_output_device": output_devices[ + output_devices_indices.index(sd.default.device[1]) + ], + "threhold": "-45", + "pitch": "0", + "index_rate": "0", + "block_time": "1", + "crossfade_length": "0.04", + "extra_time": "1", + } + return data + + def launcher(self): + data = self.load() + sg.theme("LightBlue3") + input_devices, output_devices, _, _ = self.get_devices() + layout = [ + [ + sg.Frame( + title=i18n("加载模型"), + layout=[ + [ + sg.Input( + default_text="hubert_base.pt", + key="hubert_path", + disabled=True, + ), + sg.FileBrowse( + i18n("Hubert模型"), + initial_folder=os.path.join(os.getcwd()), + file_types=(("pt files", "*.pt"),), + ), + ], + [ + sg.Input( + default_text=data.get("pth_path", ""), + key="pth_path", + ), + sg.FileBrowse( + i18n("选择.pth文件"), + initial_folder=os.path.join(os.getcwd(), "weights"), + file_types=(("weight files", "*.pth"),), + ), + ], + [ + sg.Input( + default_text=data.get("index_path", ""), + key="index_path", + ), + sg.FileBrowse( + i18n("选择.index文件"), + initial_folder=os.path.join(os.getcwd(), "logs"), + 
file_types=(("index files", "*.index"),), + ), + ], + [ + sg.Input( + default_text="你不需要填写这个You don't need write this.", + key="npy_path", + disabled=True, + ), + sg.FileBrowse( + i18n("选择.npy文件"), + initial_folder=os.path.join(os.getcwd(), "logs"), + file_types=(("feature files", "*.npy"),), + ), + ], + ], + ) + ], + [ + sg.Frame( + layout=[ + [ + sg.Text(i18n("输入设备")), + sg.Combo( + input_devices, + key="sg_input_device", + default_value=data.get("sg_input_device", ""), + ), + ], + [ + sg.Text(i18n("输出设备")), + sg.Combo( + output_devices, + key="sg_output_device", + default_value=data.get("sg_output_device", ""), + ), + ], + ], + title=i18n("音频设备(请使用同种类驱动)"), + ) + ], + [ + sg.Frame( + layout=[ + [ + sg.Text(i18n("响应阈值")), + sg.Slider( + range=(-60, 0), + key="threhold", + resolution=1, + orientation="h", + default_value=data.get("threhold", ""), + ), + ], + [ + sg.Text(i18n("音调设置")), + sg.Slider( + range=(-24, 24), + key="pitch", + resolution=1, + orientation="h", + default_value=data.get("pitch", ""), + ), + ], + [ + sg.Text(i18n("Index Rate")), + sg.Slider( + range=(0.0, 1.0), + key="index_rate", + resolution=0.01, + orientation="h", + default_value=data.get("index_rate", ""), + ), + ], + ], + title=i18n("常规设置"), + ), + sg.Frame( + layout=[ + [ + sg.Text(i18n("采样长度")), + sg.Slider( + range=(0.1, 3.0), + key="block_time", + resolution=0.1, + orientation="h", + default_value=data.get("block_time", ""), + ), + ], + [ + sg.Text(i18n("淡入淡出长度")), + sg.Slider( + range=(0.01, 0.15), + key="crossfade_length", + resolution=0.01, + orientation="h", + default_value=data.get("crossfade_length", ""), + ), + ], + [ + sg.Text(i18n("额外推理时长")), + sg.Slider( + range=(0.05, 3.00), + key="extra_time", + resolution=0.01, + orientation="h", + default_value=data.get("extra_time", ""), + ), + ], + [ + sg.Checkbox(i18n("输入降噪"), key="I_noise_reduce"), + sg.Checkbox(i18n("输出降噪"), key="O_noise_reduce"), + ], + ], + title=i18n("性能设置"), + ), + ], + [ + sg.Button(i18n("开始音频转换"), key="start_vc"), + sg.Button(i18n("停止音频转换"), key="stop_vc"), + sg.Text(i18n("推理时间(ms):")), + sg.Text("0", key="infer_time"), + ], + ] + self.window = sg.Window("RVC - GUI", layout=layout) + self.event_handler() + + def event_handler(self): + while True: + event, values = self.window.read() + if event == sg.WINDOW_CLOSED: + self.flag_vc = False + exit() + if event == "start_vc" and self.flag_vc == False: + if self.set_values(values) == True: + print("using_cuda:" + str(torch.cuda.is_available())) + self.start_vc() + settings = { + "pth_path": values["pth_path"], + "index_path": values["index_path"], + "sg_input_device": values["sg_input_device"], + "sg_output_device": values["sg_output_device"], + "threhold": values["threhold"], + "pitch": values["pitch"], + "index_rate": values["index_rate"], + "block_time": values["block_time"], + "crossfade_length": values["crossfade_length"], + "extra_time": values["extra_time"], + } + with open("values1.json", "w") as j: + json.dump(settings, j) + if event == "stop_vc" and self.flag_vc == True: + self.flag_vc = False + + def set_values(self, values): + if len(values["pth_path"].strip()) == 0: + sg.popup(i18n("请选择pth文件")) + return False + if len(values["index_path"].strip()) == 0: + sg.popup(i18n("请选择index文件")) + return False + pattern = re.compile("[^\x00-\x7F]+") + if pattern.findall(values["hubert_path"]): + sg.popup(i18n("hubert模型路径不可包含中文")) + return False + if pattern.findall(values["pth_path"]): + sg.popup(i18n("pth文件路径不可包含中文")) + return False + if pattern.findall(values["index_path"]): + 
sg.popup(i18n("index文件路径不可包含中文")) + return False + self.set_devices(values["sg_input_device"], values["sg_output_device"]) + self.config.hubert_path = os.path.join(current_dir, "hubert_base.pt") + self.config.pth_path = values["pth_path"] + self.config.index_path = values["index_path"] + self.config.npy_path = values["npy_path"] + self.config.threhold = values["threhold"] + self.config.pitch = values["pitch"] + self.config.block_time = values["block_time"] + self.config.crossfade_time = values["crossfade_length"] + self.config.extra_time = values["extra_time"] + self.config.I_noise_reduce = values["I_noise_reduce"] + self.config.O_noise_reduce = values["O_noise_reduce"] + self.config.index_rate = values["index_rate"] + return True + + def start_vc(self): + torch.cuda.empty_cache() + self.flag_vc = True + self.block_frame = int(self.config.block_time * self.config.samplerate) + self.crossfade_frame = int(self.config.crossfade_time * self.config.samplerate) + self.sola_search_frame = int(0.012 * self.config.samplerate) + self.delay_frame = int(0.01 * self.config.samplerate) # 往前预留0.02s + self.extra_frame = int(self.config.extra_time * self.config.samplerate) + self.rvc = None + self.rvc = RVC( + self.config.pitch, + self.config.hubert_path, + self.config.pth_path, + self.config.index_path, + self.config.npy_path, + self.config.index_rate, + ) + self.input_wav: np.ndarray = np.zeros( + self.extra_frame + + self.crossfade_frame + + self.sola_search_frame + + self.block_frame, + dtype="float32", + ) + self.output_wav: torch.Tensor = torch.zeros( + self.block_frame, device=device, dtype=torch.float32 + ) + self.sola_buffer: torch.Tensor = torch.zeros( + self.crossfade_frame, device=device, dtype=torch.float32 + ) + self.fade_in_window: torch.Tensor = torch.linspace( + 0.0, 1.0, steps=self.crossfade_frame, device=device, dtype=torch.float32 + ) + self.fade_out_window: torch.Tensor = 1 - self.fade_in_window + self.resampler1 = tat.Resample( + orig_freq=self.config.samplerate, new_freq=16000, dtype=torch.float32 + ) + self.resampler2 = tat.Resample( + orig_freq=self.rvc.tgt_sr, + new_freq=self.config.samplerate, + dtype=torch.float32, + ) + thread_vc = threading.Thread(target=self.soundinput) + thread_vc.start() + + def soundinput(self): + """ + 接受音频输入 + """ + with sd.Stream( + channels=2, + callback=self.audio_callback, + blocksize=self.block_frame, + samplerate=self.config.samplerate, + dtype="float32", + ): + while self.flag_vc: + time.sleep(self.config.block_time) + print("Audio block passed.") + print("ENDing VC") + + def audio_callback( + self, indata: np.ndarray, outdata: np.ndarray, frames, times, status + ): + """ + 音频处理 + """ + start_time = time.perf_counter() + indata = librosa.to_mono(indata.T) + if self.config.I_noise_reduce: + indata[:] = nr.reduce_noise(y=indata, sr=self.config.samplerate) + + """noise gate""" + frame_length = 2048 + hop_length = 1024 + rms = librosa.feature.rms( + y=indata, frame_length=frame_length, hop_length=hop_length + ) + db_threhold = librosa.amplitude_to_db(rms, ref=1.0)[0] < self.config.threhold + # print(rms.shape,db.shape,db) + for i in range(db_threhold.shape[0]): + if db_threhold[i]: + indata[i * hop_length : (i + 1) * hop_length] = 0 + self.input_wav[:] = np.append(self.input_wav[self.block_frame :], indata) + + # infer + print("input_wav:" + str(self.input_wav.shape)) + # print('infered_wav:'+str(infer_wav.shape)) + infer_wav: torch.Tensor = self.resampler2( + self.rvc.infer(self.resampler1(torch.from_numpy(self.input_wav))) + )[-self.crossfade_frame 
- self.sola_search_frame - self.block_frame :].to( + device + ) + print("infer_wav:" + str(infer_wav.shape)) + + # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC + cor_nom = F.conv1d( + infer_wav[None, None, : self.crossfade_frame + self.sola_search_frame], + self.sola_buffer[None, None, :], + ) + cor_den = torch.sqrt( + F.conv1d( + infer_wav[None, None, : self.crossfade_frame + self.sola_search_frame] + ** 2, + torch.ones(1, 1, self.crossfade_frame, device=device), + ) + + 1e-8 + ) + sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0]) + print("sola offset: " + str(int(sola_offset))) + + # crossfade + self.output_wav[:] = infer_wav[sola_offset : sola_offset + self.block_frame] + self.output_wav[: self.crossfade_frame] *= self.fade_in_window + self.output_wav[: self.crossfade_frame] += self.sola_buffer[:] + if sola_offset < self.sola_search_frame: + self.sola_buffer[:] = ( + infer_wav[ + -self.sola_search_frame + - self.crossfade_frame + + sola_offset : -self.sola_search_frame + + sola_offset + ] + * self.fade_out_window + ) + else: + self.sola_buffer[:] = ( + infer_wav[-self.crossfade_frame :] * self.fade_out_window + ) + + if self.config.O_noise_reduce: + outdata[:] = np.tile( + nr.reduce_noise( + y=self.output_wav[:].cpu().numpy(), sr=self.config.samplerate + ), + (2, 1), + ).T + else: + outdata[:] = self.output_wav[:].repeat(2, 1).t().cpu().numpy() + total_time = time.perf_counter() - start_time + self.window["infer_time"].update(int(total_time * 1000)) + print("infer time:" + str(total_time)) + + def get_devices(self, update: bool = True): + """获取设备列表""" + if update: + sd._terminate() + sd._initialize() + devices = sd.query_devices() + hostapis = sd.query_hostapis() + for hostapi in hostapis: + for device_idx in hostapi["devices"]: + devices[device_idx]["hostapi_name"] = hostapi["name"] + input_devices = [ + f"{d['name']} ({d['hostapi_name']})" + for d in devices + if d["max_input_channels"] > 0 + ] + output_devices = [ + f"{d['name']} ({d['hostapi_name']})" + for d in devices + if d["max_output_channels"] > 0 + ] + input_devices_indices = [ + d["index"] if "index" in d else d["name"] + for d in devices + if d["max_input_channels"] > 0 + ] + output_devices_indices = [ + d["index"] if "index" in d else d["name"] + for d in devices + if d["max_output_channels"] > 0 + ] + return ( + input_devices, + output_devices, + input_devices_indices, + output_devices_indices, + ) + + def set_devices(self, input_device, output_device): + """设置输出设备""" + ( + input_devices, + output_devices, + input_device_indices, + output_device_indices, + ) = self.get_devices() + sd.default.device[0] = input_device_indices[input_devices.index(input_device)] + sd.default.device[1] = output_device_indices[ + output_devices.index(output_device) + ] + print("input device:" + str(sd.default.device[0]) + ":" + str(input_device)) + print("output device:" + str(sd.default.device[1]) + ":" + str(output_device)) + + +gui = GUI() diff --git a/i18n/es_ES.json b/i18n/es_ES.json index 3c0b8a3..4e5d2bf 100644 --- a/i18n/es_ES.json +++ b/i18n/es_ES.json @@ -7,7 +7,7 @@ "step3a:正在训练模型": "Paso 3a: Entrenando el modelo", "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Entrenamiento finalizado, puede ver el registro de entrenamiento en la consola o en el archivo train.log en la carpeta del experimento", "全流程结束!": "¡Todo el proceso ha terminado!", - "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录使用需遵守的协议-LICENSE.txt.": "Este software es de código abierto bajo la licencia MIT, el autor no tiene ningún control sobre el software, y aquellos que usan el software y difunden los sonidos exportados por el software son los únicos responsables.
Si no está de acuerdo con esta cláusula , no puede utilizar ni citar ningún código ni archivo del paquete de software Consulte el directorio raíz Agreement-LICENSE.txt para obtener más información.", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "Este software es de código abierto bajo la licencia MIT, el autor no tiene ningún control sobre el software, y aquellos que usan el software y difunden los sonidos exportados por el software son los únicos responsables.
Si no está de acuerdo con esta cláusula, no puede utilizar ni citar ningún código ni archivo del paquete de software. Consulte el archivo LICENSE en el directorio raíz para obtener más información.", "模型推理": "inferencia del modelo", "推理音色": "inferencia de voz", "刷新音色列表和索引路径": "Actualizar la lista de timbres e índice de rutas", @@ -37,6 +37,7 @@ "也可批量输入音频文件, 二选一, 优先读文件夹": "También se pueden ingresar múltiples archivos de audio, cualquiera de las dos opciones, con prioridad dada a la carpeta", "导出文件格式": "Formato de archivo de exportación", "伴奏人声分离&去混响&去回声": "Separación de voz acompañante & eliminación de reverberación & eco", + "人声伴奏分离批量处理, 使用UVR5模型。&#10;
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Procesamiento por lotes para la separación de acompañamiento vocal utilizando el modelo UVR5.
Ejemplo de formato de ruta de carpeta válido: D:\\ruta\\a\\la\\carpeta\\de\\entrada (copiar desde la barra de direcciones del administrador de archivos).
El modelo se divide en tres categorías:
1. Preservar voces: Elija esta opción para audio sin armonías. Preserva las voces mejor que HP5. Incluye dos modelos incorporados: HP2 y HP3. HP3 puede filtrar ligeramente el acompañamiento pero conserva las voces un poco mejor que HP2.
2. Preservar solo voces principales: Elija esta opción para audio con armonías. Puede debilitar las voces principales. Incluye un modelo incorporado: HP5.
3. Modelos de des-reverberación y des-retardo (por FoxJoy):
  (1) MDX-Net: La mejor opción para la eliminación de reverberación estéreo pero no puede eliminar la reverberación mono;
 (234) DeEcho: Elimina efectos de retardo. El modo Agresivo elimina más a fondo que el modo Normal. DeReverb adicionalmente elimina la reverberación y puede eliminar la reverberación mono, pero no muy efectivamente para contenido de alta frecuencia fuertemente reverberado.
Notas de des-reverberación/des-retardo:
1. El tiempo de procesamiento para el modelo DeEcho-DeReverb es aproximadamente el doble que los otros dos modelos DeEcho.
2. El modelo MDX-Net-Dereverb es bastante lento.
3. La configuración más limpia recomendada es aplicar primero MDX-Net y luego DeEcho-Agresivo.", "输入待处理音频文件夹路径": "Ingrese la ruta a la carpeta de audio que se procesará", "模型": "Modelo", "指定输出主人声文件夹": "Especifique la carpeta de salida para la voz principal", @@ -94,7 +95,6 @@ "Onnx导出": "Exportar Onnx", "RVC模型路径": "Ruta del modelo RVC", "Onnx输出路径": "Ruta de salida Onnx", - "MoeVS模型": "Modelo MoeVS", "导出Onnx模型": "Exportar modelo Onnx", "常见问题解答": "Preguntas frecuentes", "招募音高曲线前端编辑器": "Reclutar editores front-end para curvas de tono", @@ -122,5 +122,11 @@ "开始音频转换": "Iniciar conversión de audio", "停止音频转换": "Detener la conversión de audio", "推理时间(ms):": "Inferir tiempo (ms):", - "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。":"Procesamiento por lotes para la separación de acompañamiento vocal utilizando el modelo UVR5.
Ejemplo de formato de ruta de carpeta válido: D:\\ruta\\a\\la\\carpeta\\de\\entrada (copiar desde la barra de direcciones del administrador de archivos).
El modelo se divide en tres categorías:
1. Preservar voces: Elija esta opción para audio sin armonías. Preserva las voces mejor que HP5. Incluye dos modelos incorporados: HP2 y HP3. HP3 puede filtrar ligeramente el acompañamiento pero conserva las voces un poco mejor que HP2.
2. Preservar solo voces principales: Elija esta opción para audio con armonías. Puede debilitar las voces principales. Incluye un modelo incorporado: HP5.
3. Modelos de des-reverberación y des-retardo (por FoxJoy):
  (1) MDX-Net: La mejor opción para la eliminación de reverberación estéreo pero no puede eliminar la reverberación mono;
 (234) DeEcho: Elimina efectos de retardo. El modo Agresivo elimina más a fondo que el modo Normal. DeReverb adicionalmente elimina la reverberación y puede eliminar la reverberación mono, pero no muy efectivamente para contenido de alta frecuencia fuertemente reverberado.
Notas de des-reverberación/des-retardo:
1. El tiempo de procesamiento para el modelo DeEcho-DeReverb es aproximadamente el doble que los otros dos modelos DeEcho.
2. El modelo MDX-Net-Dereverb es bastante lento.
3. La configuración más limpia recomendada es aplicar primero MDX-Net y luego DeEcho-Agresivo." + "请选择pth文件": "请选择pth文件", + "请选择index文件": "请选择index文件", + "hubert模型路径不可包含中文": "hubert模型路径不可包含中文", + "pth文件路径不可包含中文": "pth文件路径不可包含中文", + "index文件路径不可包含中文": "index文件路径不可包含中文", + "音高算法": "音高算法", + "harvest进程数": "harvest进程数" } diff --git a/i18n/it_IT.json b/i18n/it_IT.json new file mode 100644 index 0000000..b314f31 --- /dev/null +++ b/i18n/it_IT.json @@ -0,0 +1,130 @@ +{ + "很遗憾您这没有能用的显卡来支持您训练": "Sfortunatamente, non è disponibile alcuna GPU compatibile per supportare l'addestramento.", + "是": "SÌ", + "step1:正在处理数据": "Passaggio 1: elaborazione dei dati", + "step2a:无需提取音高": "Step 2a: Saltare l'estrazione del tono", + "step2b:正在提取特征": "Passaggio 2b: estrazione delle funzionalità", + "step3a:正在训练模型": "Passaggio 3a: è iniziato l'addestramento del modello", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Addestramento completato. ", + "全流程结束!": "Tutti i processi sono stati completati!", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "Questo software è open source con licenza MIT. L'autore non ha alcun controllo sul software; chi utilizza il software o ne diffonde l'audio esportato se ne assume la piena responsabilità.&#10;
Se non si accetta questa clausola, non è possibile utilizzare o fare riferimento a codici e file all'interno del pacchetto software. Contratto-LICENZA.txt per dettagli.", + "模型推理": "Inferenza del modello", + "推理音色": "Voce di inferenza:", + "刷新音色列表和索引路径": "Aggiorna l'elenco delle voci e il percorso dell'indice", + "卸载音色省显存": "Scarica la voce per risparmiare memoria della GPU:", + "请选择说话人id": "Seleziona ID locutore/cantante:", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Tonalità +12 consigliata per la conversione da maschio a femmina e tonalità -12 per la conversione da femmina a maschio. ", + "变调(整数, 半音数量, 升八度12降八度-12)": "Trasposizione (numero intero, numero di semitoni, alza di un'ottava: 12, abbassa di un'ottava: -12):", + "输入待处理音频文件路径(默认是正确格式示例)": "Immettere il percorso del file audio da elaborare (l'impostazione predefinita è l'esempio di formato corretto):", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "Seleziona l'algoritmo di estrazione del tono (\"pm\": estrazione più veloce ma risultato di qualità inferiore; \"harvest\": bassi migliori ma estremamente lenti; \"crepe\": qualità migliore ma utilizzo intensivo della GPU):", + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": "Se >=3: applica il filtro mediano ai risultati del pitch raccolto. ", + "特征检索库文件路径,为空则使用下拉的选择结果": "Percorso del file di indice delle caratteristiche. ", + "自动检测index路径,下拉式选择(dropdown)": "Rileva automaticamente il percorso dell'indice e seleziona dal menu a tendina:", + "特征文件路径": "Percorso del file delle caratteristiche:", + "检索特征占比": "Rapporto funzionalità di ricerca (controlla la forza dell'accento, troppo alto ha artefatti):", + "后处理重采样至最终采样率,0为不进行重采样": "Ricampiona l'audio di output in post-elaborazione alla frequenza di campionamento finale. ", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Regola il ridimensionamento dell'inviluppo del volume. ", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "Proteggi le consonanti senza voce e i suoni del respiro per evitare artefatti come il tearing nella musica elettronica. ", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "File curva F0 (opzionale). ", + "转换": "Convertire", + "输出信息": "Informazioni sull'uscita", + "输出音频(右下角三个点,点了可以下载)": "Esporta audio (clicca sui tre puntini in basso a destra per scaricarlo)", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Conversione massiva. Inserisci il percorso della cartella che contiene i file da convertire o carica più file audio. I file convertiti finiranno nella cartella specificata. (default: opt) ", + "指定输出文件夹": "Specifica la cartella di output:", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Immettere il percorso della cartella audio da elaborare (copiarlo dalla barra degli indirizzi del file manager):", + "也可批量输入音频文件, 二选一, 优先读文件夹": "Puoi anche inserire file audio in massa. ", + "导出文件格式": "Formato file di esportazione", + "伴奏人声分离&去混响&去回声": "Separazione voce/accompagnamento", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Elaborazione batch per la separazione dell'accompagnamento vocale utilizzando il modello UVR5.
Esempio di un formato di percorso di cartella valido: D:\\path\\to\\input\\folder (copialo dalla barra degli indirizzi del file manager).
Il modello è suddiviso in tre categorie:
1. Conserva la voce: scegli questa opzione per l'audio senza armonie. Conserva la voce principale meglio di HP5. Include due modelli integrati: HP2 e HP3. HP3 può lasciar passare leggermente l'accompagnamento, ma conserva la voce principale un po' meglio di HP2.&#10;
2. Mantieni solo la voce principale: scegli questa opzione per l'audio con armonie. Potrebbe indebolire la voce principale. Include un modello integrato: HP5.&#10;
3. Modelli di de-riverbero e de-delay (di FoxJoy):
  (1) MDX-Net: la scelta migliore per la rimozione del riverbero stereo ma non può rimuovere il riverbero mono;

Note di de-riverbero/de-delay:
1. Il tempo di elaborazione per il modello DeEcho-DeReverb è circa il doppio rispetto agli altri due modelli DeEcho.
2. Il modello MDX-Net-Dereverb è piuttosto lento.
3. La configurazione più pulita consigliata consiste nell'applicare prima MDX-Net e poi DeEcho-Aggressive.", + "输入待处理音频文件夹路径": "Immettere il percorso della cartella audio da elaborare:", + "模型": "Modello", + "指定输出主人声文件夹": "Specifica la cartella di output per le voci:", + "指定输出非主人声文件夹": "Specificare la cartella di output per l'accompagnamento:", + "训练": "Addestramento", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Passaggio 1: compilare la configurazione sperimentale. ", + "输入实验名": "Inserisci il nome dell'esperimento:", + "目标采样率": "Frequenza di campionamento target:", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "Se il modello ha una guida del tono (necessario per il canto, facoltativo per il parlato):", + "版本": "Versione", + "提取音高和处理数据使用的CPU进程数": "Numero di processi CPU utilizzati per l'estrazione del tono e l'elaborazione dei dati:", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Passaggio 2a: attraversa automaticamente tutti i file nella cartella di addestramento che possono essere decodificati in audio ed esegui la normalizzazione delle sezioni. ", + "输入训练文件夹路径": "Inserisci il percorso della cartella di addestramento:", + "请指定说话人id": "Si prega di specificare l'ID del locutore/cantante:", + "处理数据": "Processa dati", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Passaggio 2b: utilizzare la CPU per estrarre il tono (se il modello ha il tono), utilizzare la GPU per estrarre le caratteristiche (selezionare l'indice GPU):", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "Inserisci gli indici GPU separati da '-', ad esempio 0-1-2 per utilizzare GPU 0, 1 e 2:", + "显卡信息": "Informazioni GPU", + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢": "Seleziona l'algoritmo di estrazione del tono (\"pm\": estrazione più rapida ma parlato di qualità inferiore; \"dio\": parlato migliorato ma estrazione più lenta; \"harvest\": migliore qualità ma estrazione più lenta):", + "特征提取": "Estrazione delle caratteristiche", + "step3: 填写训练设置, 开始训练模型和索引": "Passaggio 3: compilare le impostazioni di addestramento e avviare l'addestramento del modello e dell'indice", + "保存频率save_every_epoch": "Frequenza di salvataggio (save_every_epoch):", + "总训练轮数total_epoch": "Epoch totali di addestramento (total_epoch):", + "每张显卡的batch_size": "Dimensione batch per GPU:", + "是否仅保存最新的ckpt文件以节省硬盘空间": "Salva solo l'ultimo file '.ckpt' per risparmiare spazio su disco:", + "否": "NO", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "Memorizza nella cache tutti i set di addestramento nella memoria della GPU. 
", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "Salva un piccolo modello finale nella cartella \"weights\" in ogni punto di salvataggio:", + "加载预训练底模G路径": "Carica il percorso G del modello base pre-addestrato:", + "加载预训练底模D路径": "Carica il percorso D del modello base pre-addestrato:", + "训练模型": "Addestra modello", + "训练特征索引": "Addestra indice delle caratteristiche", + "一键训练": "Addestramento con un clic", + "ckpt处理": "Elaborazione ckpt", + "模型融合, 可用于测试音色融合": "Model fusion, può essere utilizzato per testare la fusione timbrica", + "A模型路径": "Percorso per il modello A:", + "B模型路径": "Percorso per il modello B:", + "A模型权重": "Peso (w) per il modello A:", + "模型是否带音高指导": "Se il modello ha una guida del tono:", + "要置入的模型信息": "Informazioni sul modello da posizionare:", + "保存的模型名不带后缀": "Nome del modello salvato (senza estensione):", + "模型版本型号": "Versione dell'architettura del modello:", + "融合": "Fusione", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Modifica le informazioni sul modello (supportato solo per i file di modello di piccole dimensioni estratti dalla cartella 'weights')", + "模型路径": "Percorso al modello:", + "要改的模型信息": "Informazioni sul modello da modificare:", + "保存的文件名, 默认空为和源文件同名": "Salva il nome del file (predefinito: uguale al file di origine):", + "修改": "Modificare", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "Visualizza le informazioni sul modello (supportato solo per file di modello piccoli estratti dalla cartella 'weights')", + "查看": "Visualizzazione", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "Estrazione del modello (inserire il percorso del modello di file di grandi dimensioni nella cartella \"logs\"). ", + "保存名": "Salva nome:", + "模型是否带音高指导,1是0否": "Se il modello ha una guida del tono (1: sì, 0: no):", + "提取": "Estrai", + "Onnx导出": "Esporta Onnx", + "RVC模型路径": "Percorso modello RVC:", + "Onnx输出路径": "Percorso di esportazione Onnx:", + "导出Onnx模型": "Esporta modello Onnx", + "常见问题解答": "FAQ (Domande frequenti)", + "招募音高曲线前端编辑器": "Reclutamento di redattori front-end per curve di tono", + "加开发群联系我xxxxx": "Unisciti al gruppo di sviluppo e contattami a xxxxx", + "点击查看交流、问题反馈群号": "Fare clic per visualizzare il numero del gruppo di comunicazione e feedback sui problemi", + "xxxxx": "xxxxx", + "加载模型": "Carica modello", + "Hubert模型": "Modello Hubert", + "选择.pth文件": "Seleziona il file .pth", + "选择.index文件": "Seleziona il file .index", + "选择.npy文件": "Seleziona il file .npy", + "输入设备": "Dispositivo di input", + "输出设备": "Dispositivo di uscita", + "音频设备(请使用同种类驱动)": "Dispositivo audio (utilizzare lo stesso tipo di driver)", + "响应阈值": "Soglia di risposta", + "音调设置": "Impostazioni del tono", + "Index Rate": "Tasso di indice", + "常规设置": "Impostazioni generali", + "采样长度": "Lunghezza del campione", + "淡入淡出长度": "Lunghezza dissolvenza", + "额外推理时长": "Tempo di inferenza extra", + "输入降噪": "Riduzione del rumore in ingresso", + "输出降噪": "Riduzione del rumore in uscita", + "性能设置": "Impostazioni delle prestazioni", + "开始音频转换": "Avvia la conversione audio", + "停止音频转换": "Arresta la conversione audio", + "推理时间(ms):": "Tempo di inferenza (ms):", + "请选择pth文件": "请选择pth 文件", + "请选择index文件": "请选择index文件", + "hubert模型路径不可包含中文": "hubert 模型路径不可包含中文", + "pth文件路径不可包含中文": "pth è un'app per il futuro", + "index文件路径不可包含中文": "index文件路径不可包含中文", + "音高算法": "音高算法", + "harvest进程数": "harvest进程数" +} diff --git a/i18n/ru-RU.json b/i18n/ru-RU.json new file mode 100644 index 0000000..5be2521 --- /dev/null +++ b/i18n/ru-RU.json @@ -0,0 +1,130 @@ +{ + "很遗憾您这没有能用的显卡来支持您训练": "К сожалению у вас нету видеокарты, которая поддерживает тренировку 
модели.", + "是": "Да", + "step1:正在处理数据": "Шаг 1: Переработка данных", + "step2a:无需提取音高": "Шаг 2а: Пропуск вытаскивания тональности", + "step2b:正在提取特征": "Шаг 2б: Вытаскивание черт", + "step3a:正在训练模型": "Шаг 3а: Тренировка модели начата", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Тренировка завершена. Вы можете проверить логи тренировки в консоли или в файле 'train.log' в папке модели.", + "全流程结束!": "Все процессы завершены!", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.", + "模型推理": "Обработка модели", + "推理音色": "Обработка голоса:", + "刷新音色列表和索引路径": "Обновить список голосов и индексов", + "卸载音色省显存": "Выгрузить голос для сохранения памяти видеокарты:", + "请选择说话人id": "Выбери айди голоса:", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Рекомендованно +12 для конвертирования мужского голоса в женский и -12 для конвертирования женского в мужской. Если диапазон голоса слищком велик и голос искажается, значение можно изменить на свой вкус.", + "变调(整数, 半音数量, 升八度12降八度-12)": "Высота голоса (число, полутоны, поднять на октаву: 12, понизить на октаву: -12):", + "输入待处理音频文件路径(默认是正确格式示例)": "Введите путь к аудиофайлу, который хотите переработать (по умолчанию введён правильный формат):", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "Выберите алгоритм вытаскивания тональности ('pm': быстрое извлечение но качество речи хуже; 'harvest': бассы лучше но очень медленный; 'crepe': лучшее качество но сильно использует видеокарту):", + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": "Если больше 3: применить медианную фильтрацию к вытащенным тональностям. Значение контролирует радиус фильтра и может уменьшить излишнее дыхание.", + "特征检索库文件路径,为空则使用下拉的选择结果": "Путь к файлу индекса черт. Оставьте пустым, чтобы использовать выбранный результат из списка:", + "自动检测index路径,下拉式选择(dropdown)": "Автоматически найти путь к индексу и выбрать его из списка:", + "特征文件路径": "Путь к файлу черт:", + "检索特征占比": "Соотношение поиска черт:", + "后处理重采样至最终采样率,0为不进行重采样": "Изменить частоту дискретизации в выходном файле на финальную. Поставьте 0, чтобы ничего не изменялось:", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Использовать громкость входного файла для замены или перемешивания с громкостью выходного файла. Чем ближе соотношение к 1, тем больше используется звука из выходного файла:", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "Защитить глухие согласные и звуки дыхания для предотвращения артефактов, например разрывание в электронной музыке. Поставьте на 0.5, чтобы выключить. Уменьшите значение для повышения защиты, но при этом может ухудшиться аккуратность индексирования:", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "Файл дуги F0 (не обязательно). Одна тональность на каждую строчку. Заменяет обычный F0 и модуляцию тональности:", + "转换": "Конвертировать", + "输出信息": "Выходная информация", + "输出音频(右下角三个点,点了可以下载)": "Экспортировать аудиофайл (нажми на три точки в правом нижнем углу для загрузки)", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Конвертировать пачкой. Введите путь к папке, в которой находятся файлы для конвертирования или выложите несколько аудиофайлов. Сконвертированные файлы будут сохранены в указанной папке (по умолчанию 'opt').", + "指定输出文件夹": "Укажите выходную папку:", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Введите путь к папке с аудио для переработки:", + "也可批量输入音频文件, 二选一, 优先读文件夹": "Вы также можете выложить аудиофайлы пачкой. Выберите одно из двух. Приоритет отдаётся считыванию из папки.", + "导出文件格式": "Формат выходного файла", + "伴奏人声分离&去混响&去回声": "Отделение вокала/инструментала и убирание эхо", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Пакетная обработка для разделения вокального сопровождения с использованием модели UVR5.
Пример допустимого формата пути к папке: D:\\path\\to\\input\\folder
Модель разделена на три категории:
1. Сохранить вокал: выберите этот вариант для звука без гармоний. Он сохраняет вокал лучше, чем HP5. Он включает в себя две встроенные модели: HP2 и HP3. HP3 может немного пропускать инструментал, но сохраняет вокал немного лучше, чем HP2.
2. Сохранить только основной вокал: выберите этот вариант для звука с гармониями. Это может ослабить основной вокал. Он включает одну встроенную модель: HP5.
3. Модели удаления реверберации и задержки (от FoxJoy):
  (1) MDX-Net: лучший выбор для удаления стереореверберации, но он не может удалить монореверберацию;
 (234) DeEcho: удаляет эффекты задержки. Агрессивный режим удаляет более тщательно, чем Нормальный режим. DeReverb дополнительно удаляет реверберацию и может удалять монореверберацию, но не очень эффективно для сильно реверберированного высокочастотного контента.
Примечания по удалению реверберации/задержки:
1. Время обработки для модели DeEcho-DeReverb примерно в два раза больше, чем для двух других моделей DeEcho.
2. Модель MDX-Net-Dereverb довольно медленная.
3. Рекомендуемая самая чистая конфигурация — сначала применить MDX-Net, а затем DeEcho-Aggressive.", + "输入待处理音频文件夹路径": "Введите путь к папке с аудиофайлами для переработки:", + "模型": "Модели", + "指定输出主人声文件夹": "Введите путь к папке для вокала:", + "指定输出非主人声文件夹": "Введите путь к папке для инструментала:", + "训练": "Тренировка", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Шаг 1: Заполните настройки модели. Данные модели сохранены в папку 'logs' и для каждой модели создаётся отдельная папка. Введите вручную путь к настройкам для модели, в которой находятся логи и тренировочные файлы.", + "输入实验名": "Введите название модели:", + "目标采样率": "Частота дискретизации модели:", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "Наведение по тональности у модели (обязательно для пения, необязательно для речи):", + "版本": "Версия", + "提取音高和处理数据使用的CPU进程数": "Число процессов ЦП, используемое для вытаскивания тональностей и обрабротки данных:", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Шаг 2а: Автоматически пройтись по всем аудиофайлам в папке тренировки и нормализировать куски. Создаст 2 папки wav в папке модели. В данных момент поддерживается тренировка только одного голоса.", + "输入训练文件夹路径": "Введите путь к папке тренировки:", + "请指定说话人id": "Введите айди голоса:", + "处理数据": "Переработать данные", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Шаг 2б: Вытащить тональности с помошью процессора (если в модели есть тональности), вытащить черты с помощью видеокарты (выберите какой):", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "Введите, какие(-ую) видеокарты(-у) хотите использовать через '-', например 0-1-2, чтобы использовать видеокарту 0, 1 и 2:", + "显卡信息": "Информация о видеокартах", + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢": "Выберите алгоритм вытаскивания тональности ('pm': быстрое извлечение но качество речи хуже; 'harvest': бассы лучше но очень медленный; 'crepe': лучшее качество но сильно использует видеокарту):", + "特征提取": "Вытаскивание черт", + "step3: 填写训练设置, 开始训练模型和索引": "Шаг 3: Заполните остальные настройки тренировки и начните тренировать модель и индекс", + "保存频率save_every_epoch": "Частота сохранения (save_every_epoch):", + "总训练轮数total_epoch": "Полное количество эпох (total_epoch):", + "每张显卡的batch_size": "Размер пачки для видеокарты:", + "是否仅保存最新的ckpt文件以节省硬盘空间": "Сохранять только последний файл '.ckpt', чтобы сохранить место на диске:", + "否": "Нет", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "Кэшировать все тренировочные сеты в видеопамять. 
Кэширование маленький датасетов (меньше 10 минут) может ускорить тренировку, но кэширование больших, наоборот, займёт много видеопамяти и не сильно ускорит тренировку:", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "Сохранять маленькую финальную модель в папку 'weights' на каждой точке сохранения:", + "加载预训练底模G路径": "Путь к натренированой базовой модели G:", + "加载预训练底模D路径": "Путь к натренированой базовой модели D:", + "训练模型": "Тренировать модель", + "训练特征索引": "Тренировать индекс черт", + "一键训练": "Тренировка одним нажатием", + "ckpt处理": "Обработка ckpt", + "模型融合, 可用于测试音色融合": "Слияние моделей, может быть использовано для проверки слияния тембра", + "A模型路径": "Путь к модели А:", + "B模型路径": "Путь к модели Б:", + "A模型权重": "Вес (w) модели А::", + "模型是否带音高指导": "Есть ли у модели наведение по тональности (1: да, 0: нет):", + "要置入的模型信息": "Информация о модели:", + "保存的模型名不带后缀": "Название сохранённой модели (без расширения):", + "模型版本型号": "Версия архитектуры модели:", + "融合": "Слияние", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Модифицировать информацию о модели (поддерживается только для маленких моделей, взятых из папки 'weights')", + "模型路径": "Путь к папке:", + "要改的模型信息": "Информация о модели, которую нужно модифицировать:", + "保存的文件名, 默认空为和源文件同名": "Название сохранённого файла (по умолчанию такое же, как и входного):", + "修改": "Модифицировать", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "Просмотреть информацию о модели (поддерживается только для маленких моделей, взятых из папки 'weights')", + "查看": "Просмотр", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "Вытаскивание модели (введите путь к большому файлу модели в папке 'logs'). Полезно, если Вам нужно заверщить тренировку и вручную достать и сохранить маленький файл модели, или если Вам нужно проверить незаконченную модель:", + "保存名": "Имя сохранённого файла:", + "模型是否带音高指导,1是0否": "Есть ли у модели наведение по тональности (1: да, 0: нет):", + "提取": "Вытащить", + "Onnx导出": "Экспортировать Onnx", + "RVC模型路径": "Путь к модели RVC:", + "Onnx输出路径": "Путь для экспотрированного Onnx:", + "导出Onnx模型": "Экспортировать Onnx модель", + "常见问题解答": "ЧаВО (Часто задаваемые вопросы)", + "招募音高曲线前端编辑器": "Использование фронтенд редакторов для тональных дуг", + "加开发群联系我xxxxx": "Присоединитесь к группе разработки и свяжитесь со мной по xxxxx", + "点击查看交流、问题反馈群号": "Нажмите, чтобы просмотреть номер группы коммуникации и отзывах о проблемах", + "xxxxx": "xxxxx", + "加载模型": "Загрузить модель", + "Hubert模型": "Модель Hubert", + "选择.pth文件": "Выбрать файл .pth", + "选择.index文件": "Выбрать файл .index", + "选择.npy文件": "Выбрать файл .npy", + "输入设备": "Входное устройство", + "输出设备": "Выходное устройство", + "音频设备(请使用同种类驱动)": "Аудио устройство (пожалуйста используйте такой=же тип драйвера)", + "响应阈值": "Порог ответа", + "音调设置": "Настройки тональности", + "Index Rate": "Темп индекса", + "常规设置": "Основные настройки", + "采样长度": "Длина сэмпла", + "淡入淡出长度": "Длина затухания", + "额外推理时长": "Доп. 
время переработки", + "输入降噪": "Уменьшения шума во входной информации", + "输出降噪": "Уменьшения шума во выходной информации", + "性能设置": "Настройки быстроты", + "开始音频转换": "Начать конвертацию аудио", + "停止音频转换": "Закончить конвертацию аудио", + "推理时间(ms):": "Время переработки (мс):", + "请选择pth文件": "请选择pth文件", + "请选择index文件": "请选择index文件", + "hubert模型路径不可包含中文": "hubert模型路径不可包含中文", + "pth文件路径不可包含中文": "pth文件路径不可包含中文", + "index文件路径不可包含中文": "index文件路径不可包含中文", + "音高算法": "音高算法", + "harvest进程数": "harvest进程数" +} diff --git a/i18n/tr_TR.json b/i18n/tr_TR.json new file mode 100644 index 0000000..26daae8 --- /dev/null +++ b/i18n/tr_TR.json @@ -0,0 +1,130 @@ +{ + "很遗憾您这没有能用的显卡来支持您训练": "Maalesef, eğitiminizi desteklemek için uyumlu bir GPU bulunmamaktadır.", + "是": "Evet", + "step1:正在处理数据": "Adım 1: Veri işleme", + "step2a:无需提取音高": "Adım 2a: Pitch çıkartma adımını atlama", + "step2b:正在提取特征": "Adım 2b: Özelliklerin çıkarılması", + "step3a:正在训练模型": "Adım 3a: Model eğitimi başladı", + "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Eğitim tamamlandı. Eğitim günlüklerini konsolda veya deney klasörü altındaki train.log dosyasında kontrol edebilirsiniz.", + "全流程结束!": "Tüm işlemler tamamlandı!", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "Bu yazılım, MIT lisansı altında açık kaynaklıdır. Yazarın yazılım üzerinde herhangi bir kontrolü yoktur. Yazılımı kullanan ve yazılım tarafından dışa aktarılan sesleri dağıtan kullanıcılar sorumludur.
Eğer bu maddeyle aynı fikirde değilseniz, yazılım paketi içindeki herhangi bir kod veya dosyayı kullanamaz veya referans göremezsiniz. Detaylar için kök dizindeki Agreement-LICENSE.txt dosyasına bakınız.", + "模型推理": "Model çıkartma (Inference)", + "推理音色": "Ses çıkartma (Inference):", + "刷新音色列表和索引路径": "Ses listesini ve indeks yolunu yenile", + "卸载音色省显存": "GPU bellek kullanımını azaltmak için sesi kaldır", + "请选择说话人id": "Konuşmacı/Şarkıcı No seçin:", + "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Erkekten kadına çevirmek için +12 tuş önerilir, kadından erkeğe çevirmek için ise -12 tuş önerilir. Eğer ses aralığı çok fazla genişler ve ses bozulursa, isteğe bağlı olarak uygun aralığa kendiniz de ayarlayabilirsiniz.", + "变调(整数, 半音数量, 升八度12降八度-12)": "Transpoze et (tamsayı, yarıton sayısıyla; bir oktav yükseltmek için: 12, bir oktav düşürmek için: -12):", + "输入待处理音频文件路径(默认是正确格式示例)": "İşlenecek ses dosyasının yolunu girin (varsayılan doğru format örneğidir):", + "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "Pitch algoritmasını seçin ('pm': daha hızlı çıkarır ancak daha düşük kaliteli konuşma; 'harvest': daha iyi konuşma sesi ancak son derece yavaş; 'crepe': daha da iyi kalite ancak GPU yoğunluğu gerektirir):", + ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": "Eğer >=3 ise, elde edilen pitch sonuçlarına median filtreleme uygula. Bu değer, filtre yarıçapını temsil eder ve nefesliliği azaltabilir.", + "特征检索库文件路径,为空则使用下拉的选择结果": "Özellik indeksi dosyasının yolunu belirtin. Seçilen sonucu kullanmak için boş bırakın veya açılır menüden seçim yapın.", + "自动检测index路径,下拉式选择(dropdown)": "İndeks yolunu otomatik olarak tespit et ve açılır menüden seçim yap.", + "特征文件路径": "Özellik dosyasının yolu:", + "检索特征占比": "Arama özelliği oranı (vurgu gücünü kontrol eder, çok yüksek olması sanal etkilere neden olur)", + "后处理重采样至最终采样率,0为不进行重采样": "Son işleme aşamasında çıktı sesini son örnekleme hızına yeniden örnekle. 0 değeri için yeniden örnekleme yapılmaz:", + "输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Sesin hacim zarfını ayarlayın. 0'a yakın değerler, sesin orijinal vokallerin hacmine benzer olmasını sağlar. Düşük bir değerle ses gürültüsünü maskeleyebilir ve hacmi daha doğal bir şekilde duyulabilir hale getirebilirsiniz. 1'e yaklaştıkça sürekli bir yüksek ses seviyesi elde edilir:", + "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果": "Sessiz ünsüzleri ve nefes seslerini koruyarak elektronik müzikte yırtılma gibi sanal hataların oluşmasını engeller. 0.5 olarak ayarlandığında devre dışı kalır. Değerin azaltılması korumayı artırabilir, ancak indeksleme doğruluğunu azaltabilir:", + "F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调": "F0 eğrisi dosyası (isteğe bağlı). Her satırda bir pitch değeri bulunur. Varsayılan F0 ve pitch modülasyonunu değiştirir:", + "转换": "Dönüştür", + "输出信息": "Çıkış bilgisi", + "输出音频(右下角三个点,点了可以下载)": "Ses dosyasını dışa aktar (indirmek için sağ alt köşedeki üç noktaya tıklayın)", + "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Toplu dönüştür. Dönüştürülecek ses dosyalarının bulunduğu klasörü girin veya birden çok ses dosyasını yükleyin. Dönüştürülen ses dosyaları belirtilen klasöre ('opt' varsayılan olarak) dönüştürülecektir", + "指定输出文件夹": "Çıkış klasörünü belirt:", + "输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "İşlenecek ses klasörünün yolunu girin (dosya yöneticisinin adres çubuğundan kopyalayın):", + "也可批量输入音频文件, 二选一, 优先读文件夹": "Toplu olarak ses dosyalarını da girebilirsiniz. İki seçenekten birini seçin. 
Öncelik klasörden okumaya verilir.", + "导出文件格式": "Dışa aktarma dosya formatı", + "伴奏人声分离&去混响&去回声": "Vokal/Müzik Ayrıştırma ve Yankı Giderme", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "Batch işleme kullanarak vokal eşlik ayrımı için UVR5 modeli kullanılır.
Geçerli bir klasör yol formatı örneği: D:\\path\\to\\input\\folder (dosya yöneticisi adres çubuğundan kopyalanır).
Model üç kategoriye ayrılır:
1. Vokalleri koru: Bu seçeneği, harmoni içermeyen sesler için kullanın. HP5'ten daha iyi bir şekilde vokalleri korur. İki dahili model içerir: HP2 ve HP3. HP3, eşlik sesini hafifçe sızdırabilir, ancak vokalleri HP2'den biraz daha iyi korur.
2. Sadece ana vokalleri koru: Bu seçeneği, harmoni içeren sesler için kullanın. Ana vokalleri zayıflatabilir. Bir dahili model içerir: HP5.
3. Reverb ve gecikme modelleri (FoxJoy tarafından):
  (1) MDX-Net: Stereo reverb'i kaldırmak için en iyi seçenek, ancak mono reverb'i kaldıramaz;
 (234) DeEcho: Gecikme efektlerini kaldırır. Agresif mod, Normal moda göre daha kapsamlı bir şekilde kaldırma yapar. DeReverb ayrıca reverb'i kaldırır ve mono reverb'i kaldırabilir, ancak yoğun yankılı yüksek frekanslı içerikler için çok etkili değildir.
Reverb/gecikme notları:
1. DeEcho-DeReverb modelinin işleme süresi diğer iki DeEcho modeline göre yaklaşık olarak iki kat daha uzundur.
2. MDX-Net-Dereverb modeli oldukça yavaştır.
3. Tavsiye edilen en temiz yapılandırma önce MDX-Net'i uygulamak ve ardından DeEcho-Aggressive uygulamaktır.", + "输入待处理音频文件夹路径": "İşlenecek ses klasörünün yolunu girin:", + "模型": "Model", + "指定输出主人声文件夹": "Vokal için çıkış klasörünü belirtin:", + "指定输出非主人声文件夹": "Müzik ve diğer sesler için çıkış klasörünü belirtin:", + "训练": "Eğitim", + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Adım 1: Deneysel yapılandırmayı doldurun. Deneysel veriler 'logs' klasöründe saklanır ve her bir deney için ayrı bir klasör vardır. Deneysel adı yolu manuel olarak girin; bu yol, deneysel yapılandırmayı, günlükleri ve eğitilmiş model dosyalarını içerir.", + "输入实验名": "Deneysel adı girin:", + "目标采样率": "Hedef örnekleme oranı:", + "模型是否带音高指导(唱歌一定要, 语音可以不要)": "Modelin ses yüksekliği (Pitch) rehberliği içerip içermediği (şarkı söyleme için şarttır, konuşma için isteğe bağlıdır):", + "版本": "Sürüm", + "提取音高和处理数据使用的CPU进程数": "Ses yüksekliği çıkartmak (Pitch) ve verileri işlemek için kullanılacak CPU işlemci sayısı:", + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Adım 2a: Eğitim klasöründe ses dosyalarını otomatik olarak gezinerek dilimleme normalizasyonu yapın. Deney dizini içinde 2 wav klasörü oluşturur. Şu anda sadece tek kişilik eğitim desteklenmektedir.", + "输入训练文件夹路径": "Eğitim klasörünün yolunu girin:", + "请指定说话人id": "Lütfen konuşmacı/sanatçı no belirtin:", + "处理数据": "Verileri işle", + "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Adım 2b: Ses yüksekliği (Pitch) çıkartmak için CPU kullanın (eğer model ses yüksekliği içeriyorsa), özellikleri çıkartmak için GPU kullanın (GPU indeksini seçin):", + "以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2": "GPU indekslerini '-' ile ayırarak girin, örneğin 0-1-2, GPU 0, 1 ve 2'yi kullanmak için:", + "显卡信息": "GPU Bilgisi", + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢": "Ses yüksekliği (Pitch) çıkartma algoritmasını seçin ('pm': daha hızlı çıkartma, ancak düşük kaliteli konuşma; 'dio': geliştirilmiş konuşma kalitesi, ancak daha yavaş çıkartma; 'harvest': daha iyi kalite, ancak daha da yavaş çıkartma):", + "特征提取": "Özellik çıkartma", + "step3: 填写训练设置, 开始训练模型和索引": "Adım 3: Eğitim ayarlarını doldurun ve modeli ve dizini eğitmeye başlayın", + "保存频率save_every_epoch": "Kaydetme sıklığı (save_every_epoch):", + "总训练轮数total_epoch": "Toplam eğitim turu (total_epoch):", + "每张显卡的batch_size": "Her GPU için yığın boyutu (batch_size):", + "是否仅保存最新的ckpt文件以节省硬盘空间": "Sadece en son '.ckpt' dosyasını kaydet:", + "否": "Hayır", + "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速": "Tüm eğitim verilerini GPU belleğine önbelleğe alıp almayacağınızı belirtin. 
Küçük veri setlerini (10 dakikadan az) önbelleğe almak eğitimi hızlandırabilir, ancak büyük veri setlerini önbelleğe almak çok fazla GPU belleği tüketir ve çok fazla hız artışı sağlamaz:", + "是否在每次保存时间点将最终小模型保存至weights文件夹": "Her kaydetme noktasında son küçük bir modeli 'weights' klasörüne kaydetmek için:", + "加载预训练底模G路径": "Önceden eğitilmiş temel G modelini yükleme yolu:", + "加载预训练底模D路径": "Önceden eğitilmiş temel D modelini yükleme yolu:", + "训练模型": "Modeli Eğit", + "训练特征索引": "Özellik Dizinini Eğit", + "一键训练": "Tek Tuşla Eğit", + "ckpt处理": "ckpt İşleme", + "模型融合, 可用于测试音色融合": "Model birleştirme, ses rengi birleştirmesi için kullanılabilir", + "A模型路径": "A Modeli Yolu:", + "B模型路径": "B Modeli Yolu:", + "A模型权重": "A Modeli Ağırlığı:", + "模型是否带音高指导": "Modelin ses yüksekliği rehberi içerip içermediği:", + "要置入的模型信息": "Eklemek için model bilgileri:", + "保存的模型名不带后缀": "Kaydedilecek model adı (uzantı olmadan):", + "模型版本型号": "Model mimari versiyonu:", + "融合": "Birleştir", + "修改模型信息(仅支持weights文件夹下提取的小模型文件)": "Model bilgilerini düzenle (sadece 'weights' klasöründen çıkarılan küçük model dosyaları desteklenir)", + "模型路径": "Model Yolu:", + "要改的模型信息": "Düzenlenecek model bilgileri:", + "保存的文件名, 默认空为和源文件同名": "Kaydedilecek dosya adı (varsayılan: kaynak dosya ile aynı):", + "修改": "Düzenle", + "查看模型信息(仅支持weights文件夹下提取的小模型文件)": "Model bilgilerini görüntüle (sadece 'weights' klasöründen çıkarılan küçük model dosyaları desteklenir)", + "查看": "Görüntüle", + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况": "Model çıkartma (büyük dosya modeli yolunu 'logs' klasöründe girin). Bu, eğitimi yarıda bırakmak istediğinizde ve manuel olarak küçük bir model dosyası çıkartmak ve kaydetmek istediğinizde veya bir ara modeli test etmek istediğinizde kullanışlıdır:", + "保存名": "Kaydetme Adı:", + "模型是否带音高指导,1是0否": "Modelin ses yüksekliği rehberi içerip içermediği (1: evet, 0: hayır):", + "提取": "Çıkart", + "Onnx导出": "Onnx Dışa Aktar", + "RVC模型路径": "RVC Model Yolu:", + "Onnx输出路径": "Onnx Dışa Aktarım Yolu:", + "导出Onnx模型": "Onnx Modeli Dışa Aktar", + "常见问题解答": "Sıkça Sorulan Sorular (SSS)", + "招募音高曲线前端编辑器": "Ses yükseklik eğrisi ön uç düzenleyicisi için işe alım", + "加开发群联系我xxxxx": "Geliştirme grubuna katılın ve benimle iletişime geçin: xxxxx", + "点击查看交流、问题反馈群号": "İletişim ve sorun geri bildirim grup numarasını görüntülemek için tıklayın", + "xxxxx": "xxxxx", + "加载模型": "Model yükle", + "Hubert模型": "Hubert Modeli", + "选择.pth文件": ".pth dosyası seç", + "选择.index文件": ".index dosyası seç", + "选择.npy文件": ".npy dosyası seç", + "输入设备": "Giriş cihazı", + "输出设备": "Çıkış cihazı", + "音频设备(请使用同种类驱动)": "Ses cihazı (aynı tür sürücüyü kullanın)", + "响应阈值": "Tepki eşiği", + "音调设置": "Pitch ayarları", + "Index Rate": "Index Oranı", + "常规设置": "Genel ayarlar", + "采样长度": "Örnekleme uzunluğu", + "淡入淡出长度": "Geçiş (Fade) uzunluğu", + "额外推理时长": "Ekstra çıkartma süresi", + "输入降噪": "Giriş gürültü azaltma", + "输出降噪": "Çıkış gürültü azaltma", + "性能设置": "Performans ayarları", + "开始音频转换": "Ses dönüştürmeyi başlat", + "停止音频转换": "Ses dönüştürmeyi durdur", + "推理时间(ms):": "Çıkarsama süresi (ms):", + "请选择pth文件": "Lütfen .pth dosyası seçin", + "请选择index文件": "Lütfen .index dosyası seçin", + "hubert模型路径不可包含中文": "hubert modeli yolu Çince karakter içeremez", + "pth文件路径不可包含中文": ".pth dosya yolu Çince karakter içeremez", + "index文件路径不可包含中文": ".index dosya yolu Çince karakter içeremez", + "音高算法": "音高算法", + "harvest进程数": "harvest进程数" +} diff --git a/i18n/zh_CN.json b/i18n/zh_CN.json index d7fbe21..31e5d85 100644 --- a/i18n/zh_CN.json +++ b/i18n/zh_CN.json @@ -7,7 
+7,7 @@ "step3a:正在训练模型": "step3a:正在训练模型", "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", "全流程结束!": "全流程结束!", - "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录使用需遵守的协议-LICENSE.txt.": "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录使用需遵守的协议-LICENSE.txt.", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.", "模型推理": "模型推理", "推理音色": "推理音色", "刷新音色列表和索引路径": "刷新音色列表和索引路径", @@ -37,6 +37,7 @@ "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹", "导出文件格式": "导出文件格式", "伴奏人声分离&去混响&去回声": "伴奏人声分离&去混响&去回声", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。", "输入待处理音频文件夹路径": "输入待处理音频文件夹路径", "模型": "模型", "指定输出主人声文件夹": "指定输出主人声文件夹", @@ -94,7 +95,6 @@ "Onnx导出": "Onnx导出", "RVC模型路径": "RVC模型路径", "Onnx输出路径": "Onnx输出路径", - "MoeVS模型": "MoeVS模型", "导出Onnx模型": "导出Onnx模型", "常见问题解答": "常见问题解答", "招募音高曲线前端编辑器": "招募音高曲线前端编辑器", @@ -121,5 +121,12 @@ "性能设置": "性能设置", "开始音频转换": "开始音频转换", "停止音频转换": "停止音频转换", - "推理时间(ms):": "推理时间(ms):" + "推理时间(ms):": "推理时间(ms):", + "请选择pth文件": "请选择pth文件", + "请选择index文件": "请选择index文件", + "hubert模型路径不可包含中文": "hubert模型路径不可包含中文", + "pth文件路径不可包含中文": "pth文件路径不可包含中文", + "index文件路径不可包含中文": "index文件路径不可包含中文", + "音高算法": "音高算法", + "harvest进程数": "harvest进程数" } diff --git a/i18n/zh_HK.json b/i18n/zh_HK.json index 4363837..5857c86 100644 --- a/i18n/zh_HK.json +++ b/i18n/zh_HK.json @@ -7,7 +7,7 @@ "step3a:正在训练模型": "step3a:正在训练模型", "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", "全流程结束!": "全流程结束!", - "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录使用需遵守的协议-LICENSE.txt.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。
如不認可該條款,則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄使用需遵守的協議-LICENSE.txt。", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。
如不認可該條款,則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄LICENSE。", "模型推理": "模型推理", "推理音色": "推理音色", "刷新音色列表和索引路径": "刷新音色列表和索引路徑", @@ -37,6 +37,7 @@ "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量輸入音頻檔案,二選一,優先讀資料夾", "导出文件格式": "導出檔格式", "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲", + "人声伴奏分离批量处理, 使用UVR5模型。&#10;
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。
有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。
模型分為三類:
1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;
2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。
3. 消除混響和延遲模型(由FoxJoy提供):
  (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;
 (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。
消除混響/延遲注意事項:
1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;
2. MDX-Net-Dereverb模型相當慢;
3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。", "输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑", "模型": "模型", "指定输出主人声文件夹": "指定输出主人声文件夹", @@ -94,7 +95,6 @@ "Onnx导出": "Onnx导出", "RVC模型路径": "RVC模型路径", "Onnx输出路径": "Onnx输出路径", - "MoeVS模型": "MoeSS模型", "导出Onnx模型": "导出Onnx模型", "常见问题解答": "常見問題解答", "招募音高曲线前端编辑器": "招募音高曲線前端編輯器", @@ -122,5 +122,11 @@ "开始音频转换": "開始音訊轉換", "停止音频转换": "停止音訊轉換", "推理时间(ms):": "推理時間(ms):", - "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。":"使用UVR5模型進行人聲伴奏分離的批次處理。
有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。
模型分為三類:
1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;
2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。
3. 消除混響和延遲模型(由FoxJoy提供):
  (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;
 (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。
消除混響/延遲注意事項:
1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;
2. MDX-Net-Dereverb模型相當慢;
3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。" + "请选择pth文件": "请选择pth文件", + "请选择index文件": "请选择index文件", + "hubert模型路径不可包含中文": "hubert模型路径不可包含中文", + "pth文件路径不可包含中文": "pth文件路径不可包含中文", + "index文件路径不可包含中文": "index文件路径不可包含中文", + "音高算法": "音高算法", + "harvest进程数": "harvest进程数" } diff --git a/i18n/zh_SG.json b/i18n/zh_SG.json index 1466e6d..2f2a73f 100644 --- a/i18n/zh_SG.json +++ b/i18n/zh_SG.json @@ -7,7 +7,7 @@ "step3a:正在训练模型": "step3a:正在训练模型", "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", "全流程结束!": "全流程结束!", - "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录使用需遵守的协议-LICENSE.txt.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。
如不認可該條款,則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄使用需遵守的協議-LICENSE.txt。", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。
如不認可該條款,則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄LICENSE。", "模型推理": "模型推理", "推理音色": "推理音色", "刷新音色列表和索引路径": "刷新音色列表和索引路徑", @@ -37,6 +37,7 @@ "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量輸入音頻檔案,二選一,優先讀資料夾", "导出文件格式": "導出檔格式", "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲", + "人声伴奏分离批量处理, 使用UVR5模型。&#10;
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。
有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。
模型分為三類:
1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;
2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。
3. 消除混響和延遲模型(由FoxJoy提供):
  (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;
 (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。
消除混響/延遲注意事項:
1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;
2. MDX-Net-Dereverb模型相當慢;
3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。", "输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑", "模型": "模型", "指定输出主人声文件夹": "指定输出主人声文件夹", @@ -94,7 +95,6 @@ "Onnx导出": "Onnx导出", "RVC模型路径": "RVC模型路径", "Onnx输出路径": "Onnx输出路径", - "MoeVS模型": "MoeSS模型", "导出Onnx模型": "导出Onnx模型", "常见问题解答": "常見問題解答", "招募音高曲线前端编辑器": "招募音高曲線前端編輯器", @@ -122,5 +122,11 @@ "开始音频转换": "開始音訊轉換", "停止音频转换": "停止音訊轉換", "推理时间(ms):": "推理時間(ms):", - "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。":"使用UVR5模型進行人聲伴奏分離的批次處理。
有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。
模型分為三類:
1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;
2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。
3. 消除混響和延遲模型(由FoxJoy提供):
  (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;
 (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。
消除混響/延遲注意事項:
1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;
2. MDX-Net-Dereverb模型相當慢;
3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。" + "请选择pth文件": "请选择pth文件", + "请选择index文件": "请选择index文件", + "hubert模型路径不可包含中文": "hubert模型路径不可包含中文", + "pth文件路径不可包含中文": "pth文件路径不可包含中文", + "index文件路径不可包含中文": "index文件路径不可包含中文", + "音高算法": "音高算法", + "harvest进程数": "harvest进程数" } diff --git a/i18n/zh_TW.json b/i18n/zh_TW.json index 578349a..c8f3340 100644 --- a/i18n/zh_TW.json +++ b/i18n/zh_TW.json @@ -7,7 +7,7 @@ "step3a:正在训练模型": "step3a:正在训练模型", "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", "全流程结束!": "全流程结束!", - "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录使用需遵守的协议-LICENSE.txt.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。
如不認可該條款,則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄使用需遵守的協議-LICENSE.txt。", + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。
如不認可該條款,則不能使用或引用軟體包內任何程式碼和檔案。詳見根目錄使用需遵守的協議-LICENSE.txt。", "模型推理": "模型推理", "推理音色": "推理音色", "刷新音色列表和索引路径": "刷新音色列表和索引路徑", @@ -37,6 +37,7 @@ "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量輸入音頻檔案,二選一,優先讀資料夾", "导出文件格式": "導出檔格式", "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲", + "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "使用UVR5模型進行人聲伴奏分離的批次處理。
有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。
模型分為三類:
1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;
2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。
3. 消除混響和延遲模型(由FoxJoy提供):
  (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;
 (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。
消除混響/延遲注意事項:
1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;
2. MDX-Net-Dereverb模型相當慢;
3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。", "输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑", "模型": "模型", "指定输出主人声文件夹": "指定输出主人声文件夹", @@ -94,7 +95,6 @@ "Onnx导出": "Onnx导出", "RVC模型路径": "RVC模型路径", "Onnx输出路径": "Onnx输出路径", - "MoeVS模型": "MoeSS模型", "导出Onnx模型": "导出Onnx模型", "常见问题解答": "常見問題解答", "招募音高曲线前端编辑器": "招募音高曲線前端編輯器", @@ -122,5 +122,11 @@ "开始音频转换": "開始音訊轉換", "停止音频转换": "停止音訊轉換", "推理时间(ms):": "推理時間(ms):", - "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
 (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。":"使用UVR5模型進行人聲伴奏分離的批次處理。
有效資料夾路徑格式的例子:D:\\path\\to\\input\\folder(從檔案管理員地址欄複製)。
模型分為三類:
1. 保留人聲:選擇這個選項適用於沒有和聲的音訊。它比HP5更好地保留了人聲。它包括兩個內建模型:HP2和HP3。HP3可能輕微漏出伴奏,但比HP2更好地保留了人聲;
2. 僅保留主人聲:選擇這個選項適用於有和聲的音訊。它可能會削弱主人聲。它包括一個內建模型:HP5。
3. 消除混響和延遲模型(由FoxJoy提供):
  (1) MDX-Net:對於立體聲混響的移除是最好的選擇,但不能移除單聲道混響;
 (234) DeEcho:移除延遲效果。Aggressive模式比Normal模式移除得更徹底。DeReverb另外移除混響,可以移除單聲道混響,但對於高頻重的板式混響移除不乾淨。
消除混響/延遲注意事項:
1. DeEcho-DeReverb模型的處理時間是其他兩個DeEcho模型的近兩倍;
2. MDX-Net-Dereverb模型相當慢;
3. 個人推薦的最乾淨配置是先使用MDX-Net,然後使用DeEcho-Aggressive。" + "请选择pth文件": "请选择pth文件", + "请选择index文件": "请选择index文件", + "hubert模型路径不可包含中文": "hubert模型路径不可包含中文", + "pth文件路径不可包含中文": "pth文件路径不可包含中文", + "index文件路径不可包含中文": "index文件路径不可包含中文", + "音高算法": "音高算法", + "harvest进程数": "harvest进程数" } diff --git a/infer_batch_rvc.py b/infer_batch_rvc.py index 311fe91..4ba8e05 100644 --- a/infer_batch_rvc.py +++ b/infer_batch_rvc.py @@ -8,8 +8,6 @@ import os, sys, pdb, torch now_dir = os.getcwd() sys.path.append(now_dir) -import argparse -import glob import sys import torch import tqdm as tq @@ -102,7 +100,7 @@ opt_path = sys.argv[5] model_path = sys.argv[6] index_rate = float(sys.argv[7]) device = sys.argv[8] -is_half = bool(sys.argv[9]) +is_half = sys.argv[9].lower() != "false" filter_radius = int(sys.argv[10]) resample_sr = int(sys.argv[11]) rms_mix_rate = float(sys.argv[12]) @@ -112,7 +110,7 @@ config = Config(device, is_half) now_dir = os.getcwd() sys.path.append(now_dir) from vc_infer_pipeline import VC -from infer_pack.models import ( +from lib.infer_pack.models import ( SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono, SynthesizerTrnMs768NSFsid, diff --git a/infer_uvr5.py b/infer_uvr5.py index 884c841..0ffdb5d 100644 --- a/infer_uvr5.py +++ b/infer_uvr5.py @@ -10,12 +10,12 @@ import importlib import numpy as np import hashlib, math from tqdm import tqdm -from uvr5_pack.lib_v5 import spec_utils -from uvr5_pack.utils import _get_name_params, inference -from uvr5_pack.lib_v5.model_param_init import ModelParameters +from lib.uvr5_pack.lib_v5 import spec_utils +from lib.uvr5_pack.utils import _get_name_params, inference +from lib.uvr5_pack.lib_v5.model_param_init import ModelParameters import soundfile as sf -from uvr5_pack.lib_v5.nets_new import CascadedNet -from uvr5_pack.lib_v5 import nets_61968KB as nets +from lib.uvr5_pack.lib_v5.nets_new import CascadedNet +from lib.uvr5_pack.lib_v5 import nets_61968KB as nets class _audio_pre_: @@ -31,7 +31,7 @@ class _audio_pre_: "agg": agg, "high_end_process": "mirroring", } - mp = ModelParameters("uvr5_pack/lib_v5/modelparams/4band_v2.json") + mp = ModelParameters("lib/uvr5_pack/lib_v5/modelparams/4band_v2.json") model = nets.CascadedASPPNet(mp.param["bins"] * 2) cpk = torch.load(model_path, map_location="cpu") model.load_state_dict(cpk) @@ -195,7 +195,7 @@ class _audio_pre_new: "agg": agg, "high_end_process": "mirroring", } - mp = ModelParameters("uvr5_pack/lib_v5/modelparams/4band_v3.json") + mp = ModelParameters("lib/uvr5_pack/lib_v5/modelparams/4band_v3.json") nout = 64 if "DeReverb" in model_path else 48 model = CascadedNet(mp.param["bins"] * 2, nout) cpk = torch.load(model_path, map_location="cpu") diff --git a/infer_pack/attentions.py b/lib/infer_pack/attentions.py similarity index 96% rename from infer_pack/attentions.py rename to lib/infer_pack/attentions.py index a5177ad..84d5c87 100644 --- a/infer_pack/attentions.py +++ b/lib/infer_pack/attentions.py @@ -5,9 +5,9 @@ import torch from torch import nn from torch.nn import functional as F -from infer_pack import commons -from infer_pack import modules -from infer_pack.modules import LayerNorm +from lib.infer_pack import commons +from lib.infer_pack import modules +from lib.infer_pack.modules import LayerNorm class Encoder(nn.Module): diff --git a/infer_pack/commons.py b/lib/infer_pack/commons.py similarity index 100% rename from infer_pack/commons.py rename to lib/infer_pack/commons.py diff --git a/infer_pack/models.py b/lib/infer_pack/models.py similarity index 93% rename from infer_pack/models.py rename to 
lib/infer_pack/models.py index 724bd88..eb73e78 100644 --- a/infer_pack/models.py +++ b/lib/infer_pack/models.py @@ -3,15 +3,15 @@ from time import time as ttime import torch from torch import nn from torch.nn import functional as F -from infer_pack import modules -from infer_pack import attentions -from infer_pack import commons -from infer_pack.commons import init_weights, get_padding +from lib.infer_pack import modules +from lib.infer_pack import attentions +from lib.infer_pack import commons +from lib.infer_pack.commons import init_weights, get_padding from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm -from infer_pack.commons import init_weights +from lib.infer_pack.commons import init_weights import numpy as np -from infer_pack import commons +from lib.infer_pack import commons class TextEncoder256(nn.Module): @@ -631,12 +631,17 @@ class SynthesizerTrnMs256NSFsid(nn.Module): o = self.dec(z_slice, pitchf, g=g) return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): + def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + if rate: + head = int(z_p.shape[2] * rate) + z_p = z_p[:, :, -head:] + x_mask = x_mask[:, :, -head:] + nsff0 = nsff0[:, -head:] z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + o = self.dec(z * x_mask, nsff0, g=g) return o, x_mask, (z, z_p, m_p, logs_p) @@ -742,12 +747,17 @@ class SynthesizerTrnMs768NSFsid(nn.Module): o = self.dec(z_slice, pitchf, g=g) return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): + def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + if rate: + head = int(z_p.shape[2] * rate) + z_p = z_p[:, :, -head:] + x_mask = x_mask[:, :, -head:] + nsff0 = nsff0[:, -head:] z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + o = self.dec(z * x_mask, nsff0, g=g) return o, x_mask, (z, z_p, m_p, logs_p) @@ -844,12 +854,16 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module): o = self.dec(z_slice, g=g) return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - def infer(self, phone, phone_lengths, sid, max_len=None): + def infer(self, phone, phone_lengths, sid, rate=None): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + if rate: + head = int(z_p.shape[2] * rate) + z_p = z_p[:, :, -head:] + x_mask = x_mask[:, :, -head:] z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], g=g) + o = self.dec(z * x_mask, g=g) return o, x_mask, (z, z_p, m_p, logs_p) @@ -946,12 +960,16 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module): o = self.dec(z_slice, g=g) return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - def infer(self, phone, phone_lengths, sid, max_len=None): + def infer(self, phone, phone_lengths, sid, rate=None): g = 
self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + if rate: + head = int(z_p.shape[2] * rate) + z_p = z_p[:, :, -head:] + x_mask = x_mask[:, :, -head:] z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], g=g) + o = self.dec(z * x_mask, g=g) return o, x_mask, (z, z_p, m_p, logs_p) diff --git a/lib/infer_pack/models_dml.py b/lib/infer_pack/models_dml.py new file mode 100644 index 0000000..958d7b2 --- /dev/null +++ b/lib/infer_pack/models_dml.py @@ -0,0 +1,1124 @@ +import math, pdb, os +from time import time as ttime +import torch +from torch import nn +from torch.nn import functional as F +from lib.infer_pack import modules +from lib.infer_pack import attentions +from lib.infer_pack import commons +from lib.infer_pack.commons import init_weights, get_padding +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from lib.infer_pack.commons import init_weights +import numpy as np +from lib.infer_pack import commons + + +class TextEncoder256(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(256, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, phone, pitch, lengths): + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class TextEncoder768(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(768, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, phone, pitch, lengths): + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + 
x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class ResidualCouplingBlock(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0, + ): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append( + modules.ResidualCouplingLayer( + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + mean_only=True, + ) + ) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + def remove_weight_norm(self): + for i in range(self.n_flows): + self.flows[i * 2].remove_weight_norm() + + +class PosteriorEncoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + def remove_weight_norm(self): + self.enc.remove_weight_norm() + + +class Generator(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=0, + ): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, 
bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, g=None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class SineGen(torch.nn.Module): + """Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold = 0, + flag_for_pulse=False) + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen (default False) + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(np.pi) or cos(0) + """ + + def __init__( + self, + samp_rate, + harmonic_num=0, + sine_amp=0.1, + noise_std=0.003, + voiced_threshold=0, + flag_for_pulse=False, + ): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + + def _f02uv(self, f0): + # generate uv signal + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + return uv.float() + + def forward(self, f0, upp): + """sine_tensor, uv = forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output sine_tensor: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + """ + with torch.no_grad(): + f0 = f0[:, None].transpose(1, 2) + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) + # fundamental component + f0_buf[:, :, 0] = f0[:, :, 0] + for idx in np.arange(self.harmonic_num): + f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( + idx + 2 + ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic + rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化 + rand_ini = torch.rand( + f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device + ) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化 + tmp_over_one *= upp + tmp_over_one = F.interpolate( + tmp_over_one.transpose(2, 1), + scale_factor=upp, + mode="linear", + align_corners=True, + ).transpose(2, 1) + rad_values = F.interpolate( + rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose( + 2, 1 + ) ####### + tmp_over_one %= 1 + tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 + cumsum_shift = torch.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + sine_waves = torch.sin( + torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi + ) + sine_waves = sine_waves * self.sine_amp + uv = self._f02uv(f0) + uv = F.interpolate( + uv.transpose(2, 1), scale_factor=upp, mode="nearest" + 
).transpose(2, 1) + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(torch.nn.Module): + """SourceModule for hn-nsf + SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0) + sampling_rate: sampling_rate in Hz + harmonic_num: number of harmonic above F0 (default: 0) + sine_amp: amplitude of sine source signal (default: 0.1) + add_noise_std: std of additive Gaussian noise (default: 0.003) + note that amplitude of noise in unvoiced is decided + by sine_amp + voiced_threshold: threhold to set U/V given F0 (default: 0) + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + uv (batchsize, length, 1) + """ + + def __init__( + self, + sampling_rate, + harmonic_num=0, + sine_amp=0.1, + add_noise_std=0.003, + voiced_threshod=0, + is_half=True, + ): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + self.is_half = is_half + # to produce sine waveforms + self.l_sin_gen = SineGen( + sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod + ) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x, upp=None): + sine_wavs, uv, _ = self.l_sin_gen(x, upp) + if self.is_half: + sine_wavs = sine_wavs.half() + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + return sine_merge, None, None # noise, uv + + +class GeneratorNSF(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + sr, + is_half=False, + ): + super(GeneratorNSF, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + + self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF( + sampling_rate=sr, harmonic_num=0, is_half=is_half + ) + self.noise_convs = nn.ModuleList() + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + c_cur = upsample_initial_channel // (2 ** (i + 1)) + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + if i + 1 < len(upsample_rates): + stride_f0 = np.prod(upsample_rates[i + 1 :]) + self.noise_convs.append( + Conv1d( + 1, + c_cur, + kernel_size=stride_f0 * 2, + stride=stride_f0, + padding=stride_f0 // 2, + ) + ) + else: + self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + self.upp 
= np.prod(upsample_rates) + + def forward(self, x, f0, g=None): + har_source, noi_source, uv = self.m_source(f0, self.upp) + har_source = har_source.transpose(1, 2) + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + x_source = self.noise_convs[i](har_source) + x = x + x_source + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +sr2sr = { + "32k": 32000, + "40k": 40000, + "48k": 48000, +} + + +class SynthesizerTrnMs256NSFsid(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + **kwargs + ): + super().__init__() + if type(sr) == type("strr"): + sr = sr2sr[sr] + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder256( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward( + self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds + ): # 这里ds是id,[bs,1] + # print(1,pitch.shape)#[bs,t] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) + pitchf = commons.slice_segments2(pitchf, 
ids_slice, self.segment_size) + # print(-2,pitchf.shape,z_slice.shape) + o = self.dec(z_slice, pitchf, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMs768NSFsid(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + **kwargs + ): + super().__init__() + if type(sr) == type("strr"): + sr = sr2sr[sr] + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder768( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward( + self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds + ): # 这里ds是id,[bs,1] + # print(1,pitch.shape)#[bs,t] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) + pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) + # print(-2,pitchf.shape,z_slice.shape) + o = self.dec(z_slice, pitchf, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): + g = self.emb_g(sid).unsqueeze(-1) + 
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMs256NSFsid_nono(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr=None, + **kwargs + ): + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder256( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=False, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + o = self.dec(z_slice, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, phone, phone_lengths, sid, max_len=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMs768NSFsid_nono(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + 
gin_channels, + sr=None, + **kwargs + ): + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder768( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=False, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + o = self.dec(z_slice, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, phone, phone_lengths, sid, max_len=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2, 3, 5, 7, 11, 17] + # periods = [3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class MultiPeriodDiscriminatorV2(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminatorV2, self).__init__() + # periods = [2, 3, 
5, 7, 11, 17] + periods = [2, 3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 1024, + 1024, + (kernel_size, 1), + 1, + padding=(get_padding(kernel_size, 1), 0), + ) + ), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap diff --git a/infer_pack/models_onnx.py b/lib/infer_pack/models_onnx.py similarity index 99% rename from infer_pack/models_onnx.py rename to lib/infer_pack/models_onnx.py index b0ed4a7..963e67b 100644 --- a/infer_pack/models_onnx.py +++ b/lib/infer_pack/models_onnx.py @@ -3,15 +3,15 @@ from time import time as ttime import torch from torch import nn from torch.nn import functional as F -from infer_pack import modules -from infer_pack import attentions -from infer_pack import commons -from infer_pack.commons import init_weights, get_padding 
+from lib.infer_pack import modules +from lib.infer_pack import attentions +from lib.infer_pack import commons +from lib.infer_pack.commons import init_weights, get_padding from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm -from infer_pack.commons import init_weights +from lib.infer_pack.commons import init_weights import numpy as np -from infer_pack import commons +from lib.infer_pack import commons class TextEncoder256(nn.Module): diff --git a/infer_pack/modules.py b/lib/infer_pack/modules.py similarity index 95% rename from infer_pack/modules.py rename to lib/infer_pack/modules.py index 95e2ea4..b54dc47 100644 --- a/infer_pack/modules.py +++ b/lib/infer_pack/modules.py @@ -9,9 +9,9 @@ from torch.nn import functional as F from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d from torch.nn.utils import weight_norm, remove_weight_norm -from infer_pack import commons -from infer_pack.commons import init_weights, get_padding -from infer_pack.transforms import piecewise_rational_quadratic_transform +from lib.infer_pack import commons +from lib.infer_pack.commons import init_weights, get_padding +from lib.infer_pack.transforms import piecewise_rational_quadratic_transform LRELU_SLOPE = 0.1 diff --git a/infer_pack/modules/F0Predictor/DioF0Predictor.py b/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py similarity index 95% rename from infer_pack/modules/F0Predictor/DioF0Predictor.py rename to lib/infer_pack/modules/F0Predictor/DioF0Predictor.py index ff12512..b5a8e3e 100644 --- a/infer_pack/modules/F0Predictor/DioF0Predictor.py +++ b/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py @@ -1,4 +1,4 @@ -from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor +from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor import pyworld import numpy as np diff --git a/infer_pack/modules/F0Predictor/F0Predictor.py b/lib/infer_pack/modules/F0Predictor/F0Predictor.py similarity index 100% rename from infer_pack/modules/F0Predictor/F0Predictor.py rename to lib/infer_pack/modules/F0Predictor/F0Predictor.py diff --git a/infer_pack/modules/F0Predictor/HarvestF0Predictor.py b/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py similarity index 94% rename from infer_pack/modules/F0Predictor/HarvestF0Predictor.py rename to lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py index 17acb3d..f8dae30 100644 --- a/infer_pack/modules/F0Predictor/HarvestF0Predictor.py +++ b/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py @@ -1,4 +1,4 @@ -from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor +from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor import pyworld import numpy as np diff --git a/infer_pack/modules/F0Predictor/PMF0Predictor.py b/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py similarity index 95% rename from infer_pack/modules/F0Predictor/PMF0Predictor.py rename to lib/infer_pack/modules/F0Predictor/PMF0Predictor.py index 5ee2c19..b70de29 100644 --- a/infer_pack/modules/F0Predictor/PMF0Predictor.py +++ b/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py @@ -1,4 +1,4 @@ -from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor +from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor import parselmouth import numpy as np diff --git a/infer_pack/modules/F0Predictor/__init__.py b/lib/infer_pack/modules/F0Predictor/__init__.py similarity index 100% rename from infer_pack/modules/F0Predictor/__init__.py rename 
to lib/infer_pack/modules/F0Predictor/__init__.py diff --git a/infer_pack/onnx_inference.py b/lib/infer_pack/onnx_inference.py similarity index 91% rename from infer_pack/onnx_inference.py rename to lib/infer_pack/onnx_inference.py index fb583a4..b4aba75 100644 --- a/infer_pack/onnx_inference.py +++ b/lib/infer_pack/onnx_inference.py @@ -33,19 +33,21 @@ class ContentVec: def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kargs): if f0_predictor == "pm": - from infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor + from lib.infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor f0_predictor_object = PMF0Predictor( hop_length=hop_length, sampling_rate=sampling_rate ) elif f0_predictor == "harvest": - from infer_pack.modules.F0Predictor.HarvestF0Predictor import HarvestF0Predictor + from lib.infer_pack.modules.F0Predictor.HarvestF0Predictor import ( + HarvestF0Predictor, + ) f0_predictor_object = HarvestF0Predictor( hop_length=hop_length, sampling_rate=sampling_rate ) elif f0_predictor == "dio": - from infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor + from lib.infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor f0_predictor_object = DioF0Predictor( hop_length=hop_length, sampling_rate=sampling_rate diff --git a/infer_pack/transforms.py b/lib/infer_pack/transforms.py similarity index 100% rename from infer_pack/transforms.py rename to lib/infer_pack/transforms.py diff --git a/uvr5_pack/lib_v5/dataset.py b/lib/uvr5_pack/lib_v5/dataset.py similarity index 99% rename from uvr5_pack/lib_v5/dataset.py rename to lib/uvr5_pack/lib_v5/dataset.py index ba0e45b..cfd01a1 100644 --- a/uvr5_pack/lib_v5/dataset.py +++ b/lib/uvr5_pack/lib_v5/dataset.py @@ -6,7 +6,7 @@ import torch import torch.utils.data from tqdm import tqdm -from uvr5_pack.lib_v5 import spec_utils +from . import spec_utils class VocalRemoverValidationSet(torch.utils.data.Dataset): diff --git a/uvr5_pack/lib_v5/layers.py b/lib/uvr5_pack/lib_v5/layers.py similarity index 98% rename from uvr5_pack/lib_v5/layers.py rename to lib/uvr5_pack/lib_v5/layers.py index 9835dc0..b82f06b 100644 --- a/uvr5_pack/lib_v5/layers.py +++ b/lib/uvr5_pack/lib_v5/layers.py @@ -2,7 +2,7 @@ import torch from torch import nn import torch.nn.functional as F -from uvr5_pack.lib_v5 import spec_utils +from . import spec_utils class Conv2DBNActiv(nn.Module): diff --git a/uvr5_pack/lib_v5/layers_123812KB .py b/lib/uvr5_pack/lib_v5/layers_123812KB .py similarity index 98% rename from uvr5_pack/lib_v5/layers_123812KB .py rename to lib/uvr5_pack/lib_v5/layers_123812KB .py index 9835dc0..b82f06b 100644 --- a/uvr5_pack/lib_v5/layers_123812KB .py +++ b/lib/uvr5_pack/lib_v5/layers_123812KB .py @@ -2,7 +2,7 @@ import torch from torch import nn import torch.nn.functional as F -from uvr5_pack.lib_v5 import spec_utils +from . import spec_utils class Conv2DBNActiv(nn.Module): diff --git a/uvr5_pack/lib_v5/layers_123821KB.py b/lib/uvr5_pack/lib_v5/layers_123821KB.py similarity index 98% rename from uvr5_pack/lib_v5/layers_123821KB.py rename to lib/uvr5_pack/lib_v5/layers_123821KB.py index 9835dc0..b82f06b 100644 --- a/uvr5_pack/lib_v5/layers_123821KB.py +++ b/lib/uvr5_pack/lib_v5/layers_123821KB.py @@ -2,7 +2,7 @@ import torch from torch import nn import torch.nn.functional as F -from uvr5_pack.lib_v5 import spec_utils +from . 
import spec_utils class Conv2DBNActiv(nn.Module): diff --git a/uvr5_pack/lib_v5/layers_33966KB.py b/lib/uvr5_pack/lib_v5/layers_33966KB.py similarity index 99% rename from uvr5_pack/lib_v5/layers_33966KB.py rename to lib/uvr5_pack/lib_v5/layers_33966KB.py index 78e5392..a38b7bb 100644 --- a/uvr5_pack/lib_v5/layers_33966KB.py +++ b/lib/uvr5_pack/lib_v5/layers_33966KB.py @@ -2,7 +2,7 @@ import torch from torch import nn import torch.nn.functional as F -from uvr5_pack.lib_v5 import spec_utils +from . import spec_utils class Conv2DBNActiv(nn.Module): diff --git a/uvr5_pack/lib_v5/layers_537227KB.py b/lib/uvr5_pack/lib_v5/layers_537227KB.py similarity index 99% rename from uvr5_pack/lib_v5/layers_537227KB.py rename to lib/uvr5_pack/lib_v5/layers_537227KB.py index 78e5392..a38b7bb 100644 --- a/uvr5_pack/lib_v5/layers_537227KB.py +++ b/lib/uvr5_pack/lib_v5/layers_537227KB.py @@ -2,7 +2,7 @@ import torch from torch import nn import torch.nn.functional as F -from uvr5_pack.lib_v5 import spec_utils +from . import spec_utils class Conv2DBNActiv(nn.Module): diff --git a/uvr5_pack/lib_v5/layers_537238KB.py b/lib/uvr5_pack/lib_v5/layers_537238KB.py similarity index 99% rename from uvr5_pack/lib_v5/layers_537238KB.py rename to lib/uvr5_pack/lib_v5/layers_537238KB.py index 78e5392..a38b7bb 100644 --- a/uvr5_pack/lib_v5/layers_537238KB.py +++ b/lib/uvr5_pack/lib_v5/layers_537238KB.py @@ -2,7 +2,7 @@ import torch from torch import nn import torch.nn.functional as F -from uvr5_pack.lib_v5 import spec_utils +from . import spec_utils class Conv2DBNActiv(nn.Module): diff --git a/uvr5_pack/lib_v5/layers_new.py b/lib/uvr5_pack/lib_v5/layers_new.py similarity index 99% rename from uvr5_pack/lib_v5/layers_new.py rename to lib/uvr5_pack/lib_v5/layers_new.py index 2441f2d..0c13e60 100644 --- a/uvr5_pack/lib_v5/layers_new.py +++ b/lib/uvr5_pack/lib_v5/layers_new.py @@ -2,7 +2,7 @@ import torch from torch import nn import torch.nn.functional as F -from uvr5_pack.lib_v5 import spec_utils +from . 
import spec_utils class Conv2DBNActiv(nn.Module): diff --git a/uvr5_pack/lib_v5/model_param_init.py b/lib/uvr5_pack/lib_v5/model_param_init.py similarity index 100% rename from uvr5_pack/lib_v5/model_param_init.py rename to lib/uvr5_pack/lib_v5/model_param_init.py diff --git a/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json b/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json similarity index 100% rename from uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json rename to lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json diff --git a/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json b/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json similarity index 100% rename from uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json rename to lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json diff --git a/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json b/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json similarity index 100% rename from uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json rename to lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json diff --git a/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json b/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json similarity index 100% rename from uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json rename to lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json diff --git a/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json b/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json similarity index 100% rename from uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json rename to lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json diff --git a/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json b/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json similarity index 100% rename from uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json rename to lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json diff --git a/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json b/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json similarity index 100% rename from uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json rename to lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json diff --git a/uvr5_pack/lib_v5/modelparams/2band_32000.json b/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json similarity index 100% rename from uvr5_pack/lib_v5/modelparams/2band_32000.json rename to lib/uvr5_pack/lib_v5/modelparams/2band_32000.json diff --git a/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json b/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json similarity index 100% rename from uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json rename to lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json diff --git a/uvr5_pack/lib_v5/modelparams/2band_48000.json b/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json similarity index 100% rename from uvr5_pack/lib_v5/modelparams/2band_48000.json rename to lib/uvr5_pack/lib_v5/modelparams/2band_48000.json diff --git a/uvr5_pack/lib_v5/modelparams/3band_44100.json b/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json similarity index 100% rename from uvr5_pack/lib_v5/modelparams/3band_44100.json rename to lib/uvr5_pack/lib_v5/modelparams/3band_44100.json diff --git a/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json b/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json similarity index 100% rename from uvr5_pack/lib_v5/modelparams/3band_44100_mid.json rename to 
lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json diff --git a/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json b/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json similarity index 100% rename from uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json rename to lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json diff --git a/uvr5_pack/lib_v5/modelparams/4band_44100.json b/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json similarity index 100% rename from uvr5_pack/lib_v5/modelparams/4band_44100.json rename to lib/uvr5_pack/lib_v5/modelparams/4band_44100.json diff --git a/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json b/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json similarity index 100% rename from uvr5_pack/lib_v5/modelparams/4band_44100_mid.json rename to lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json diff --git a/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json b/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json similarity index 100% rename from uvr5_pack/lib_v5/modelparams/4band_44100_msb.json rename to lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json diff --git a/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json b/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json similarity index 100% rename from uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json rename to lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json diff --git a/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json b/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json similarity index 100% rename from uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json rename to lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json diff --git a/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json b/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json similarity index 100% rename from uvr5_pack/lib_v5/modelparams/4band_44100_sw.json rename to lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json diff --git a/uvr5_pack/lib_v5/modelparams/4band_v2.json b/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json similarity index 100% rename from uvr5_pack/lib_v5/modelparams/4band_v2.json rename to lib/uvr5_pack/lib_v5/modelparams/4band_v2.json diff --git a/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json b/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json similarity index 100% rename from uvr5_pack/lib_v5/modelparams/4band_v2_sn.json rename to lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json diff --git a/uvr5_pack/lib_v5/modelparams/4band_v3.json b/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json similarity index 100% rename from uvr5_pack/lib_v5/modelparams/4band_v3.json rename to lib/uvr5_pack/lib_v5/modelparams/4band_v3.json diff --git a/uvr5_pack/lib_v5/modelparams/ensemble.json b/lib/uvr5_pack/lib_v5/modelparams/ensemble.json similarity index 100% rename from uvr5_pack/lib_v5/modelparams/ensemble.json rename to lib/uvr5_pack/lib_v5/modelparams/ensemble.json diff --git a/uvr5_pack/lib_v5/nets.py b/lib/uvr5_pack/lib_v5/nets.py similarity index 98% rename from uvr5_pack/lib_v5/nets.py rename to lib/uvr5_pack/lib_v5/nets.py index d4c376e..db4c5e3 100644 --- a/uvr5_pack/lib_v5/nets.py +++ b/lib/uvr5_pack/lib_v5/nets.py @@ -2,8 +2,8 @@ import torch from torch import nn import torch.nn.functional as F -from uvr5_pack.lib_v5 import layers -from uvr5_pack.lib_v5 import spec_utils +import layers +from . 
import spec_utils class BaseASPPNet(nn.Module): diff --git a/uvr5_pack/lib_v5/nets_123812KB.py b/lib/uvr5_pack/lib_v5/nets_123812KB.py similarity index 98% rename from uvr5_pack/lib_v5/nets_123812KB.py rename to lib/uvr5_pack/lib_v5/nets_123812KB.py index ea6c45c..becbfae 100644 --- a/uvr5_pack/lib_v5/nets_123812KB.py +++ b/lib/uvr5_pack/lib_v5/nets_123812KB.py @@ -2,7 +2,7 @@ import torch from torch import nn import torch.nn.functional as F -from uvr5_pack.lib_v5 import layers_123821KB as layers +from . import layers_123821KB as layers class BaseASPPNet(nn.Module): diff --git a/uvr5_pack/lib_v5/nets_123821KB.py b/lib/uvr5_pack/lib_v5/nets_123821KB.py similarity index 98% rename from uvr5_pack/lib_v5/nets_123821KB.py rename to lib/uvr5_pack/lib_v5/nets_123821KB.py index ea6c45c..becbfae 100644 --- a/uvr5_pack/lib_v5/nets_123821KB.py +++ b/lib/uvr5_pack/lib_v5/nets_123821KB.py @@ -2,7 +2,7 @@ import torch from torch import nn import torch.nn.functional as F -from uvr5_pack.lib_v5 import layers_123821KB as layers +from . import layers_123821KB as layers class BaseASPPNet(nn.Module): diff --git a/uvr5_pack/lib_v5/nets_33966KB.py b/lib/uvr5_pack/lib_v5/nets_33966KB.py similarity index 98% rename from uvr5_pack/lib_v5/nets_33966KB.py rename to lib/uvr5_pack/lib_v5/nets_33966KB.py index d2bddb1..b8986f9 100644 --- a/uvr5_pack/lib_v5/nets_33966KB.py +++ b/lib/uvr5_pack/lib_v5/nets_33966KB.py @@ -2,7 +2,7 @@ import torch from torch import nn import torch.nn.functional as F -from uvr5_pack.lib_v5 import layers_33966KB as layers +from . import layers_33966KB as layers class BaseASPPNet(nn.Module): diff --git a/uvr5_pack/lib_v5/nets_537227KB.py b/lib/uvr5_pack/lib_v5/nets_537227KB.py similarity index 98% rename from uvr5_pack/lib_v5/nets_537227KB.py rename to lib/uvr5_pack/lib_v5/nets_537227KB.py index 1ceac4a..a1bb530 100644 --- a/uvr5_pack/lib_v5/nets_537227KB.py +++ b/lib/uvr5_pack/lib_v5/nets_537227KB.py @@ -3,7 +3,7 @@ import numpy as np from torch import nn import torch.nn.functional as F -from uvr5_pack.lib_v5 import layers_537238KB as layers +from . import layers_537238KB as layers class BaseASPPNet(nn.Module): diff --git a/uvr5_pack/lib_v5/nets_537238KB.py b/lib/uvr5_pack/lib_v5/nets_537238KB.py similarity index 98% rename from uvr5_pack/lib_v5/nets_537238KB.py rename to lib/uvr5_pack/lib_v5/nets_537238KB.py index 1ceac4a..a1bb530 100644 --- a/uvr5_pack/lib_v5/nets_537238KB.py +++ b/lib/uvr5_pack/lib_v5/nets_537238KB.py @@ -3,7 +3,7 @@ import numpy as np from torch import nn import torch.nn.functional as F -from uvr5_pack.lib_v5 import layers_537238KB as layers +from . import layers_537238KB as layers class BaseASPPNet(nn.Module): diff --git a/uvr5_pack/lib_v5/nets_61968KB.py b/lib/uvr5_pack/lib_v5/nets_61968KB.py similarity index 98% rename from uvr5_pack/lib_v5/nets_61968KB.py rename to lib/uvr5_pack/lib_v5/nets_61968KB.py index ea6c45c..becbfae 100644 --- a/uvr5_pack/lib_v5/nets_61968KB.py +++ b/lib/uvr5_pack/lib_v5/nets_61968KB.py @@ -2,7 +2,7 @@ import torch from torch import nn import torch.nn.functional as F -from uvr5_pack.lib_v5 import layers_123821KB as layers +from . 
import layers_123821KB as layers class BaseASPPNet(nn.Module): diff --git a/uvr5_pack/lib_v5/nets_new.py b/lib/uvr5_pack/lib_v5/nets_new.py similarity index 77% rename from uvr5_pack/lib_v5/nets_new.py rename to lib/uvr5_pack/lib_v5/nets_new.py index c9898f6..bfaf72e 100644 --- a/uvr5_pack/lib_v5/nets_new.py +++ b/lib/uvr5_pack/lib_v5/nets_new.py @@ -1,7 +1,7 @@ import torch from torch import nn import torch.nn.functional as F -from uvr5_pack.lib_v5 import layers_new as layers +from . import layers_new class BaseNet(nn.Module): @@ -9,19 +9,19 @@ class BaseNet(nn.Module): self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6)) ): super(BaseNet, self).__init__() - self.enc1 = layers.Conv2DBNActiv(nin, nout, 3, 1, 1) - self.enc2 = layers.Encoder(nout, nout * 2, 3, 2, 1) - self.enc3 = layers.Encoder(nout * 2, nout * 4, 3, 2, 1) - self.enc4 = layers.Encoder(nout * 4, nout * 6, 3, 2, 1) - self.enc5 = layers.Encoder(nout * 6, nout * 8, 3, 2, 1) + self.enc1 = layers_new.Conv2DBNActiv(nin, nout, 3, 1, 1) + self.enc2 = layers_new.Encoder(nout, nout * 2, 3, 2, 1) + self.enc3 = layers_new.Encoder(nout * 2, nout * 4, 3, 2, 1) + self.enc4 = layers_new.Encoder(nout * 4, nout * 6, 3, 2, 1) + self.enc5 = layers_new.Encoder(nout * 6, nout * 8, 3, 2, 1) - self.aspp = layers.ASPPModule(nout * 8, nout * 8, dilations, dropout=True) + self.aspp = layers_new.ASPPModule(nout * 8, nout * 8, dilations, dropout=True) - self.dec4 = layers.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1) - self.dec3 = layers.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1) - self.dec2 = layers.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1) - self.lstm_dec2 = layers.LSTMModule(nout * 2, nin_lstm, nout_lstm) - self.dec1 = layers.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1) + self.dec4 = layers_new.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1) + self.dec3 = layers_new.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1) + self.dec2 = layers_new.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1) + self.lstm_dec2 = layers_new.LSTMModule(nout * 2, nin_lstm, nout_lstm) + self.dec1 = layers_new.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1) def __call__(self, x): e1 = self.enc1(x) @@ -52,7 +52,7 @@ class CascadedNet(nn.Module): self.stg1_low_band_net = nn.Sequential( BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm), - layers.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0), + layers_new.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0), ) self.stg1_high_band_net = BaseNet( @@ -61,7 +61,7 @@ class CascadedNet(nn.Module): self.stg2_low_band_net = nn.Sequential( BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm), - layers.Conv2DBNActiv(nout, nout // 2, 1, 1, 0), + layers_new.Conv2DBNActiv(nout, nout // 2, 1, 1, 0), ) self.stg2_high_band_net = BaseNet( nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2 diff --git a/uvr5_pack/lib_v5/spec_utils.py b/lib/uvr5_pack/lib_v5/spec_utils.py similarity index 100% rename from uvr5_pack/lib_v5/spec_utils.py rename to lib/uvr5_pack/lib_v5/spec_utils.py diff --git a/uvr5_pack/name_params.json b/lib/uvr5_pack/name_params.json similarity index 63% rename from uvr5_pack/name_params.json rename to lib/uvr5_pack/name_params.json index cb66091..950adcf 100644 --- a/uvr5_pack/name_params.json +++ b/lib/uvr5_pack/name_params.json @@ -4,92 +4,92 @@ "model_hash_name" : [ { "hash_name": "47939caf0cfe52a0e81442b85b971dfd", - "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", "param_name": "4band_44100" }, { "hash_name": 
"4e4ecb9764c50a8c414fee6e10395bbe", - "model_params": "uvr5_pack/lib_v5/modelparams/4band_v2.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", "param_name": "4band_v2" }, { "hash_name": "ca106edd563e034bde0bdec4bb7a4b36", - "model_params": "uvr5_pack/lib_v5/modelparams/4band_v2.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", "param_name": "4band_v2" }, { "hash_name": "e60a1e84803ce4efc0a6551206cc4b71", - "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", "param_name": "4band_44100" }, { "hash_name": "a82f14e75892e55e994376edbf0c8435", - "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", "param_name": "4band_44100" }, { "hash_name": "6dd9eaa6f0420af9f1d403aaafa4cc06", - "model_params": "uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", "param_name": "4band_v2_sn" }, { "hash_name": "08611fb99bd59eaa79ad27c58d137727", - "model_params": "uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", "param_name": "4band_v2_sn" }, { "hash_name": "5c7bbca45a187e81abbbd351606164e5", - "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", "param_name": "3band_44100_msb2" }, { "hash_name": "d6b2cb685a058a091e5e7098192d3233", - "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", "param_name": "3band_44100_msb2" }, { "hash_name": "c1b9f38170a7c90e96f027992eb7c62b", - "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", "param_name": "4band_44100" }, { "hash_name": "c3448ec923fa0edf3d03a19e633faa53", - "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", "param_name": "4band_44100" }, { "hash_name": "68aa2c8093d0080704b200d140f59e54", - "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/3band_44100.json", "param_name": "3band_44100" }, { "hash_name": "fdc83be5b798e4bd29fe00fe6600e147", - "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", "param_name": "3band_44100_mid.json" }, { "hash_name": "2ce34bc92fd57f55db16b7a4def3d745", - "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", "param_name": "3band_44100_mid.json" }, { "hash_name": "52fdca89576f06cf4340b74a4730ee5f", - "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", "param_name": "4band_44100.json" }, { "hash_name": "41191165b05d38fc77f072fa9e8e8a30", - "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", "param_name": "4band_44100.json" }, { "hash_name": "89e83b511ad474592689e562d5b1f80e", - "model_params": "uvr5_pack/lib_v5/modelparams/2band_32000.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", "param_name": 
"2band_32000.json" }, { "hash_name": "0b954da81d453b716b114d6d7c95177f", - "model_params": "uvr5_pack/lib_v5/modelparams/2band_32000.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", "param_name": "2band_32000.json" } @@ -97,47 +97,47 @@ "v4 Models": [ { "hash_name": "6a00461c51c2920fd68937d4609ed6c8", - "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json", "param_name": "1band_sr16000_hl512" }, { "hash_name": "0ab504864d20f1bd378fe9c81ef37140", - "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", "param_name": "1band_sr32000_hl512" }, { "hash_name": "7dd21065bf91c10f7fccb57d7d83b07f", - "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", "param_name": "1band_sr32000_hl512" }, { "hash_name": "80ab74d65e515caa3622728d2de07d23", - "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", "param_name": "1band_sr32000_hl512" }, { "hash_name": "edc115e7fc523245062200c00caa847f", - "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", "param_name": "1band_sr33075_hl384" }, { "hash_name": "28063e9f6ab5b341c5f6d3c67f2045b7", - "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", "param_name": "1band_sr33075_hl384" }, { "hash_name": "b58090534c52cbc3e9b5104bad666ef2", - "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", "param_name": "1band_sr44100_hl512" }, { "hash_name": "0cdab9947f1b0928705f518f3c78ea8f", - "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", "param_name": "1band_sr44100_hl512" }, { "hash_name": "ae702fed0238afb5346db8356fe25f13", - "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json", "param_name": "1band_sr44100_hl1024" } ] @@ -148,113 +148,113 @@ "1 Band": [ { "hash_name": "1band_sr16000_hl512", - "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json", "param_name": "1band_sr16000_hl512" }, { "hash_name": "1band_sr32000_hl512", - "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", "param_name": "1band_sr16000_hl512" }, { "hash_name": "1band_sr33075_hl384", - "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", "param_name": "1band_sr33075_hl384" }, { "hash_name": "1band_sr44100_hl256", - "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json", "param_name": "1band_sr44100_hl256" }, { "hash_name": "1band_sr44100_hl512", - "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", 
+ "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", "param_name": "1band_sr44100_hl512" }, { "hash_name": "1band_sr44100_hl1024", - "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json", "param_name": "1band_sr44100_hl1024" } ], "2 Band": [ { "hash_name": "2band_44100_lofi", - "model_params": "uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json", "param_name": "2band_44100_lofi" }, { "hash_name": "2band_32000", - "model_params": "uvr5_pack/lib_v5/modelparams/2band_32000.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", "param_name": "2band_32000" }, { "hash_name": "2band_48000", - "model_params": "uvr5_pack/lib_v5/modelparams/2band_48000.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/2band_48000.json", "param_name": "2band_48000" } ], "3 Band": [ { "hash_name": "3band_44100", - "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/3band_44100.json", "param_name": "3band_44100" }, { "hash_name": "3band_44100_mid", - "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", "param_name": "3band_44100_mid" }, { "hash_name": "3band_44100_msb2", - "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", "param_name": "3band_44100_msb2" } ], "4 Band": [ { "hash_name": "4band_44100", - "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", "param_name": "4band_44100" }, { "hash_name": "4band_44100_mid", - "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_mid.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json", "param_name": "4band_44100_mid" }, { "hash_name": "4band_44100_msb", - "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_msb.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json", "param_name": "4band_44100_msb" }, { "hash_name": "4band_44100_msb2", - "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json", "param_name": "4band_44100_msb2" }, { "hash_name": "4band_44100_reverse", - "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json", "param_name": "4band_44100_reverse" }, { "hash_name": "4band_44100_sw", - "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_sw.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json", "param_name": "4band_44100_sw" }, { "hash_name": "4band_v2", - "model_params": "uvr5_pack/lib_v5/modelparams/4band_v2.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", "param_name": "4band_v2" }, { "hash_name": "4band_v2_sn", - "model_params": "uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", "param_name": "4band_v2_sn" }, { "hash_name": "tmodelparam", - "model_params": "uvr5_pack/lib_v5/modelparams/tmodelparam.json", + "model_params": "lib/uvr5_pack/lib_v5/modelparams/tmodelparam.json", "param_name": "User Model Param Set" } ] diff --git 
a/uvr5_pack/utils.py b/lib/uvr5_pack/utils.py similarity index 97% rename from uvr5_pack/utils.py rename to lib/uvr5_pack/utils.py index 1d91f96..0fafe87 100644 --- a/uvr5_pack/utils.py +++ b/lib/uvr5_pack/utils.py @@ -4,7 +4,7 @@ from tqdm import tqdm import json -def load_data(file_name: str = "./uvr5_pack/name_params.json") -> dict: +def load_data(file_name: str = "./lib/uvr5_pack/name_params.json") -> dict: with open(file_name, "r") as f: data = json.load(f) diff --git a/poetry.lock b/poetry.lock index dd82b15..f7aad0a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,10 +1,9 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. [[package]] name = "absl-py" version = "1.4.0" description = "Abseil Python Common Libraries, see https://github.com/abseil/abseil-py." -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -16,7 +15,6 @@ files = [ name = "aiofiles" version = "23.1.0" description = "File support for asyncio." -category = "main" optional = false python-versions = ">=3.7,<4.0" files = [ @@ -28,7 +26,6 @@ files = [ name = "aiohttp" version = "3.8.4" description = "Async http client/server framework (asyncio)" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -137,7 +134,6 @@ speedups = ["Brotli", "aiodns", "cchardet"] name = "aiosignal" version = "1.3.1" description = "aiosignal: a list of registered asynchronous callbacks" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -152,7 +148,6 @@ frozenlist = ">=1.1.0" name = "altair" version = "4.2.2" description = "Altair: A declarative statistical visualization library for Python." -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -175,7 +170,6 @@ dev = ["black", "docutils", "flake8", "ipython", "m2r", "mistune (<2.0.0)", "pyt name = "antlr4-python3-runtime" version = "4.8" description = "ANTLR 4.8 runtime for Python 3.7" -category = "main" optional = false python-versions = "*" files = [ @@ -186,7 +180,6 @@ files = [ name = "anyio" version = "3.6.2" description = "High level compatibility layer for multiple asynchronous event loop implementations" -category = "main" optional = false python-versions = ">=3.6.2" files = [ @@ -207,7 +200,6 @@ trio = ["trio (>=0.16,<0.22)"] name = "async-timeout" version = "4.0.2" description = "Timeout context manager for asyncio programs" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -219,7 +211,6 @@ files = [ name = "attrs" version = "22.2.0" description = "Classes Without Boilerplate" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -238,7 +229,6 @@ tests-no-zope = ["cloudpickle", "cloudpickle", "hypothesis", "hypothesis", "mypy name = "audioread" version = "3.0.0" description = "multi-library, cross-platform audio decoding" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -249,7 +239,6 @@ files = [ name = "bitarray" version = "2.7.3" description = "efficient arrays of booleans -- C extension" -category = "main" optional = false python-versions = "*" files = [ @@ -346,7 +335,6 @@ files = [ name = "cachetools" version = "5.3.0" description = "Extensible memoizing collections and decorators" -category = "main" optional = false python-versions = "~=3.7" files = [ @@ -358,7 +346,6 @@ files = [ name = "certifi" version = "2022.12.7" description = "Python package for providing Mozilla's CA Bundle." 
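Note on the uvr5_pack hunks above: the UVR5 helper package moves from uvr5_pack/ to lib/uvr5_pack/, the nets*.py modules replace the absolute uvr5_pack.lib_v5 imports with intra-package ones (from . import ...), and every model_params path in name_params.json, as well as the load_data default path, now points under lib/. A minimal sketch of resolving a model hash through the relocated table is shown below; params_for_hash is a hypothetical helper written for illustration (it is not part of this change) and relies only on the hash_name / model_params fields visible in the hunk.

import json

def params_for_hash(model_hash, table_path="./lib/uvr5_pack/name_params.json"):
    # Illustration only: walk the nested lists/dicts of name_params.json instead
    # of assuming its top-level key names, and return the modelparams file
    # registered for the given model hash. Per the first entry above,
    # params_for_hash("47939caf0cfe52a0e81442b85b971dfd") would return
    # "lib/uvr5_pack/lib_v5/modelparams/4band_44100.json".
    with open(table_path, "r") as f:
        data = json.load(f)
    stack = [data]
    while stack:
        node = stack.pop()
        if isinstance(node, dict):
            if node.get("hash_name") == model_hash:
                return node.get("model_params")
            stack.extend(node.values())
        elif isinstance(node, list):
            stack.extend(node)
    return None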
-category = "main" optional = false python-versions = ">=3.6" files = [ @@ -370,7 +357,6 @@ files = [ name = "cffi" version = "1.15.1" description = "Foreign Function Interface for Python calling C code." -category = "main" optional = false python-versions = "*" files = [ @@ -447,7 +433,6 @@ pycparser = "*" name = "charset-normalizer" version = "3.1.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." -category = "main" optional = false python-versions = ">=3.7.0" files = [ @@ -532,7 +517,6 @@ files = [ name = "click" version = "8.1.3" description = "Composable command line interface toolkit" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -547,7 +531,6 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "cmake" version = "3.26.1" description = "CMake is an open-source, cross-platform family of tools designed to build, test and package software" -category = "main" optional = false python-versions = "*" files = [ @@ -577,7 +560,6 @@ test = ["codecov (>=2.0.5)", "coverage (>=4.2)", "flake8 (>=3.0.4)", "path.py (> name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." -category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ @@ -589,7 +571,6 @@ files = [ name = "contourpy" version = "1.0.7" description = "Python library for calculating contours of 2D quadrilateral grids" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -664,7 +645,6 @@ test-no-images = ["pytest"] name = "cycler" version = "0.11.0" description = "Composable style cycles" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -676,7 +656,6 @@ files = [ name = "cython" version = "0.29.34" description = "The Cython compiler for writing C extensions for the Python language." -category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -726,7 +705,6 @@ files = [ name = "decorator" version = "5.1.1" description = "Decorators for Humans" -category = "main" optional = false python-versions = ">=3.5" files = [ @@ -738,7 +716,6 @@ files = [ name = "entrypoints" version = "0.4" description = "Discover and load entry points from installed packages." -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -750,7 +727,6 @@ files = [ name = "fairseq" version = "0.12.2" description = "Facebook AI Research Sequence-to-Sequence Toolkit" -category = "main" optional = false python-versions = "*" files = [ @@ -780,7 +756,6 @@ tqdm = "*" name = "faiss-cpu" version = "1.7.3" description = "A library for efficient similarity search and clustering of dense vectors." 
-category = "main" optional = false python-versions = "*" files = [ @@ -815,7 +790,6 @@ files = [ name = "fastapi" version = "0.95.2" description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -837,7 +811,6 @@ test = ["anyio[trio] (>=3.2.1,<4.0.0)", "black (==23.1.0)", "coverage[toml] (>=6 name = "ffmpeg-python" version = "0.2.0" description = "Python bindings for FFmpeg - with complex filtering support" -category = "main" optional = false python-versions = "*" files = [ @@ -855,7 +828,6 @@ dev = ["Sphinx (==2.1.0)", "future (==0.17.1)", "numpy (==1.16.4)", "pytest (==4 name = "ffmpy" version = "0.3.0" description = "A simple Python wrapper for ffmpeg" -category = "main" optional = false python-versions = "*" files = [ @@ -866,7 +838,6 @@ files = [ name = "filelock" version = "3.10.7" description = "A platform independent file lock." -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -882,7 +853,6 @@ testing = ["covdefaults (>=2.3)", "coverage (>=7.2.2)", "diff-cover (>=7.5)", "p name = "fonttools" version = "4.39.3" description = "Tools to manipulate font files" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -908,7 +878,6 @@ woff = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "zopfli (>=0.1.4)"] name = "frozenlist" version = "1.3.3" description = "A list-like structure which implements collections.abc.MutableSequence" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -992,7 +961,6 @@ files = [ name = "fsspec" version = "2023.3.0" description = "File-system specification" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1026,7 +994,6 @@ tqdm = ["tqdm"] name = "functorch" version = "2.0.0" description = "JAX-like composable function transforms for PyTorch" -category = "main" optional = false python-versions = "*" files = [ @@ -1043,7 +1010,6 @@ aot = ["networkx"] name = "future" version = "0.18.3" description = "Clean single-source support for Python 3 and 2" -category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -1054,7 +1020,6 @@ files = [ name = "google-auth" version = "2.17.1" description = "Google Authentication Library" -category = "main" optional = false python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*" files = [ @@ -1079,7 +1044,6 @@ requests = ["requests (>=2.20.0,<3.0.0dev)"] name = "google-auth-oauthlib" version = "1.0.0" description = "Google Authentication Library" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1096,14 +1060,13 @@ tool = ["click (>=6.0.0)"] [[package]] name = "gradio" -version = "3.24.1" +version = "3.34.0" description = "Python library for easily interacting with trained machine learning models" -category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "gradio-3.24.1-py3-none-any.whl", hash = "sha256:a9b2fb94007b370ba5442159775a0987b27810451582525fe10711517082b1d2"}, - {file = "gradio-3.24.1.tar.gz", hash = "sha256:19f11c8b5d9b23a22a7e0e3a32332146474d215588674afdfdbe2f86fe0a7811"}, + {file = "gradio-3.34.0-py3-none-any.whl", hash = "sha256:1cd8b25b598d983561d64f0a039af819382f1376c676aa9f84972c46b6875741"}, + {file = "gradio-3.34.0.tar.gz", hash = "sha256:fd7fa7257ffc749f9dc7c297eba554eaa1e5acd1a5f9c973250b2080932d6a41"}, ] [package.dependencies] @@ -1112,9 +1075,9 @@ aiohttp = "*" altair = ">=4.2.0" fastapi = "*" ffmpy 
= "*" -gradio-client = ">=0.0.5" +gradio-client = ">=0.2.6" httpx = "*" -huggingface-hub = ">=0.13.0" +huggingface-hub = ">=0.14.0" jinja2 = "*" markdown-it-py = {version = ">=2.0.0", extras = ["linkify"]} markupsafe = "*" @@ -1126,28 +1089,29 @@ pandas = "*" pillow = "*" pydantic = "*" pydub = "*" +pygments = ">=2.12.0" python-multipart = "*" pyyaml = "*" requests = "*" semantic-version = "*" typing-extensions = "*" -uvicorn = "*" +uvicorn = ">=0.14.0" websockets = ">=10.0" [[package]] name = "gradio-client" -version = "0.0.7" +version = "0.2.7" description = "Python library for easily interacting with trained machine learning models" -category = "main" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "gradio_client-0.0.7-py3-none-any.whl", hash = "sha256:71887ff8ce9ce85b12dd1342eeed4ecdbe82c3b3d5a66f05f6dfcb17c9874631"}, - {file = "gradio_client-0.0.7.tar.gz", hash = "sha256:c7cababd6213fc3541b4cd68a69305dee496125d0d18bc9a1326484c49275858"}, + {file = "gradio_client-0.2.7-py3-none-any.whl", hash = "sha256:4a7ec6bb1341c626051f1ed24d50cb960ff1a4cd1a5db031dd4caaf1ee7d2d0a"}, + {file = "gradio_client-0.2.7.tar.gz", hash = "sha256:c83008df8a1dd3f81a290c0a24c03d0ab70317741991b60f713620ed39ad8f12"}, ] [package.dependencies] fsspec = "*" +httpx = "*" huggingface-hub = ">=0.13.0" packaging = "*" requests = "*" @@ -1158,7 +1122,6 @@ websockets = "*" name = "grpcio" version = "1.53.0" description = "HTTP/2-based RPC framework" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1216,7 +1179,6 @@ protobuf = ["grpcio-tools (>=1.53.0)"] name = "h11" version = "0.14.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1228,7 +1190,6 @@ files = [ name = "httpcore" version = "0.16.3" description = "A minimal low-level HTTP client." -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1240,17 +1201,16 @@ files = [ anyio = ">=3.0,<5.0" certifi = "*" h11 = ">=0.13,<0.15" -sniffio = ">=1.0.0,<2.0.0" +sniffio = "==1.*" [package.extras] http2 = ["h2 (>=3,<5)"] -socks = ["socksio (>=1.0.0,<2.0.0)"] +socks = ["socksio (==1.*)"] [[package]] name = "httpx" version = "0.23.3" description = "The next generation HTTP client." 
-category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1266,24 +1226,24 @@ sniffio = "*" [package.extras] brotli = ["brotli", "brotlicffi"] -cli = ["click (>=8.0.0,<9.0.0)", "pygments (>=2.0.0,<3.0.0)", "rich (>=10,<13)"] +cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<13)"] http2 = ["h2 (>=3,<5)"] -socks = ["socksio (>=1.0.0,<2.0.0)"] +socks = ["socksio (==1.*)"] [[package]] name = "huggingface-hub" -version = "0.13.3" +version = "0.15.1" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" -category = "main" optional = false python-versions = ">=3.7.0" files = [ - {file = "huggingface_hub-0.13.3-py3-none-any.whl", hash = "sha256:f73a298a55028575334f9670d86b8171a4dd890b320315f3ad28a20b9eb3b5bc"}, - {file = "huggingface_hub-0.13.3.tar.gz", hash = "sha256:1f95f65c5e7aa76728701402f55b697ee8a8b50234adda91fbdbb81038fbcd21"}, + {file = "huggingface_hub-0.15.1-py3-none-any.whl", hash = "sha256:05b0fb0abbf1f625dfee864648ac3049fe225ac4371c7bafaca0c2d3a2f83445"}, + {file = "huggingface_hub-0.15.1.tar.gz", hash = "sha256:a61b7d1a7769fe10119e730277c72ab99d95c48d86a3d6da3e9f3d0f632a4081"}, ] [package.dependencies] filelock = "*" +fsspec = "*" packaging = ">=20.9" pyyaml = ">=5.1" requests = "*" @@ -1291,13 +1251,13 @@ tqdm = ">=4.42.1" typing-extensions = ">=3.7.4.3" [package.extras] -all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "black (>=23.1,<24.0)", "jedi", "mypy (==0.982)", "pytest", "pytest-cov", "pytest-env", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3"] +all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "black (>=23.1,<24.0)", "gradio", "jedi", "mypy (==0.982)", "numpy", "pytest", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "urllib3 (<2.0)"] cli = ["InquirerPy (==0.3.4)"] -dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "black (>=23.1,<24.0)", "jedi", "mypy (==0.982)", "pytest", "pytest-cov", "pytest-env", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3"] +dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "black (>=23.1,<24.0)", "gradio", "jedi", "mypy (==0.982)", "numpy", "pytest", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "urllib3 (<2.0)"] fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"] quality = ["black (>=23.1,<24.0)", "mypy (==0.982)", "ruff (>=0.0.241)"] tensorflow = ["graphviz", "pydot", "tensorflow"] -testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "jedi", "pytest", "pytest-cov", "pytest-env", "pytest-xdist", "soundfile"] +testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "gradio", "jedi", "numpy", "pytest", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] torch = ["torch"] typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3"] @@ -1305,7 +1265,6 @@ typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "t name = "hydra-core" version = "1.0.7" description = "A framework for elegantly configuring complex applications" -category = "main" optional = false 
python-versions = "*" files = [ @@ -1322,7 +1281,6 @@ omegaconf = ">=2.0.5,<2.1" name = "idna" version = "3.4" description = "Internationalized Domain Names in Applications (IDNA)" -category = "main" optional = false python-versions = ">=3.5" files = [ @@ -1334,7 +1292,6 @@ files = [ name = "importlib-metadata" version = "6.1.0" description = "Read metadata from Python packages" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1354,7 +1311,6 @@ testing = ["flake8 (<5)", "flufl.flake8", "importlib-resources (>=1.3)", "packag name = "importlib-resources" version = "5.12.0" description = "Read resources from Python packages" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1373,7 +1329,6 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec name = "jinja2" version = "3.1.2" description = "A very fast and expressive template engine." -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1391,7 +1346,6 @@ i18n = ["Babel (>=2.7)"] name = "joblib" version = "1.2.0" description = "Lightweight pipelining with Python functions" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1403,7 +1357,6 @@ files = [ name = "json5" version = "0.9.11" description = "A Python implementation of the JSON5 data format." -category = "main" optional = false python-versions = "*" files = [ @@ -1418,7 +1371,6 @@ dev = ["hypothesis"] name = "jsonschema" version = "4.17.3" description = "An implementation of JSON Schema validation for Python" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1440,7 +1392,6 @@ format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339- name = "kiwisolver" version = "1.4.4" description = "A fast implementation of the Cassowary constraint solver" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1518,7 +1469,6 @@ files = [ name = "librosa" version = "0.9.2" description = "Python module for audio and music processing" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1541,14 +1491,13 @@ soundfile = ">=0.10.2" [package.extras] display = ["matplotlib (>=3.3.0)"] -docs = ["ipython (>=7.0)", "matplotlib (>=3.3.0)", "mir-eval (>=0.5)", "numba (<0.50)", "numpydoc", "presets", "sphinx (!=1.3.1)", "sphinx-gallery (>=0.7)", "sphinx-multiversion (>=0.2.3)", "sphinx-rtd-theme (>=1.0.0,<2.0.0)", "sphinxcontrib-svg2pdfconverter"] +docs = ["ipython (>=7.0)", "matplotlib (>=3.3.0)", "mir-eval (>=0.5)", "numba (<0.50)", "numpydoc", "presets", "sphinx (!=1.3.1)", "sphinx-gallery (>=0.7)", "sphinx-multiversion (>=0.2.3)", "sphinx-rtd-theme (==1.*)", "sphinxcontrib-svg2pdfconverter"] tests = ["contextlib2", "matplotlib (>=3.3.0)", "pytest", "pytest-cov", "pytest-mpl", "samplerate", "soxr"] [[package]] name = "linkify-it-py" version = "2.0.0" description = "Links recognition library with FULL unicode support." 
-category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1569,7 +1518,6 @@ test = ["coverage", "pytest", "pytest-cov"] name = "lit" version = "16.0.0" description = "A Software Testing Tool" -category = "main" optional = false python-versions = "*" files = [ @@ -1580,7 +1528,6 @@ files = [ name = "llvmlite" version = "0.39.0" description = "lightweight wrapper around basic LLVM functionality" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1618,7 +1565,6 @@ files = [ name = "lxml" version = "4.9.2" description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" files = [ @@ -1711,7 +1657,6 @@ source = ["Cython (>=0.29.7)"] name = "markdown" version = "3.4.3" description = "Python implementation of John Gruber's Markdown." -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1729,7 +1674,6 @@ testing = ["coverage", "pyyaml"] name = "markdown-it-py" version = "2.2.0" description = "Python port of markdown-it. Markdown parsing, done right!" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1755,7 +1699,6 @@ testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] name = "markupsafe" version = "2.1.2" description = "Safely add untrusted strings to HTML/XML markup." -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1815,7 +1758,6 @@ files = [ name = "matplotlib" version = "3.7.1" description = "Python plotting package" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1878,7 +1820,6 @@ python-dateutil = ">=2.7" name = "matplotlib-inline" version = "0.1.6" description = "Inline Matplotlib backend for Jupyter" -category = "main" optional = false python-versions = ">=3.5" files = [ @@ -1893,7 +1834,6 @@ traitlets = "*" name = "mdit-py-plugins" version = "0.3.3" description = "Collection of plugins for markdown-it-py" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1913,7 +1853,6 @@ testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] name = "mdurl" version = "0.1.2" description = "Markdown URL utilities" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1925,7 +1864,6 @@ files = [ name = "mpmath" version = "1.3.0" description = "Python library for arbitrary-precision floating-point arithmetic" -category = "main" optional = false python-versions = "*" files = [ @@ -1943,7 +1881,6 @@ tests = ["pytest (>=4.6)"] name = "multidict" version = "6.0.4" description = "multidict implementation" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2027,7 +1964,6 @@ files = [ name = "networkx" version = "3.1" description = "Python package for creating and manipulating graphs and networks" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -2046,7 +1982,6 @@ test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] name = "numba" version = "0.56.4" description = "compiling Python code using LLVM" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2082,7 +2017,7 @@ files = [ [package.dependencies] importlib-metadata = {version = "*", markers = "python_version < \"3.9\""} -llvmlite = ">=0.39.0dev0,<0.40" +llvmlite = "==0.39.*" numpy = ">=1.18,<1.24" setuptools = "*" @@ -2090,7 +2025,6 @@ setuptools = "*" name = "numpy" version = "1.23.5" description = 
"NumPy is the fundamental package for array computing with Python." -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -2128,7 +2062,6 @@ files = [ name = "nvidia-cublas-cu11" version = "11.10.3.66" description = "CUBLAS native runtime libraries" -category = "main" optional = false python-versions = ">=3" files = [ @@ -2144,7 +2077,6 @@ wheel = "*" name = "nvidia-cuda-cupti-cu11" version = "11.7.101" description = "CUDA profiling tools runtime libs." -category = "main" optional = false python-versions = ">=3" files = [ @@ -2160,7 +2092,6 @@ wheel = "*" name = "nvidia-cuda-nvrtc-cu11" version = "11.7.99" description = "NVRTC native runtime libraries" -category = "main" optional = false python-versions = ">=3" files = [ @@ -2177,7 +2108,6 @@ wheel = "*" name = "nvidia-cuda-runtime-cu11" version = "11.7.99" description = "CUDA Runtime native Libraries" -category = "main" optional = false python-versions = ">=3" files = [ @@ -2193,7 +2123,6 @@ wheel = "*" name = "nvidia-cudnn-cu11" version = "8.5.0.96" description = "cuDNN runtime libraries" -category = "main" optional = false python-versions = ">=3" files = [ @@ -2209,7 +2138,6 @@ wheel = "*" name = "nvidia-cufft-cu11" version = "10.9.0.58" description = "CUFFT native runtime libraries" -category = "main" optional = false python-versions = ">=3" files = [ @@ -2221,7 +2149,6 @@ files = [ name = "nvidia-curand-cu11" version = "10.2.10.91" description = "CURAND native runtime libraries" -category = "main" optional = false python-versions = ">=3" files = [ @@ -2237,7 +2164,6 @@ wheel = "*" name = "nvidia-cusolver-cu11" version = "11.4.0.1" description = "CUDA solver native runtime libraries" -category = "main" optional = false python-versions = ">=3" files = [ @@ -2254,7 +2180,6 @@ wheel = "*" name = "nvidia-cusparse-cu11" version = "11.7.4.91" description = "CUSPARSE native runtime libraries" -category = "main" optional = false python-versions = ">=3" files = [ @@ -2270,7 +2195,6 @@ wheel = "*" name = "nvidia-nccl-cu11" version = "2.14.3" description = "NVIDIA Collective Communication Library (NCCL) Runtime" -category = "main" optional = false python-versions = ">=3" files = [ @@ -2281,7 +2205,6 @@ files = [ name = "nvidia-nvtx-cu11" version = "11.7.91" description = "NVIDIA Tools Extension" -category = "main" optional = false python-versions = ">=3" files = [ @@ -2297,7 +2220,6 @@ wheel = "*" name = "oauthlib" version = "3.2.2" description = "A generic, spec-compliant, thorough implementation of the OAuth request-signing logic" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -2314,7 +2236,6 @@ signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] name = "omegaconf" version = "2.0.6" description = "A flexible configuration library" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -2330,7 +2251,6 @@ typing-extensions = "*" name = "orjson" version = "3.8.9" description = "Fast, correct Python JSON library supporting dataclasses, datetimes, and numpy" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2389,7 +2309,6 @@ files = [ name = "packaging" version = "23.0" description = "Core utilities for Python packages" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2401,7 +2320,6 @@ files = [ name = "pandas" version = "2.0.0" description = "Powerful data structures for data analysis, time series, and statistics" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -2436,6 +2354,7 @@ files = [ 
numpy = [ {version = ">=1.20.3", markers = "python_version < \"3.10\""}, {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, + {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -2468,7 +2387,6 @@ xml = ["lxml (>=4.6.3)"] name = "pillow" version = "9.3.0" description = "Python Imaging Library (Fork)" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2543,7 +2461,6 @@ tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "pa name = "pkgutil-resolve-name" version = "1.3.10" description = "Resolve a name to an object." -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -2555,7 +2472,6 @@ files = [ name = "platformdirs" version = "3.2.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2571,7 +2487,6 @@ test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.2.2)", "pytest- name = "pooch" version = "1.7.0" description = "\"Pooch manages your Python library's sample data files: it automatically downloads and stores them in a local directory, with support for versioning and corruption checks.\"" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2593,7 +2508,6 @@ xxhash = ["xxhash (>=1.4.3)"] name = "portalocker" version = "2.7.0" description = "Wraps the portalocker recipe for easy usage" -category = "main" optional = false python-versions = ">=3.5" files = [ @@ -2613,7 +2527,6 @@ tests = ["pytest (>=5.4.1)", "pytest-cov (>=2.8.1)", "pytest-mypy (>=0.8.0)", "p name = "praat-parselmouth" version = "0.4.3" description = "Praat in Python, the Pythonic way" -category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" files = [ @@ -2696,7 +2609,6 @@ numpy = ">=1.7.0" name = "protobuf" version = "3.20.3" description = "Protocol Buffers" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2728,7 +2640,6 @@ files = [ name = "pyasn1" version = "0.4.8" description = "ASN.1 types and codecs" -category = "main" optional = false python-versions = "*" files = [ @@ -2740,7 +2651,6 @@ files = [ name = "pyasn1-modules" version = "0.2.8" description = "A collection of ASN.1-based protocols modules." -category = "main" optional = false python-versions = "*" files = [ @@ -2755,7 +2665,6 @@ pyasn1 = ">=0.4.6,<0.5.0" name = "pycparser" version = "2.21" description = "C parser in Python" -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -2767,7 +2676,6 @@ files = [ name = "pydantic" version = "1.10.7" description = "Data validation and settings management using python type hints" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2820,7 +2728,6 @@ email = ["email-validator (>=1.0.3)"] name = "pydub" version = "0.25.1" description = "Manipulate audio with an simple and easy high level interface" -category = "main" optional = false python-versions = "*" files = [ @@ -2828,11 +2735,24 @@ files = [ {file = "pydub-0.25.1.tar.gz", hash = "sha256:980a33ce9949cab2a569606b65674d748ecbca4f0796887fd6f46173a7b0d30f"}, ] +[[package]] +name = "pygments" +version = "2.15.1" +description = "Pygments is a syntax highlighting package written in Python." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "Pygments-2.15.1-py3-none-any.whl", hash = "sha256:db2db3deb4b4179f399a09054b023b6a586b76499d36965813c71aa8ed7b5fd1"}, + {file = "Pygments-2.15.1.tar.gz", hash = "sha256:8ace4d3c1dd481894b2005f560ead0f9f19ee64fe983366be1a21e171d12775c"}, +] + +[package.extras] +plugins = ["importlib-metadata"] + [[package]] name = "pyparsing" version = "3.0.9" description = "pyparsing module - Classes and methods to define and execute parsing grammars" -category = "main" optional = false python-versions = ">=3.6.8" files = [ @@ -2847,7 +2767,6 @@ diagrams = ["jinja2", "railroad-diagrams"] name = "pyrsistent" version = "0.19.3" description = "Persistent/Functional/Immutable data structures" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2884,7 +2803,6 @@ files = [ name = "python-dateutil" version = "2.8.2" description = "Extensions to the standard Python datetime module" -category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" files = [ @@ -2899,7 +2817,6 @@ six = ">=1.5" name = "python-multipart" version = "0.0.6" description = "A streaming multipart parser for Python" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2914,7 +2831,6 @@ dev = ["atomicwrites (==1.2.1)", "attrs (==19.2.0)", "coverage (==6.5.0)", "hatc name = "pytz" version = "2023.3" description = "World timezone definitions, modern and historical" -category = "main" optional = false python-versions = "*" files = [ @@ -2926,7 +2842,6 @@ files = [ name = "pywin32" version = "306" description = "Python for Window Extensions" -category = "main" optional = false python-versions = "*" files = [ @@ -2950,7 +2865,6 @@ files = [ name = "pyworld" version = "0.3.2" description = "PyWorld is a Python wrapper for WORLD vocoder." -category = "main" optional = false python-versions = "*" files = [ @@ -2976,7 +2890,6 @@ test = ["nose"] name = "pyyaml" version = "6.0" description = "YAML parser and emitter for Python" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -3026,7 +2939,6 @@ files = [ name = "regex" version = "2023.3.23" description = "Alternative regular expression module, to replace re." -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -3096,7 +3008,6 @@ files = [ name = "requests" version = "2.31.0" description = "Python HTTP for Humans." -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3118,7 +3029,6 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] name = "requests-oauthlib" version = "1.3.1" description = "OAuthlib authentication support for Requests." 
-category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -3137,7 +3047,6 @@ rsa = ["oauthlib[signedtoken] (>=3.0.0)"] name = "resampy" version = "0.4.2" description = "Efficient signal resampling" -category = "main" optional = false python-versions = "*" files = [ @@ -3158,7 +3067,6 @@ tests = ["pytest (<8)", "pytest-cov", "scipy (>=1.0)"] name = "rfc3986" version = "1.5.0" description = "Validating URI References per RFC 3986" -category = "main" optional = false python-versions = "*" files = [ @@ -3176,7 +3084,6 @@ idna2008 = ["idna"] name = "rsa" version = "4.9" description = "Pure-Python RSA implementation" -category = "main" optional = false python-versions = ">=3.6,<4" files = [ @@ -3191,7 +3098,6 @@ pyasn1 = ">=0.1.3" name = "sacrebleu" version = "2.3.1" description = "Hassle-free computation of shareable, comparable, and reproducible BLEU, chrF, and TER scores" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -3215,7 +3121,6 @@ ko = ["mecab-ko (==1.0.0)", "mecab-ko-dic (>=1.0,<2.0)"] name = "scikit-learn" version = "1.2.2" description = "A set of python modules for machine learning and data mining" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -3258,7 +3163,6 @@ tests = ["black (>=22.3.0)", "flake8 (>=3.8.2)", "matplotlib (>=3.1.3)", "mypy ( name = "scipy" version = "1.9.3" description = "Fundamental algorithms for scientific computing in Python" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -3297,7 +3201,6 @@ test = ["asv", "gmpy2", "mpmath", "pytest", "pytest-cov", "pytest-xdist", "sciki name = "semantic-version" version = "2.10.0" description = "A library implementing the 'SemVer' scheme." -category = "main" optional = false python-versions = ">=2.7" files = [ @@ -3313,7 +3216,6 @@ doc = ["Sphinx", "sphinx-rtd-theme"] name = "setuptools" version = "67.7.2" description = "Easily download, build, install, upgrade, and uninstall Python packages" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3330,7 +3232,6 @@ testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs ( name = "six" version = "1.16.0" description = "Python 2 and 3 compatibility utilities" -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -3342,7 +3243,6 @@ files = [ name = "sniffio" version = "1.3.0" description = "Sniff out which async library your code is running under" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3354,7 +3254,6 @@ files = [ name = "soundfile" version = "0.12.1" description = "An audio library based on libsndfile, CFFI and NumPy" -category = "main" optional = false python-versions = "*" files = [ @@ -3378,7 +3277,6 @@ numpy = ["numpy"] name = "starlette" version = "0.27.0" description = "The little ASGI library that shines." 
-category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3397,7 +3295,6 @@ full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart", "pyyam name = "sympy" version = "1.11.1" description = "Computer algebra system (CAS) in Python" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -3412,7 +3309,6 @@ mpmath = ">=0.19" name = "tabulate" version = "0.9.0" description = "Pretty-print tabular data" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3427,7 +3323,6 @@ widechars = ["wcwidth"] name = "tensorboard" version = "2.12.1" description = "TensorBoard lets you watch Tensors Flow" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -3453,7 +3348,6 @@ wheel = ">=0.26" name = "tensorboard-data-server" version = "0.7.0" description = "Fast data loading for TensorBoard" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3466,7 +3360,6 @@ files = [ name = "tensorboard-plugin-wit" version = "1.8.1" description = "What-If Tool TensorBoard plugin." -category = "main" optional = false python-versions = "*" files = [ @@ -3477,7 +3370,6 @@ files = [ name = "tensorboardx" version = "2.6" description = "TensorBoardX lets you watch Tensors Flow without Tensorflow" -category = "main" optional = false python-versions = "*" files = [ @@ -3494,7 +3386,6 @@ protobuf = ">=3.8.0,<4" name = "threadpoolctl" version = "3.1.0" description = "threadpoolctl" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -3506,7 +3397,6 @@ files = [ name = "toolz" version = "0.12.0" description = "List processing tools and functional utilities" -category = "main" optional = false python-versions = ">=3.5" files = [ @@ -3518,7 +3408,6 @@ files = [ name = "torch" version = "2.0.0" description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" -category = "main" optional = false python-versions = ">=3.8.0" files = [ @@ -3574,7 +3463,6 @@ opt-einsum = ["opt-einsum (>=3.3)"] name = "torchaudio" version = "2.0.1" description = "An audio package for PyTorch" -category = "main" optional = false python-versions = "*" files = [ @@ -3607,7 +3495,6 @@ torch = "2.0.0" name = "torchgen" version = "0.0.1" description = "Ready to use implementations of state-of-the-art generative models in PyTorch" -category = "main" optional = false python-versions = ">=3.7, <4" files = [ @@ -3618,7 +3505,6 @@ files = [ name = "tornado" version = "6.3.2" description = "Tornado is a Python web framework and asynchronous networking library, originally developed at FriendFeed." 
-category = "main" optional = false python-versions = ">= 3.8" files = [ @@ -3639,7 +3525,6 @@ files = [ name = "tqdm" version = "4.65.0" description = "Fast, Extensible Progress Meter" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3660,7 +3545,6 @@ telegram = ["requests"] name = "traitlets" version = "5.9.0" description = "Traitlets Python configuration system" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3676,7 +3560,6 @@ test = ["argcomplete (>=2.0)", "pre-commit", "pytest", "pytest-mock"] name = "triton" version = "2.0.0" description = "A language and compiler for custom Deep Learning operations" -category = "main" optional = false python-versions = "*" files = [ @@ -3714,7 +3597,6 @@ tutorials = ["matplotlib", "pandas", "tabulate"] name = "typing-extensions" version = "4.5.0" description = "Backported and Experimental Type Hints for Python 3.7+" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3726,7 +3608,6 @@ files = [ name = "tzdata" version = "2023.3" description = "Provider of IANA time zone data" -category = "main" optional = false python-versions = ">=2" files = [ @@ -3738,7 +3619,6 @@ files = [ name = "uc-micro-py" version = "1.0.1" description = "Micro subset of unicode data files for linkify-it-py projects." -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -3753,7 +3633,6 @@ test = ["coverage", "pytest", "pytest-cov"] name = "urllib3" version = "1.26.15" description = "HTTP library with thread-safe connection pooling, file post, and more." -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" files = [ @@ -3770,7 +3649,6 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] name = "uvicorn" version = "0.21.1" description = "The lightning-fast ASGI server." -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3789,7 +3667,6 @@ standard = ["colorama (>=0.4)", "httptools (>=0.5.0)", "python-dotenv (>=0.13)", name = "websockets" version = "11.0" description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3869,7 +3746,6 @@ files = [ name = "werkzeug" version = "2.2.3" description = "The comprehensive WSGI web application library." 
-category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3887,7 +3763,6 @@ watchdog = ["watchdog"] name = "wheel" version = "0.40.0" description = "A built-package format for Python" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3902,7 +3777,6 @@ test = ["pytest (>=6.0.0)"] name = "yarl" version = "1.8.2" description = "Yet another URL library" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3990,7 +3864,6 @@ multidict = ">=4.0" name = "zipp" version = "3.15.0" description = "Backport of pathlib-compatible object wrapper for zip files" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -4004,5 +3877,5 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" -python-versions = "^3.8,<3.11" -content-hash = "309a8ea8b898a89daf57e99dc285f7a36914aa7eb821b5a24253bc034277293e" +python-versions = "^3.8" +content-hash = "400ac506bf8f14333fa2e073fd39cc765a1941aab895d5ed6f9dd264146fc726" diff --git a/pyproject.toml b/pyproject.toml index 2e9b8d3..5b15258 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ python = "^3.8" torch = "^2.0.0" torchaudio = "^2.0.1" Cython = "^0.29.34" -gradio = "^3.24.1" +gradio = "^3.34.0" future = "^0.18.3" pydub = "^0.25.1" soundfile = "^0.12.1" @@ -53,6 +53,7 @@ absl-py = "^1.4.0" audioread = "^3.0.0" uvicorn = "^0.21.1" colorama = "^0.4.6" +torchcrepe = "0.0.20" [tool.poetry.dev-dependencies] diff --git a/requirements.txt b/requirements.txt index d1a5b2b..ccd755c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,7 +21,6 @@ praat-parselmouth>=0.4.2 Pillow>=9.1.1 resampy>=0.4.2 scikit-learn -starlette>=0.25.0 tensorboard tensorboard-data-server tensorboard-plugin-wit @@ -43,5 +42,6 @@ uvicorn>=0.21.1 colorama>=0.4.5 pyworld>=0.3.2 httpx==0.23.0 -onnxruntime-gpu +#onnxruntime-gpu torchcrepe==0.0.20 +fastapi==0.88 diff --git a/rmvpe.py b/rmvpe.py new file mode 100644 index 0000000..17a748a --- /dev/null +++ b/rmvpe.py @@ -0,0 +1,432 @@ +import sys, torch, numpy as np, traceback, pdb +import torch.nn as nn +from time import time as ttime +import torch.nn.functional as F + + +class BiGRU(nn.Module): + def __init__(self, input_features, hidden_features, num_layers): + super(BiGRU, self).__init__() + self.gru = nn.GRU( + input_features, + hidden_features, + num_layers=num_layers, + batch_first=True, + bidirectional=True, + ) + + def forward(self, x): + return self.gru(x)[0] + + +class ConvBlockRes(nn.Module): + def __init__(self, in_channels, out_channels, momentum=0.01): + super(ConvBlockRes, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1), + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + nn.Conv2d( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1), + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + ) + if in_channels != out_channels: + self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1)) + self.is_shortcut = True + else: + self.is_shortcut = False + + def forward(self, x): + if self.is_shortcut: + return self.conv(x) + self.shortcut(x) + else: + return self.conv(x) + x + + +class Encoder(nn.Module): + def __init__( + self, + in_channels, + in_size, + n_encoders, + kernel_size, + n_blocks, + out_channels=16, + momentum=0.01, + ): + 
super(Encoder, self).__init__() + self.n_encoders = n_encoders + self.bn = nn.BatchNorm2d(in_channels, momentum=momentum) + self.layers = nn.ModuleList() + self.latent_channels = [] + for i in range(self.n_encoders): + self.layers.append( + ResEncoderBlock( + in_channels, out_channels, kernel_size, n_blocks, momentum=momentum + ) + ) + self.latent_channels.append([out_channels, in_size]) + in_channels = out_channels + out_channels *= 2 + in_size //= 2 + self.out_size = in_size + self.out_channel = out_channels + + def forward(self, x): + concat_tensors = [] + x = self.bn(x) + for i in range(self.n_encoders): + _, x = self.layers[i](x) + concat_tensors.append(_) + return x, concat_tensors + + +class ResEncoderBlock(nn.Module): + def __init__( + self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01 + ): + super(ResEncoderBlock, self).__init__() + self.n_blocks = n_blocks + self.conv = nn.ModuleList() + self.conv.append(ConvBlockRes(in_channels, out_channels, momentum)) + for i in range(n_blocks - 1): + self.conv.append(ConvBlockRes(out_channels, out_channels, momentum)) + self.kernel_size = kernel_size + if self.kernel_size is not None: + self.pool = nn.AvgPool2d(kernel_size=kernel_size) + + def forward(self, x): + for i in range(self.n_blocks): + x = self.conv[i](x) + if self.kernel_size is not None: + return x, self.pool(x) + else: + return x + + +class Intermediate(nn.Module): # + def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01): + super(Intermediate, self).__init__() + self.n_inters = n_inters + self.layers = nn.ModuleList() + self.layers.append( + ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum) + ) + for i in range(self.n_inters - 1): + self.layers.append( + ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum) + ) + + def forward(self, x): + for i in range(self.n_inters): + x = self.layers[i](x) + return x + + +class ResDecoderBlock(nn.Module): + def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01): + super(ResDecoderBlock, self).__init__() + out_padding = (0, 1) if stride == (1, 2) else (1, 1) + self.n_blocks = n_blocks + self.conv1 = nn.Sequential( + nn.ConvTranspose2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=stride, + padding=(1, 1), + output_padding=out_padding, + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + ) + self.conv2 = nn.ModuleList() + self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum)) + for i in range(n_blocks - 1): + self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum)) + + def forward(self, x, concat_tensor): + x = self.conv1(x) + x = torch.cat((x, concat_tensor), dim=1) + for i in range(self.n_blocks): + x = self.conv2[i](x) + return x + + +class Decoder(nn.Module): + def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01): + super(Decoder, self).__init__() + self.layers = nn.ModuleList() + self.n_decoders = n_decoders + for i in range(self.n_decoders): + out_channels = in_channels // 2 + self.layers.append( + ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum) + ) + in_channels = out_channels + + def forward(self, x, concat_tensors): + for i in range(self.n_decoders): + x = self.layers[i](x, concat_tensors[-1 - i]) + return x + + +class DeepUnet(nn.Module): + def __init__( + self, + kernel_size, + n_blocks, + en_de_layers=5, + inter_layers=4, + in_channels=1, + en_out_channels=16, + ): + 
super(DeepUnet, self).__init__() + self.encoder = Encoder( + in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels + ) + self.intermediate = Intermediate( + self.encoder.out_channel // 2, + self.encoder.out_channel, + inter_layers, + n_blocks, + ) + self.decoder = Decoder( + self.encoder.out_channel, en_de_layers, kernel_size, n_blocks + ) + + def forward(self, x): + x, concat_tensors = self.encoder(x) + x = self.intermediate(x) + x = self.decoder(x, concat_tensors) + return x + + +class E2E(nn.Module): + def __init__( + self, + n_blocks, + n_gru, + kernel_size, + en_de_layers=5, + inter_layers=4, + in_channels=1, + en_out_channels=16, + ): + super(E2E, self).__init__() + self.unet = DeepUnet( + kernel_size, + n_blocks, + en_de_layers, + inter_layers, + in_channels, + en_out_channels, + ) + self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1)) + if n_gru: + self.fc = nn.Sequential( + BiGRU(3 * 128, 256, n_gru), + nn.Linear(512, 360), + nn.Dropout(0.25), + nn.Sigmoid(), + ) + else: + self.fc = nn.Sequential( + nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid() + ) + + def forward(self, mel): + mel = mel.transpose(-1, -2).unsqueeze(1) + x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2) + x = self.fc(x) + return x + + +from librosa.filters import mel + + +class MelSpectrogram(torch.nn.Module): + def __init__( + self, + is_half, + n_mel_channels, + sampling_rate, + win_length, + hop_length, + n_fft=None, + mel_fmin=0, + mel_fmax=None, + clamp=1e-5, + ): + super().__init__() + n_fft = win_length if n_fft is None else n_fft + self.hann_window = {} + mel_basis = mel( + sr=sampling_rate, + n_fft=n_fft, + n_mels=n_mel_channels, + fmin=mel_fmin, + fmax=mel_fmax, + htk=True, + ) + mel_basis = torch.from_numpy(mel_basis).float() + self.register_buffer("mel_basis", mel_basis) + self.n_fft = win_length if n_fft is None else n_fft + self.hop_length = hop_length + self.win_length = win_length + self.sampling_rate = sampling_rate + self.n_mel_channels = n_mel_channels + self.clamp = clamp + self.is_half = is_half + + def forward(self, audio, keyshift=0, speed=1, center=True): + factor = 2 ** (keyshift / 12) + n_fft_new = int(np.round(self.n_fft * factor)) + win_length_new = int(np.round(self.win_length * factor)) + hop_length_new = int(np.round(self.hop_length * speed)) + keyshift_key = str(keyshift) + "_" + str(audio.device) + if keyshift_key not in self.hann_window: + self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to( + audio.device + ) + fft = torch.stft( + audio, + n_fft=n_fft_new, + hop_length=hop_length_new, + win_length=win_length_new, + window=self.hann_window[keyshift_key], + center=center, + return_complex=True, + ) + magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2)) + if keyshift != 0: + size = self.n_fft // 2 + 1 + resize = magnitude.size(1) + if resize < size: + magnitude = F.pad(magnitude, (0, 0, 0, size - resize)) + magnitude = magnitude[:, :size, :] * self.win_length / win_length_new + mel_output = torch.matmul(self.mel_basis, magnitude) + if self.is_half == True: + mel_output = mel_output.half() + log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp)) + return log_mel_spec + + +class RMVPE: + def __init__(self, model_path, is_half, device=None): + self.resample_kernel = {} + model = E2E(4, 1, (2, 2)) + ckpt = torch.load(model_path, map_location="cpu") + model.load_state_dict(ckpt) + model.eval() + if is_half == True: + model = model.half() + self.model = model + self.resample_kernel = {} + self.is_half = 
is_half + if device is None: + device = "cuda" if torch.cuda.is_available() else "cpu" + self.device = device + self.mel_extractor = MelSpectrogram( + is_half, 128, 16000, 1024, 160, None, 30, 8000 + ).to(device) + self.model = self.model.to(device) + cents_mapping = 20 * np.arange(360) + 1997.3794084376191 + self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368 + + def mel2hidden(self, mel): + with torch.no_grad(): + n_frames = mel.shape[-1] + mel = F.pad( + mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect" + ) + hidden = self.model(mel) + return hidden[:, :n_frames] + + def decode(self, hidden, thred=0.03): + cents_pred = self.to_local_average_cents(hidden, thred=thred) + f0 = 10 * (2 ** (cents_pred / 1200)) + f0[f0 == 10] = 0 + # f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred]) + return f0 + + def infer_from_audio(self, audio, thred=0.03): + audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0) + # torch.cuda.synchronize() + # t0=ttime() + mel = self.mel_extractor(audio, center=True) + # torch.cuda.synchronize() + # t1=ttime() + hidden = self.mel2hidden(mel) + # torch.cuda.synchronize() + # t2=ttime() + hidden = hidden.squeeze(0).cpu().numpy() + if self.is_half == True: + hidden = hidden.astype("float32") + f0 = self.decode(hidden, thred=thred) + # torch.cuda.synchronize() + # t3=ttime() + # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0)) + return f0 + + def to_local_average_cents(self, salience, thred=0.05): + # t0 = ttime() + center = np.argmax(salience, axis=1) # 帧长#index + salience = np.pad(salience, ((0, 0), (4, 4))) # 帧长,368 + # t1 = ttime() + center += 4 + todo_salience = [] + todo_cents_mapping = [] + starts = center - 4 + ends = center + 5 + for idx in range(salience.shape[0]): + todo_salience.append(salience[:, starts[idx] : ends[idx]][idx]) + todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]]) + # t2 = ttime() + todo_salience = np.array(todo_salience) # 帧长,9 + todo_cents_mapping = np.array(todo_cents_mapping) # 帧长,9 + product_sum = np.sum(todo_salience * todo_cents_mapping, 1) + weight_sum = np.sum(todo_salience, 1) # 帧长 + devided = product_sum / weight_sum # 帧长 + # t3 = ttime() + maxx = np.max(salience, axis=1) # 帧长 + devided[maxx <= thred] = 0 + # t4 = ttime() + # print("decode:%s\t%s\t%s\t%s" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3)) + return devided + + +# if __name__ == '__main__': +# audio, sampling_rate = sf.read("卢本伟语录~1.wav") +# if len(audio.shape) > 1: +# audio = librosa.to_mono(audio.transpose(1, 0)) +# audio_bak = audio.copy() +# if sampling_rate != 16000: +# audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000) +# model_path = "/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/test-RMVPE/weights/rmvpe_llc_half.pt" +# thred = 0.03 # 0.01 +# device = 'cuda' if torch.cuda.is_available() else 'cpu' +# rmvpe = RMVPE(model_path,is_half=False, device=device) +# t0=ttime() +# f0 = rmvpe.infer_from_audio(audio, thred=thred) +# f0 = rmvpe.infer_from_audio(audio, thred=thred) +# f0 = rmvpe.infer_from_audio(audio, thred=thred) +# f0 = rmvpe.infer_from_audio(audio, thred=thred) +# f0 = rmvpe.infer_from_audio(audio, thred=thred) +# t1=ttime() +# print(f0.shape,t1-t0) diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..9f7186e --- /dev/null +++ b/run.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +if [[ "$(uname)" == "Darwin" ]]; then + # macOS specific env: + export PYTORCH_ENABLE_MPS_FALLBACK=1 + export PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 +elif [[ 
"$(uname)" != "Linux" ]]; then + echo "Unsupported operating system." + exit 1 +fi + +requirements_file="requirements.txt" + +# Check if Python 3.8 is installed +if ! command -v python3.8 &> /dev/null; then + echo "Python 3.8 not found. Attempting to install..." + if [[ "$(uname)" == "Darwin" ]] && command -v brew &> /dev/null; then + brew install python@3.8 + elif [[ "$(uname)" == "Linux" ]] && command -v apt-get &> /dev/null; then + sudo apt-get update + sudo apt-get install python3.8 + else + echo "Please install Python 3.8 manually." + exit 1 + fi +fi + +# Check if required packages are installed and install them if not +if [ -f "${requirements_file}" ]; then + installed_packages=$(python3.8 -m pip freeze) + while IFS= read -r package; do + [[ "${package}" =~ ^#.* ]] && continue + package_name=$(echo "${package}" | sed 's/[<>=!].*//') + if ! echo "${installed_packages}" | grep -q "${package_name}"; then + echo "${package_name} not found. Attempting to install..." + python3.8 -m pip install --upgrade "${package}" + fi + done < "${requirements_file}" +else + echo "${requirements_file} not found. Please ensure the requirements file with required packages exists." + exit 1 +fi + +# Run the main script +python3.8 infer-web.py --pycmd python3.8 \ No newline at end of file diff --git a/rvc_for_realtime.py b/rvc_for_realtime.py new file mode 100644 index 0000000..4d62861 --- /dev/null +++ b/rvc_for_realtime.py @@ -0,0 +1,297 @@ +import faiss, torch, traceback, parselmouth, numpy as np, torchcrepe, torch.nn as nn, pyworld +from fairseq import checkpoint_utils +from lib.infer_pack.models import ( + SynthesizerTrnMs256NSFsid, + SynthesizerTrnMs256NSFsid_nono, + SynthesizerTrnMs768NSFsid, + SynthesizerTrnMs768NSFsid_nono, +) +import os, sys +from time import time as ttime +import torch.nn.functional as F +import scipy.signal as signal + +now_dir = os.getcwd() +sys.path.append(now_dir) +from config import Config +from multiprocessing import Manager as M + +mm = M() +config = Config() + + +class RVC: + def __init__( + self, key, pth_path, index_path, index_rate, n_cpu, inp_q, opt_q, device + ) -> None: + """ + 初始化 + """ + try: + global config + self.inp_q = inp_q + self.opt_q = opt_q + self.device = device + self.f0_up_key = key + self.time_step = 160 / 16000 * 1000 + self.f0_min = 50 + self.f0_max = 1100 + self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) + self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) + self.sr = 16000 + self.window = 160 + self.n_cpu = n_cpu + if index_rate != 0: + self.index = faiss.read_index(index_path) + self.big_npy = self.index.reconstruct_n(0, self.index.ntotal) + print("index search enabled") + self.index_rate = index_rate + models, _, _ = checkpoint_utils.load_model_ensemble_and_task( + ["hubert_base.pt"], + suffix="", + ) + hubert_model = models[0] + hubert_model = hubert_model.to(config.device) + if config.is_half: + hubert_model = hubert_model.half() + else: + hubert_model = hubert_model.float() + hubert_model.eval() + self.model = hubert_model + cpt = torch.load(pth_path, map_location="cpu") + self.tgt_sr = cpt["config"][-1] + cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] + self.if_f0 = cpt.get("f0", 1) + self.version = cpt.get("version", "v1") + if self.version == "v1": + if self.if_f0 == 1: + self.net_g = SynthesizerTrnMs256NSFsid( + *cpt["config"], is_half=config.is_half + ) + else: + self.net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) + elif self.version == "v2": + if self.if_f0 == 1: + self.net_g = SynthesizerTrnMs768NSFsid( + 
*cpt["config"], is_half=config.is_half + ) + else: + self.net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) + del self.net_g.enc_q + print(self.net_g.load_state_dict(cpt["weight"], strict=False)) + self.net_g.eval().to(device) + if config.is_half: + self.net_g = self.net_g.half() + else: + self.net_g = self.net_g.float() + self.is_half = config.is_half + except: + print(traceback.format_exc()) + + def get_f0_post(self, f0): + f0_min = self.f0_min + f0_max = self.f0_max + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + f0bak = f0.copy() + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( + f0_mel_max - f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + f0_coarse = np.rint(f0_mel).astype(np.int) + return f0_coarse, f0bak + + def get_f0(self, x, f0_up_key, n_cpu, method="harvest"): + n_cpu = int(n_cpu) + if method == "crepe": + return self.get_f0_crepe(x, f0_up_key) + if method == "rmvpe": + return self.get_f0_rmvpe(x, f0_up_key) + if method == "pm": + p_len = x.shape[0] // 160 + f0 = ( + parselmouth.Sound(x, 16000) + .to_pitch_ac( + time_step=0.01, + voicing_threshold=0.6, + pitch_floor=50, + pitch_ceiling=1100, + ) + .selected_array["frequency"] + ) + + pad_size = (p_len - len(f0) + 1) // 2 + if pad_size > 0 or p_len - len(f0) - pad_size > 0: + print(pad_size, p_len - len(f0) - pad_size) + f0 = np.pad( + f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" + ) + + f0 *= pow(2, f0_up_key / 12) + return self.get_f0_post(f0) + if n_cpu == 1: + f0, t = pyworld.harvest( + x.astype(np.double), + fs=16000, + f0_ceil=1100, + f0_floor=50, + frame_period=10, + ) + f0 = signal.medfilt(f0, 3) + f0 *= pow(2, f0_up_key / 12) + return self.get_f0_post(f0) + f0bak = np.zeros(x.shape[0] // 160, dtype=np.float64) + length = len(x) + part_length = int(length / n_cpu / 160) * 160 + ts = ttime() + res_f0 = mm.dict() + for idx in range(n_cpu): + tail = part_length * (idx + 1) + 320 + if idx == 0: + self.inp_q.put((idx, x[:tail], res_f0, n_cpu, ts)) + else: + self.inp_q.put( + (idx, x[part_length * idx - 320 : tail], res_f0, n_cpu, ts) + ) + while 1: + res_ts = self.opt_q.get() + if res_ts == ts: + break + f0s = [i[1] for i in sorted(res_f0.items(), key=lambda x: x[0])] + for idx, f0 in enumerate(f0s): + if idx == 0: + f0 = f0[:-3] + elif idx != n_cpu - 1: + f0 = f0[2:-3] + else: + f0 = f0[2:-1] + f0bak[ + part_length * idx // 160 : part_length * idx // 160 + f0.shape[0] + ] = f0 + f0bak = signal.medfilt(f0bak, 3) + f0bak *= pow(2, f0_up_key / 12) + return self.get_f0_post(f0bak) + + def get_f0_crepe(self, x, f0_up_key): + audio = torch.tensor(np.copy(x))[None].float() + f0, pd = torchcrepe.predict( + audio, + self.sr, + 160, + self.f0_min, + self.f0_max, + "full", + batch_size=512, + device=self.device, + return_periodicity=True, + ) + pd = torchcrepe.filter.median(pd, 3) + f0 = torchcrepe.filter.mean(f0, 3) + f0[pd < 0.1] = 0 + f0 = f0[0].cpu().numpy() + f0 *= pow(2, f0_up_key / 12) + return self.get_f0_post(f0) + + def get_f0_rmvpe(self, x, f0_up_key): + if hasattr(self, "model_rmvpe") == False: + from rmvpe import RMVPE + + print("loading rmvpe model") + self.model_rmvpe = RMVPE( + "rmvpe.pt", is_half=self.is_half, device=self.device + ) + # self.model_rmvpe = RMVPE("aug2_58000_half.pt", is_half=self.is_half, device=self.device) + f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + f0 *= pow(2, f0_up_key / 12) + return self.get_f0_post(f0) + + def infer( + self, + 
feats: torch.Tensor, + indata: np.ndarray, + rate1, + rate2, + cache_pitch, + cache_pitchf, + f0method, + ) -> np.ndarray: + feats = feats.view(1, -1) + if config.is_half: + feats = feats.half() + else: + feats = feats.float() + feats = feats.to(self.device) + t1 = ttime() + with torch.no_grad(): + padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) + inputs = { + "source": feats, + "padding_mask": padding_mask, + "output_layer": 9 if self.version == "v1" else 12, + } + logits = self.model.extract_features(**inputs) + feats = ( + self.model.final_proj(logits[0]) if self.version == "v1" else logits[0] + ) + t2 = ttime() + try: + if hasattr(self, "index") and self.index_rate != 0: + leng_replace_head = int(rate1 * feats[0].shape[0]) + npy = feats[0][-leng_replace_head:].cpu().numpy().astype("float32") + score, ix = self.index.search(npy, k=8) + weight = np.square(1 / score) + weight /= weight.sum(axis=1, keepdims=True) + npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) + if config.is_half: + npy = npy.astype("float16") + feats[0][-leng_replace_head:] = ( + torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate + + (1 - self.index_rate) * feats[0][-leng_replace_head:] + ) + else: + print("index search FAIL or disabled") + except: + traceback.print_exc() + print("index search FAIL") + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) + t3 = ttime() + if self.if_f0 == 1: + pitch, pitchf = self.get_f0(indata, self.f0_up_key, self.n_cpu, f0method) + cache_pitch[:] = np.append(cache_pitch[pitch[:-1].shape[0] :], pitch[:-1]) + cache_pitchf[:] = np.append( + cache_pitchf[pitchf[:-1].shape[0] :], pitchf[:-1] + ) + p_len = min(feats.shape[1], 13000, cache_pitch.shape[0]) + else: + cache_pitch, cache_pitchf = None, None + p_len = min(feats.shape[1], 13000) + t4 = ttime() + feats = feats[:, :p_len, :] + if self.if_f0 == 1: + cache_pitch = cache_pitch[:p_len] + cache_pitchf = cache_pitchf[:p_len] + cache_pitch = torch.LongTensor(cache_pitch).unsqueeze(0).to(self.device) + cache_pitchf = torch.FloatTensor(cache_pitchf).unsqueeze(0).to(self.device) + p_len = torch.LongTensor([p_len]).to(self.device) + ii = 0 # sid + sid = torch.LongTensor([ii]).to(self.device) + with torch.no_grad(): + if self.if_f0 == 1: + infered_audio = ( + self.net_g.infer( + feats, p_len, cache_pitch, cache_pitchf, sid, rate2 + )[0][0, 0] + .data.cpu() + .float() + ) + else: + infered_audio = ( + self.net_g.infer(feats, p_len, sid, rate2)[0][0, 0] + .data.cpu() + .float() + ) + t5 = ttime() + print("time->fea-index-f0-model:", t2 - t1, t3 - t2, t4 - t3, t5 - t4) + return infered_audio diff --git a/envfilescheck.bat b/tools/dlmodels.bat similarity index 100% rename from envfilescheck.bat rename to tools/dlmodels.bat diff --git a/tools/dlmodels.sh b/tools/dlmodels.sh new file mode 100755 index 0000000..0ae7f7e --- /dev/null +++ b/tools/dlmodels.sh @@ -0,0 +1,546 @@ +#!/bin/bash + +echo working dir is $(pwd) +echo downloading requirement aria2 check. + +if command -v aria2c &> /dev/null +then + echo "aria2c command found" +else + echo failed. 
please install aria2 + sleep 5 + exit 1 +fi + +d32="f0D32k.pth" +d40="f0D40k.pth" +d48="f0D48k.pth" +g32="f0G32k.pth" +g40="f0G40k.pth" +g48="f0G48k.pth" + +d40v2="f0D40k.pth" +g40v2="f0G40k.pth" + +dld32="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D32k.pth" +dld40="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D40k.pth" +dld48="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D48k.pth" +dlg32="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G32k.pth" +dlg40="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G40k.pth" +dlg48="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G48k.pth" + +dld40v2="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D40k.pth" +dlg40v2="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G40k.pth" + +hp2_all="HP2_all_vocals.pth" +hp3_all="HP3_all_vocals.pth" +hp5_only="HP5_only_main_vocal.pth" +VR_DeEchoAggressive="VR-DeEchoAggressive.pth" +VR_DeEchoDeReverb="VR-DeEchoDeReverb.pth" +VR_DeEchoNormal="VR-DeEchoNormal.pth" +onnx_dereverb="vocals.onnx" + +dlhp2_all="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2_all_vocals.pth" +dlhp3_all="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP3_all_vocals.pth" +dlhp5_only="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5_only_main_vocal.pth" +dlVR_DeEchoAggressive="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoAggressive.pth" +dlVR_DeEchoDeReverb="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoDeReverb.pth" +dlVR_DeEchoNormal="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoNormal.pth" +dlonnx_dereverb="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx" + +hb="hubert_base.pt" + +dlhb="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt" + +echo dir check start. + +if [ -d "./pretrained" ]; then + echo dir ./pretrained checked. +else + echo failed. generating dir ./pretrained. + mkdir pretrained +fi + +if [ -d "./pretrained_v2" ]; then + echo dir ./pretrained_v2 checked. +else + echo failed. generating dir ./pretrained_v2. + mkdir pretrained_v2 +fi + +if [ -d "./uvr5_weights" ]; then + echo dir ./uvr5_weights checked. +else + echo failed. generating dir ./uvr5_weights. + mkdir uvr5_weights +fi + +if [ -d "./uvr5_weights/onnx_dereverb_By_FoxJoy" ]; then + echo dir ./uvr5_weights/onnx_dereverb_By_FoxJoy checked. +else + echo failed. generating dir ./uvr5_weights/onnx_dereverb_By_FoxJoy. + mkdir uvr5_weights/onnx_dereverb_By_FoxJoy +fi + +echo dir check finished. + +echo required files check start. + +echo checking D32k.pth +if [ -f "./pretrained/D32k.pth" ]; then + echo D32k.pth in ./pretrained checked. +else + echo failed. starting download from huggingface. + if command -v aria2c &> /dev/null; then + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D32k.pth -d ./pretrained -o D32k.pth + if [ -f "./pretrained/D32k.pth" ]; then + echo download successful. + else + echo please try again! + exit 1 + fi + else + echo aria2c command not found. Please install aria2c and try again. 
+ exit 1 + fi +fi + +echo checking D40k.pth +if [ -f "./pretrained/D40k.pth" ]; then + echo D40k.pth in ./pretrained checked. +else + echo failed. starting download from huggingface. + if command -v aria2c &> /dev/null; then + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D40k.pth -d ./pretrained -o D40k.pth + if [ -f "./pretrained/D40k.pth" ]; then + echo download successful. + else + echo please try again! + exit 1 + fi + else + echo aria2c command not found. Please install aria2c and try again. + exit 1 + fi +fi + +echo checking D40k.pth +if [ -f "./pretrained_v2/D40k.pth" ]; then + echo D40k.pth in ./pretrained_v2 checked. +else + echo failed. starting download from huggingface. + if command -v aria2c &> /dev/null; then + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D40k.pth -d ./pretrained_v2 -o D40k.pth + if [ -f "./pretrained_v2/D40k.pth" ]; then + echo download successful. + else + echo please try again! + exit 1 + fi + else + echo aria2c command not found. Please install aria2c and try again. + exit 1 + fi +fi + +echo checking D48k.pth +if [ -f "./pretrained/D48k.pth" ]; then + echo D48k.pth in ./pretrained checked. +else + echo failed. starting download from huggingface. + if command -v aria2c &> /dev/null; then + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D48k.pth -d ./pretrained -o D48k.pth + if [ -f "./pretrained/D48k.pth" ]; then + echo download successful. + else + echo please try again! + exit 1 + fi + else + echo aria2c command not found. Please install aria2c and try again. + exit 1 + fi +fi + +echo checking G32k.pth +if [ -f "./pretrained/G32k.pth" ]; then + echo G32k.pth in ./pretrained checked. +else + echo failed. starting download from huggingface. + if command -v aria2c &> /dev/null; then + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G32k.pth -d ./pretrained -o G32k.pth + if [ -f "./pretrained/G32k.pth" ]; then + echo download successful. + else + echo please try again! + exit 1 + fi + else + echo aria2c command not found. Please install aria2c and try again. + exit 1 + fi +fi + +echo checking G40k.pth +if [ -f "./pretrained/G40k.pth" ]; then + echo G40k.pth in ./pretrained checked. +else + echo failed. starting download from huggingface. + if command -v aria2c &> /dev/null; then + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G40k.pth -d ./pretrained -o G40k.pth + if [ -f "./pretrained/G40k.pth" ]; then + echo download successful. + else + echo please try again! + exit 1 + fi + else + echo aria2c command not found. Please install aria2c and try again. + exit 1 + fi +fi + +echo checking G40k.pth +if [ -f "./pretrained_v2/G40k.pth" ]; then + echo G40k.pth in ./pretrained_v2 checked. +else + echo failed. starting download from huggingface. + if command -v aria2c &> /dev/null; then + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G40k.pth -d ./pretrained_v2 -o G40k.pth + if [ -f "./pretrained_v2/G40k.pth" ]; then + echo download successful. + else + echo please try again! + exit 1 + fi + else + echo aria2c command not found. Please install aria2c and try again. 
+ exit 1 + fi +fi + +echo checking G48k.pth +if [ -f "./pretrained/G48k.pth" ]; then + echo G48k.pth in ./pretrained checked. +else + echo failed. starting download from huggingface. + if command -v aria2c &> /dev/null; then + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G48k.pth -d ./pretrained -o G48k.pth + if [ -f "./pretrained/G48k.pth" ]; then + echo download successful. + else + echo please try again! + exit 1 + fi + else + echo aria2c command not found. Please install aria2c and try again. + exit 1 + fi +fi + +echo checking $d32 +if [ -f "./pretrained/$d32" ]; then + echo $d32 in ./pretrained checked. +else + echo failed. starting download from huggingface. + if command -v aria2c &> /dev/null; then + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dld32 -d ./pretrained -o $d32 + if [ -f "./pretrained/$d32" ]; then + echo download successful. + else + echo please try again! + exit 1 + fi + else + echo aria2c command not found. Please install aria2c and try again. + exit 1 + fi +fi + +echo checking $d40 +if [ -f "./pretrained/$d40" ]; then + echo $d40 in ./pretrained checked. +else + echo failed. starting download from huggingface. + if command -v aria2c &> /dev/null; then + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dld40 -d ./pretrained -o $d40 + if [ -f "./pretrained/$d40" ]; then + echo download successful. + else + echo please try again! + exit 1 + fi + else + echo aria2c command not found. Please install aria2c and try again. + exit 1 + fi +fi + +echo checking $d40v2 +if [ -f "./pretrained_v2/$d40v2" ]; then + echo $d40v2 in ./pretrained_v2 checked. +else + echo failed. starting download from huggingface. + if command -v aria2c &> /dev/null; then + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dld40v2 -d ./pretrained_v2 -o $d40v2 + if [ -f "./pretrained_v2/$d40v2" ]; then + echo download successful. + else + echo please try again! + exit 1 + fi + else + echo aria2c command not found. Please install aria2c and try again. + exit 1 + fi +fi + +echo checking $d48 +if [ -f "./pretrained/$d48" ]; then + echo $d48 in ./pretrained checked. +else + echo failed. starting download from huggingface. + if command -v aria2c &> /dev/null; then + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dld48 -d ./pretrained -o $d48 + if [ -f "./pretrained/$d48" ]; then + echo download successful. + else + echo please try again! + exit 1 + fi + else + echo aria2c command not found. Please install aria2c and try again. + exit 1 + fi +fi + +echo checking $g32 +if [ -f "./pretrained/$g32" ]; then + echo $g32 in ./pretrained checked. +else + echo failed. starting download from huggingface. + if command -v aria2c &> /dev/null; then + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlg32 -d ./pretrained -o $g32 + if [ -f "./pretrained/$g32" ]; then + echo download successful. + else + echo please try again! + exit 1 + fi + else + echo aria2c command not found. Please install aria2c and try again. + exit 1 + fi +fi + +echo checking $g40 +if [ -f "./pretrained/$g40" ]; then + echo $g40 in ./pretrained checked. +else + echo failed. starting download from huggingface. + if command -v aria2c &> /dev/null; then + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlg40 -d ./pretrained -o $g40 + if [ -f "./pretrained/$g40" ]; then + echo download successful. + else + echo please try again! + exit 1 + fi + else + echo aria2c command not found. Please install aria2c and try again. 
+ exit 1 + fi +fi + +echo checking $g40v2 +if [ -f "./pretrained_v2/$g40v2" ]; then + echo $g40v2 in ./pretrained_v2 checked. +else + echo failed. starting download from huggingface. + if command -v aria2c &> /dev/null; then + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlg40v2 -d ./pretrained_v2 -o $g40v2 + if [ -f "./pretrained_v2/$g40v2" ]; then + echo download successful. + else + echo please try again! + exit 1 + fi + else + echo aria2c command not found. Please install aria2c and try again. + exit 1 + fi +fi + +echo checking $g48 +if [ -f "./pretrained/$g48" ]; then + echo $g48 in ./pretrained checked. +else + echo failed. starting download from huggingface. + if command -v aria2c &> /dev/null; then + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlg48 -d ./pretrained -o $g48 + if [ -f "./pretrained/$g48" ]; then + echo download successful. + else + echo please try again! + exit 1 + fi + else + echo aria2c command not found. Please install aria2c and try again. + exit 1 + fi +fi + +echo checking $hp2_all +if [ -f "./uvr5_weights/$hp2_all" ]; then + echo $hp2_all in ./uvr5_weights checked. +else + echo failed. starting download from huggingface. + if command -v aria2c &> /dev/null; then + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlhp2_all -d ./uvr5_weights -o $hp2_all + if [ -f "./uvr5_weights/$hp2_all" ]; then + echo download successful. + else + echo please try again! + exit 1 + fi + else + echo aria2c command not found. Please install aria2c and try again. + exit 1 + fi +fi + +echo checking $hp3_all +if [ -f "./uvr5_weights/$hp3_all" ]; then + echo $hp3_all in ./uvr5_weights checked. +else + echo failed. starting download from huggingface. + if command -v aria2c &> /dev/null; then + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlhp3_all -d ./uvr5_weights -o $hp3_all + if [ -f "./uvr5_weights/$hp3_all" ]; then + echo download successful. + else + echo please try again! + exit 1 + fi + else + echo aria2c command not found. Please install aria2c and try again. + exit 1 + fi +fi + +echo checking $hp5_only +if [ -f "./uvr5_weights/$hp5_only" ]; then + echo $hp5_only in ./uvr5_weights checked. +else + echo failed. starting download from huggingface. + if command -v aria2c &> /dev/null; then + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlhp5_only -d ./uvr5_weights -o $hp5_only + if [ -f "./uvr5_weights/$hp5_only" ]; then + echo download successful. + else + echo please try again! + exit 1 + fi + else + echo aria2c command not found. Please install aria2c and try again. + exit 1 + fi +fi + +echo checking $VR_DeEchoAggressive +if [ -f "./uvr5_weights/$VR_DeEchoAggressive" ]; then + echo $VR_DeEchoAggressive in ./uvr5_weights checked. +else + echo failed. starting download from huggingface. + if command -v aria2c &> /dev/null; then + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlVR_DeEchoAggressive -d ./uvr5_weights -o $VR_DeEchoAggressive + if [ -f "./uvr5_weights/$VR_DeEchoAggressive" ]; then + echo download successful. + else + echo please try again! + exit 1 + fi + else + echo aria2c command not found. Please install aria2c and try again. + exit 1 + fi +fi + +echo checking $VR_DeEchoDeReverb +if [ -f "./uvr5_weights/$VR_DeEchoDeReverb" ]; then + echo $VR_DeEchoDeReverb in ./uvr5_weights checked. +else + echo failed. starting download from huggingface. 
+ if command -v aria2c &> /dev/null; then + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlVR_DeEchoDeReverb -d ./uvr5_weights -o $VR_DeEchoDeReverb + if [ -f "./uvr5_weights/$VR_DeEchoDeReverb" ]; then + echo download successful. + else + echo please try again! + exit 1 + fi + else + echo aria2c command not found. Please install aria2c and try again. + exit 1 + fi +fi + +echo checking $VR_DeEchoNormal +if [ -f "./uvr5_weights/$VR_DeEchoNormal" ]; then + echo $VR_DeEchoNormal in ./uvr5_weights checked. +else + echo failed. starting download from huggingface. + if command -v aria2c &> /dev/null; then + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlVR_DeEchoNormal -d ./uvr5_weights -o $VR_DeEchoNormal + if [ -f "./uvr5_weights/$VR_DeEchoNormal" ]; then + echo download successful. + else + echo please try again! + exit 1 + fi + else + echo aria2c command not found. Please install aria2c and try again. + exit 1 + fi +fi + +echo checking $onnx_dereverb +if [ -f "./uvr5_weights/onnx_dereverb_By_FoxJoy/$onnx_dereverb" ]; then + echo $onnx_dereverb in ./uvr5_weights/onnx_dereverb_By_FoxJoy checked. +else + echo failed. starting download from huggingface. + if command -v aria2c &> /dev/null; then + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlonnx_dereverb -d ./uvr5_weights/onnx_dereverb_By_FoxJoy -o $onnx_dereverb + if [ -f "./uvr5_weights/onnx_dereverb_By_FoxJoy/$onnx_dereverb" ]; then + echo download successful. + else + echo please try again! + exit 1 + fi + else + echo aria2c command not found. Please install aria2c and try again. + exit 1 + fi +fi + +echo checking $hb +if [ -f "./pretrained/$hb" ]; then + echo $hb in ./pretrained checked. +else + echo failed. starting download from huggingface. + if command -v aria2c &> /dev/null; then + aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlhb -d ./ -o $hb + if [ -f "./$hb" ]; then + echo download successful. + else + echo please try again! + exit 1 + fi + else + echo aria2c command not found. Please install aria2c and try again. + exit 1 + fi +fi + +echo required files check finished. +read -p "Press any key to continue..." 
-n1 -s diff --git a/export_onnx.py b/tools/export_onnx.py similarity index 94% rename from export_onnx.py rename to tools/export_onnx.py index 34938fe..2d334a6 100644 --- a/export_onnx.py +++ b/tools/export_onnx.py @@ -1,4 +1,4 @@ -from infer_pack.models_onnx import SynthesizerTrnMsNSFsidM +from lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM import torch if __name__ == "__main__": diff --git a/infer/infer-pm-index256.py b/tools/infer/infer-pm-index256.py similarity index 93% rename from infer/infer-pm-index256.py rename to tools/infer/infer-pm-index256.py index 5060345..d182e20 100644 --- a/infer/infer-pm-index256.py +++ b/tools/infer/infer-pm-index256.py @@ -9,12 +9,12 @@ import numpy as np import soundfile as sf # from models import SynthesizerTrn256#hifigan_nonsf -# from infer_pack.models import SynthesizerTrn256NSF as SynthesizerTrn256#hifigan_nsf -from infer_pack.models import ( +# from lib.infer_pack.models import SynthesizerTrn256NSF as SynthesizerTrn256#hifigan_nsf +from lib.infer_pack.models import ( SynthesizerTrnMs256NSFsid as SynthesizerTrn256, ) # hifigan_nsf -# from infer_pack.models import SynthesizerTrnMs256NSFsid_sim as SynthesizerTrn256#hifigan_nsf +# from lib.infer_pack.models import SynthesizerTrnMs256NSFsid_sim as SynthesizerTrn256#hifigan_nsf # from models import SynthesizerTrn256NSFsim as SynthesizerTrn256#hifigan_nsf # from models import SynthesizerTrn256NSFsimFlow as SynthesizerTrn256#hifigan_nsf diff --git a/infer/train-index -v2.py b/tools/infer/train-index-v2.py similarity index 54% rename from infer/train-index -v2.py rename to tools/infer/train-index-v2.py index 656a5a6..77dfa0b 100644 --- a/infer/train-index -v2.py +++ b/tools/infer/train-index-v2.py @@ -2,9 +2,15 @@ 格式:直接cid为自带的index位;aid放不下了,通过字典来查,反正就5w个 """ import faiss, numpy as np, os +from sklearn.cluster import MiniBatchKMeans +import traceback +from multiprocessing import cpu_count # ###########如果是原始特征要先写save -inp_root = r"./logs/nene/3_feature768" +n_cpu = 0 +if n_cpu == 0: + n_cpu = cpu_count() +inp_root = r"./logs/anz/3_feature768" npys = [] listdir_res = list(os.listdir(inp_root)) for name in sorted(listdir_res): @@ -15,7 +21,27 @@ big_npy_idx = np.arange(big_npy.shape[0]) np.random.shuffle(big_npy_idx) big_npy = big_npy[big_npy_idx] print(big_npy.shape) # (6196072, 192)#fp32#4.43G -np.save("infer/big_src_feature_mi.npy", big_npy) +if big_npy.shape[0] > 2e5: + # if(1): + info = "Trying doing kmeans %s shape to 10k centers." 
% big_npy.shape[0] + print(info) + try: + big_npy = ( + MiniBatchKMeans( + n_clusters=10000, + verbose=True, + batch_size=256 * n_cpu, + compute_labels=False, + init="random", + ) + .fit(big_npy) + .cluster_centers_ + ) + except: + info = traceback.format_exc() + print(info) + +np.save("tools/infer/big_src_feature_mi.npy", big_npy) ##################train+add # big_npy=np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy") @@ -26,13 +52,15 @@ index_ivf = faiss.extract_index_ivf(index) # index_ivf.nprobe = 1 index.train(big_npy) faiss.write_index( - index, "infer/trained_IVF%s_Flat_baseline_src_feat_v2.index" % (n_ivf) + index, "tools/infer/trained_IVF%s_Flat_baseline_src_feat_v2.index" % (n_ivf) ) print("adding") batch_size_add = 8192 for i in range(0, big_npy.shape[0], batch_size_add): index.add(big_npy[i : i + batch_size_add]) -faiss.write_index(index, "infer/added_IVF%s_Flat_mi_baseline_src_feat.index" % (n_ivf)) +faiss.write_index( + index, "tools/infer/added_IVF%s_Flat_mi_baseline_src_feat.index" % (n_ivf) +) """ 大小(都是FP32) big_src_feature 2.95G diff --git a/infer/train-index.py b/tools/infer/train-index.py similarity index 100% rename from infer/train-index.py rename to tools/infer/train-index.py diff --git a/infer/trans_weights.py b/tools/infer/trans_weights.py similarity index 100% rename from infer/trans_weights.py rename to tools/infer/trans_weights.py diff --git a/onnx_inference_demo.py b/tools/onnx_inference_demo.py similarity index 90% rename from onnx_inference_demo.py rename to tools/onnx_inference_demo.py index 38bc882..a4a9490 100644 --- a/onnx_inference_demo.py +++ b/tools/onnx_inference_demo.py @@ -1,5 +1,5 @@ import soundfile -from infer_pack.onnx_inference import OnnxRVC +from ..lib.infer_pack.onnx_inference import OnnxRVC hop_size = 512 sampling_rate = 40000 # 采样率 diff --git a/train/process_ckpt.py b/train/process_ckpt.py index 1535d27..8f9c3d7 100644 --- a/train/process_ckpt.py +++ b/train/process_ckpt.py @@ -92,7 +92,7 @@ def extract_small_model(path, name, sr, if_f0, info, version): 40000, ] elif sr == "48k": - if(version=="v1"): + if version == "v1": opt["config"] = [ 1025, 32, @@ -127,15 +127,15 @@ def extract_small_model(path, name, sr, if_f0, info, version): "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - [12,10,2,2], + [12, 10, 2, 2], 512, - [24,20,4,4], + [24, 20, 4, 4], 109, 256, 48000, ] elif sr == "32k": - if(version=="v1"): + if version == "v1": opt["config"] = [ 513, 32, @@ -170,9 +170,9 @@ def extract_small_model(path, name, sr, if_f0, info, version): "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - [10,8,2,2], + [10, 8, 2, 2], 512, - [20,16,4,4], + [20, 16, 4, 4], 109, 256, 32000, diff --git a/train_nsf_sim_cache_sid_load_pretrain.py b/train_nsf_sim_cache_sid_load_pretrain.py index 2949bc4..3394bdd 100644 --- a/train_nsf_sim_cache_sid_load_pretrain.py +++ b/train_nsf_sim_cache_sid_load_pretrain.py @@ -22,7 +22,7 @@ import torch.multiprocessing as mp import torch.distributed as dist from torch.nn.parallel import DistributedDataParallel as DDP from torch.cuda.amp import autocast, GradScaler -from infer_pack import commons +from lib.infer_pack import commons from time import sleep from time import time as ttime from data_utils import ( @@ -34,13 +34,13 @@ from data_utils import ( ) if hps.version == "v1": - from infer_pack.models import ( + from lib.infer_pack.models import ( SynthesizerTrnMs256NSFsid as RVC_Model_f0, SynthesizerTrnMs256NSFsid_nono as RVC_Model_nof0, MultiPeriodDiscriminator, ) else: - 
from infer_pack.models import ( + from lib.infer_pack.models import ( SynthesizerTrnMs768NSFsid as RVC_Model_f0, SynthesizerTrnMs768NSFsid_nono as RVC_Model_nof0, MultiPeriodDiscriminatorV2 as MultiPeriodDiscriminator, diff --git a/trainset_preprocess_pipeline_print.py b/trainset_preprocess_pipeline_print.py index 20a629d..f16761a 100644 --- a/trainset_preprocess_pipeline_print.py +++ b/trainset_preprocess_pipeline_print.py @@ -41,7 +41,7 @@ class PreProcess: ) self.sr = sr self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr) - self.per = 3.7 + self.per = 3.0 self.overlap = 0.3 self.tail = self.per + self.overlap self.max = 0.9 diff --git a/venv.sh b/venv.sh new file mode 100755 index 0000000..17f58bf --- /dev/null +++ b/venv.sh @@ -0,0 +1 @@ +python3 -m venv .venv
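
For reference, a minimal usage sketch of the new rmvpe.py module added by this patch, following the commented-out example at the bottom of that file: it loads an RMVPE checkpoint and extracts an F0 curve from a mono 16 kHz signal. This is a sketch, not part of the patch; the input filename is hypothetical, and "rmvpe.pt" is the local checkpoint path that rvc_for_realtime.py also assumes (the weights themselves are not shipped in this diff and must be obtained separately).

import librosa
import soundfile as sf
import torch

from rmvpe import RMVPE

# Load any vocal recording and convert it to mono 16 kHz, the rate the model expects.
audio, sr = sf.read("input.wav")  # hypothetical input file
if audio.ndim > 1:
    audio = librosa.to_mono(audio.T)
if sr != 16000:
    audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)

device = "cuda" if torch.cuda.is_available() else "cpu"
# "rmvpe.pt" matches the path used in rvc_for_realtime.py; assumed to exist locally.
rmvpe = RMVPE("rmvpe.pt", is_half=False, device=device)
# One F0 value per 10 ms frame (hop 160 at 16 kHz); frames below the salience threshold come back as 0.
f0 = rmvpe.infer_from_audio(audio, thred=0.03)
print(f0.shape)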