# Voice-Cloning-App/application/utils.py
import logging
import os
import io
from datetime import datetime
import traceback
import shutil
import zipfile

import librosa
from flask import send_file
import resampy  # noqa

from main import socketio
from dataset.audio_processing import convert_audio
from dataset.analysis import save_dataset_info
from dataset.clip_generator import CHARACTER_ENCODING


class SocketIOHandler(logging.Handler):
    """
    Sends logger messages to the frontend using flask-socketio.
    These are handled in application.js
    """

    def emit(self, record):
        text = record.getMessage()
        if text.startswith("Progress"):
            text = text.split("-")[1]
            current, total = text.split("/")
            socketio.emit("progress", {"number": current, "total": total}, namespace="/voice")
        elif text.startswith("Status"):
            socketio.emit("status", {"text": text.replace("Status -", "")}, namespace="/voice")
        elif text.startswith("Alignment"):
            text = text.split("- ")[1]
            iteration, image = text.split(", ")
            socketio.emit("alignment", {"iteration": iteration, "image": image}, namespace="/voice")
        else:
            socketio.emit("logs", {"text": text}, namespace="/voice")


# Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("voice")
logger.addHandler(SocketIOHandler())

thread = None


def background_task(func, **kwargs):
    """
    Runs a background task.
    If the function errors out, it sends an error log to the error logging server and page.
    Sends a 'done' message to the frontend when complete.

    Parameters
    ----------
    func : function
        Function to run in background
    kwargs : kwargs
        Kwargs to pass to function
    """
    try:
        socketio.sleep(5)
        func(logging=logger, **kwargs)
    except Exception as e:
        error = {"type": e.__class__.__name__, "text": str(e), "stacktrace": traceback.format_exc()}
        socketio.emit("error", error, namespace="/voice")
        raise e

    socketio.emit("done", {"text": None}, namespace="/voice")


def start_progress_thread(func, **kwargs):
    """
    Starts a background task using socketio.

    Parameters
    ----------
    func : function
        Function to run in background
    kwargs : kwargs
        Kwargs to pass to function
    """
    global thread
    print("Starting Thread")
    thread = socketio.start_background_task(background_task, func=func, **kwargs)
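

# Example usage (illustrative; `build_dataset` and its kwargs are hypothetical):
#   def build_dataset(logging, output_folder):
#       logging.info("Status - Building dataset")
#
#   start_progress_thread(build_dataset, output_folder="data/my-dataset")
# The kwargs are forwarded to `func`, which also receives this module's `logger`
# as its `logging` argument (see background_task above).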


def serve_file(path, filename, mimetype, as_attachment=True):
    """
    Serves a local file to the browser via an in-memory copy.
    """
    with open(path, "rb") as f:
        return send_file(
            io.BytesIO(f.read()), attachment_filename=filename, mimetype=mimetype, as_attachment=as_attachment
        )
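

# Example usage inside a Flask route (illustrative; the path is hypothetical):
#   return serve_file("data/results/audio.wav", "audio.wav", "audio/wav")
# Note: Flask 2.0 renamed `attachment_filename` to `download_name`, so this call
# targets an older Flask release.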


def get_next_url(urls, path):
    """
    Returns the URL of the next step in the voice cloning process.

    Parameters
    ----------
    urls : dict
        Frontend url paths and names
    path : str
        Current URL

    Returns
    -------
    str
        URL of next step or '' if not found
    """
    urls = list(urls.keys())
    next_url_index = urls.index(path) + 1
    return urls[next_url_index] if next_url_index < len(urls) else ""
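

# Example (illustrative URL map; the real one is defined by the frontend):
#   get_next_url({"/datasets": "Datasets", "/train": "Train", "/synthesize": "Synthesize"}, "/train")
#   returns "/synthesize"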


def get_suffix():
    """
    Generates a filename suffix using the current datetime.

    Returns
    -------
    str
        String suffix
    """
    return datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
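

# Example output: "10-03-2021_15-56-40" (day-month-year_hour-minute-second)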


def delete_folder(path):
    """
    Deletes a folder.

    Parameters
    ----------
    path : str
        Path to folder

    Raises
    ------
    AssertionError
        If folder is not found
    """
    assert os.path.isdir(path), f"{path} does not exist"
    shutil.rmtree(path)
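

# Example (hypothetical path): delete_folder("data/datasets/old-dataset")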


def import_dataset(dataset, dataset_directory, audio_folder, logging):
    """
    Imports a dataset zip into the app.
    Checks that the required files are present, saves the files,
    converts the audio to the required format and generates the info file.
    Deletes the given zip regardless of success.

    Parameters
    ----------
    dataset : str
        Path to dataset zip
    dataset_directory : str
        Destination path for the dataset
    audio_folder : str
        Destination path for the dataset audio
    logging : logging
        Logging object to write logs to

    Raises
    ------
    AssertionError
        If files are missing or invalid
    """
    try:
        with zipfile.ZipFile(dataset, mode="r") as z:
            files_list = z.namelist()
            assert (
                "metadata.csv" in files_list
            ), "Dataset missing metadata.csv. Make sure this file is in the root of the zip file"
            folders = [x.split("/")[0] for x in files_list if "/" in x]
            assert (
                "wavs" in folders
            ), "Dataset missing wavs folder. Make sure this folder is in the root of the zip file"
            wavs = [x for x in files_list if x.startswith("wavs/") and x.endswith(".wav")]
            assert wavs, "No wavs found in wavs folder"

            metadata = z.read("metadata.csv").decode(CHARACTER_ENCODING, "ignore").replace("\r\n", "\n")
            num_metadata_rows = len([row for row in metadata.split("\n") if row])
            assert (
                len(wavs) == num_metadata_rows
            ), f"Number of wavs and labels do not match. metadata: {num_metadata_rows}, wavs: {len(wavs)}"

            logging.info("Creating directory")
            os.makedirs(dataset_directory, exist_ok=False)
            os.makedirs(audio_folder, exist_ok=False)

            # Save metadata
            logging.info("Saving files")
            with open(os.path.join(dataset_directory, "metadata.csv"), "w", encoding=CHARACTER_ENCODING) as f:
                f.write(metadata)

            # Save wavs
            total_wavs = len(wavs)
            clip_lengths = []
            filenames = {}
            for i in range(total_wavs):
                wav = wavs[i]
                data = z.read(wav)
                path = os.path.join(dataset_directory, "wavs", wav.split("/")[1])
                with open(path, "wb") as f:
                    f.write(data)
                new_path = convert_audio(path)
                duration = librosa.get_duration(filename=new_path)
                clip_lengths.append(duration)
                filenames[path] = new_path
                logging.info(f"Progress - {i+1}/{total_wavs}")
            logging.info(f"Longest clip: {max(clip_lengths)}s, Shortest clip: {min(clip_lengths)}s")

            # Get around "file in use" by using delay
            logging.info("Deleting temp files")
            for old_path, new_path in filenames.items():
                os.remove(old_path)
                os.rename(new_path, old_path)

            # Create info file
            logging.info("Creating info file")
            save_dataset_info(
                os.path.join(dataset_directory, "metadata.csv"),
                os.path.join(dataset_directory, "wavs"),
                os.path.join(dataset_directory, "info.json"),
                clip_lengths=clip_lengths,
            )
    except Exception as e:
        os.remove(dataset)
        raise e

    os.remove(dataset)
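

# Example usage (illustrative paths; `audio_folder` is assumed to be the dataset's wavs folder):
#   import_dataset(
#       "uploads/my-dataset.zip",
#       "data/datasets/my-dataset",
#       "data/datasets/my-dataset/wavs",
#       logging=logger,
#   )
# The zip must contain metadata.csv and a wavs/ folder at its root; the zip itself
# is deleted whether or not the import succeeds.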