Files
voice-cloning-collab/encoder/data_objects/utterance.py
2023-10-27 15:03:22 +08:00

29 lines
985 B
Python

import numpy as np
class Utterance:
    """
    Lightweight handle to one utterance: it keeps the path of the frames file and the path of
    the matching wave file. Nothing is read at construction time; the frames are loaded from
    disk on every call to get_frames().
    """

    def __init__(self, frames_fpath, wave_fpath):
        """
        :param frames_fpath: path of the file holding the utterance frames (read with np.load)
        :param wave_fpath: path of the wave file; stored as-is and never read by this class
        """
        self.frames_fpath = frames_fpath
        self.wave_fpath = wave_fpath

    def get_frames(self):
        """
        Loads the utterance frames from disk. The result is not cached.

        :return: the object np.load yields for frames_fpath
        """
        return np.load(self.frames_fpath)

    def random_partial(self, n_frames):
        """
        Crops the frames into a partial utterance of n_frames

        :param n_frames: The number of frames of the partial utterance
        :return: the partial utterance frames and a tuple indicating the start and end of the
        partial utterance in the complete utterance.
        :raises ValueError: if the utterance holds fewer than n_frames frames
        """
        frames = self.get_frames()
        n_total = frames.shape[0]

        # Fail with an explicit message instead of numpy's generic "low >= high" error.
        if n_total < n_frames:
            raise ValueError(
                "Cannot crop a partial utterance of %d frames from an utterance of %d frames"
                % (n_frames, n_total)
            )

        if n_total == n_frames:
            # Only one window fits; skip the draw so the global RNG state is left untouched.
            start = 0
        else:
            # The upper bound of randint is exclusive, so +1 is needed for the last valid
            # offset (n_total - n_frames) to be reachable.
            start = np.random.randint(0, n_total - n_frames + 1)
        end = start + n_frames

        return frames[start:end], (start, end)