from typing import Optional

import numpy as np

from .generation import codec_decode, generate_coarse, generate_fine, generate_text_semantic


def text_to_semantic(
    text: str,
    history_prompt: Optional[str] = None,
    temp: float = 0.7,
):
    """Turn input text into a semantic token array.

    Thin wrapper that forwards its arguments to `generate_text_semantic`.

    Args:
        text: text to be turned into audio
        history_prompt: history choice for audio cloning
        temp: generation temperature (1.0 more diverse, 0.0 more conservative)

    Returns:
        numpy semantic array to be fed into `semantic_to_waveform`
    """
    return generate_text_semantic(text, history_prompt=history_prompt, temp=temp)


def semantic_to_waveform(
    semantic_tokens: np.ndarray,
    history_prompt: Optional[str] = None,
    temp: float = 0.7,
):
    """Turn semantic tokens into an audio array.

    Runs three stages in order: `generate_coarse`, `generate_fine`, and
    `codec_decode`.

    Args:
        semantic_tokens: semantic token output from `text_to_semantic`
        history_prompt: history choice for audio cloning; passed to both the
            coarse and the fine stage
        temp: generation temperature for the coarse stage (1.0 more diverse,
            0.0 more conservative); the fine stage always runs with temp=0.5

    Returns:
        numpy audio array at sample frequency 24khz
    """
    coarse_tokens = generate_coarse(
        semantic_tokens,
        history_prompt=history_prompt,
        temp=temp,
    )
    # The fine stage uses a fixed temperature and ignores the caller's `temp`.
    fine_tokens = generate_fine(
        coarse_tokens,
        history_prompt=history_prompt,
        temp=0.5,
    )
    return codec_decode(fine_tokens)


def generate_audio(
    text: str,
    history_prompt: Optional[str] = None,
    text_temp: float = 0.7,
    waveform_temp: float = 0.7,
):
    """Generate an audio array directly from input text.

    Chains `text_to_semantic` and `semantic_to_waveform`, using the same
    `history_prompt` for both steps.

    Args:
        text: text to be turned into audio
        history_prompt: history choice for audio cloning
        text_temp: generation temperature for the text-to-semantic step
            (1.0 more diverse, 0.0 more conservative)
        waveform_temp: generation temperature for the semantic-to-waveform
            step (1.0 more diverse, 0.0 more conservative)

    Returns:
        numpy audio array at sample frequency 24khz
    """
    semantic_tokens = text_to_semantic(
        text,
        history_prompt=history_prompt,
        temp=text_temp,
    )
    return semantic_to_waveform(
        semantic_tokens,
        history_prompt=history_prompt,
        temp=waveform_temp,
    )