2023-04-09 13:21:02 -04:00
|
|
|
from typing import Optional
|
|
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
|
|
|
|
|
|
from .generation import codec_decode, generate_coarse, generate_fine, generate_text_semantic
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def text_to_semantic(
    text: str,
    history_prompt: Optional[str] = None,
    temp: float = 0.7,
    silent: bool = False,
):
    """Turn input text into a semantic token array.

    Thin wrapper around `generate_text_semantic` that always enables
    KV caching.

    Args:
        text: text to be turned into audio
        history_prompt: history choice for audio cloning
        temp: generation temperature (1.0 more diverse, 0.0 more conservative)
        silent: disable progress bar

    Returns:
        numpy semantic array to be fed into `semantic_to_waveform`
    """
    return generate_text_semantic(
        text,
        history_prompt=history_prompt,
        temp=temp,
        silent=silent,
        use_kv_caching=True,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def semantic_to_waveform(
    semantic_tokens: np.ndarray,
    history_prompt: Optional[str] = None,
    temp: float = 0.7,
    silent: bool = False,
    output_full: bool = False,
):
    """Turn semantic tokens into an audio array.

    Runs the coarse stage, then the fine stage, then decodes the fine
    tokens with the codec.

    Args:
        semantic_tokens: semantic token output from `text_to_semantic`
        history_prompt: history choice for audio cloning
        temp: generation temperature (1.0 more diverse, 0.0 more conservative);
            applied to the coarse stage only
        silent: disable progress bar
        output_full: return full generation to be used as a history prompt

    Returns:
        numpy audio array at sample frequency 24khz. When `output_full` is
        true, a `(full_generation, audio_arr)` tuple is returned instead,
        where `full_generation` is a dict with the keys "semantic_prompt",
        "coarse_prompt" and "fine_prompt".
    """
    coarse = generate_coarse(
        semantic_tokens,
        history_prompt=history_prompt,
        temp=temp,
        silent=silent,
        use_kv_caching=True,
    )
    # The fine stage runs at a fixed temperature of 0.5; `temp` and `silent`
    # are intentionally not forwarded here.
    fine = generate_fine(coarse, history_prompt=history_prompt, temp=0.5)
    waveform = codec_decode(fine)

    if not output_full:
        return waveform

    prompt_bundle = {
        "semantic_prompt": semantic_tokens,
        "coarse_prompt": coarse,
        "fine_prompt": fine,
    }
    return prompt_bundle, waveform
|
|
|
|
|
|
|
|
|
|
|
2023-04-21 16:14:10 -04:00
|
|
|
def save_as_prompt(filepath, full_generation):
    """Save a full generation to an ``.npz`` file for use as a history prompt.

    Args:
        filepath: destination path (``str`` or ``os.PathLike``); must end
            with ".npz"
        full_generation: dict holding the "semantic_prompt", "coarse_prompt"
            and "fine_prompt" entries, as returned by `semantic_to_waveform`
            when called with ``output_full=True``

    Raises:
        TypeError: if `filepath` is not path-like or `full_generation` is
            not a dict
        ValueError: if `filepath` does not end with ".npz" or a required
            key is missing from `full_generation`
    """
    # Local import keeps this block self-contained.
    import os

    # Validation uses explicit raises rather than `assert`, because asserts
    # are stripped when Python runs with -O and bad input would then be
    # written to disk silently.
    path_str = os.fspath(filepath)  # raises TypeError for non-path objects
    if not isinstance(path_str, str) or not path_str.endswith(".npz"):
        raise ValueError(f"filepath must end with '.npz', got {filepath!r}")

    if not isinstance(full_generation, dict):
        raise TypeError(
            "full_generation must be a dict, got "
            f"{type(full_generation).__name__}"
        )

    required_keys = ("semantic_prompt", "coarse_prompt", "fine_prompt")
    missing = [key for key in required_keys if key not in full_generation]
    if missing:
        raise ValueError(f"full_generation is missing required keys: {missing}")

    np.savez(path_str, **full_generation)
|
|
|
|
|
|
|
|
|
|
|
2023-04-09 13:21:02 -04:00
|
|
|
def generate_audio(
    text: str,
    history_prompt: Optional[str] = None,
    text_temp: float = 0.7,
    waveform_temp: float = 0.7,
    silent: bool = False,
    output_full: bool = False,
):
    """Produce an audio array from input text.

    Chains `text_to_semantic` and `semantic_to_waveform`.

    Args:
        text: text to be turned into audio
        history_prompt: history choice for audio cloning
        text_temp: generation temperature for the text-to-semantic step
            (1.0 more diverse, 0.0 more conservative)
        waveform_temp: generation temperature for the semantic-to-waveform
            step (1.0 more diverse, 0.0 more conservative)
        silent: disable progress bar
        output_full: return full generation to be used as a history prompt

    Returns:
        numpy audio array at sample frequency 24khz. When `output_full` is
        true, a `(full_generation, audio_arr)` tuple is returned instead.
    """
    semantic = text_to_semantic(
        text,
        history_prompt=history_prompt,
        temp=text_temp,
        silent=silent,
    )
    # `semantic_to_waveform` already yields either the bare audio array or
    # the `(full_generation, audio_arr)` pair depending on `output_full`,
    # so its result is handed back unchanged.
    return semantic_to_waveform(
        semantic,
        history_prompt=history_prompt,
        temp=waveform_temp,
        silent=silent,
        output_full=output_full,
    )
|