# bark/api.py

from typing import Optional

import numpy as np

from .generation import codec_decode, generate_coarse, generate_fine, generate_text_semantic


def text_to_semantic(
    text: str,
    history_prompt: Optional[str] = None,
    temp: float = 0.7,
    silent: bool = False,
):
    """Turn input text into a semantic token array.

    Thin wrapper around `generate_text_semantic` that always enables
    KV caching.

    Args:
        text: text to be turned into audio
        history_prompt: history choice for audio cloning
        temp: generation temperature (1.0 more diverse, 0.0 more conservative)
        silent: disable progress bar

    Returns:
        numpy semantic array to be fed into `semantic_to_waveform`
    """
    # KV caching is always on for inference; it is not exposed to callers.
    return generate_text_semantic(
        text,
        history_prompt=history_prompt,
        temp=temp,
        silent=silent,
        use_kv_caching=True,
    )


def semantic_to_waveform(
    semantic_tokens: np.ndarray,
    history_prompt: Optional[str] = None,
    temp: float = 0.7,
    silent: bool = False,
    output_full: bool = False,
):
    """Turn semantic tokens into an audio array.

    Runs the coarse stage, then the fine stage, then decodes the fine
    tokens with the codec.

    Args:
        semantic_tokens: semantic token output from `text_to_semantic`
        history_prompt: history choice for audio cloning
        temp: generation temperature (1.0 more diverse, 0.0 more conservative);
            applied to the coarse stage only
        silent: disable progress bar (forwarded to the coarse stage only)
        output_full: return full generation to be used as a history prompt

    Returns:
        numpy audio array at sample frequency 24khz. When `output_full` is
        true, a tuple `(full_generation, audio_arr)` is returned instead,
        where `full_generation` is a dict with the keys "semantic_prompt",
        "coarse_prompt" and "fine_prompt".
    """
    coarse = generate_coarse(
        semantic_tokens,
        history_prompt=history_prompt,
        temp=temp,
        silent=silent,
        use_kv_caching=True,
    )
    # The fine stage uses a fixed temperature of 0.5 and is not passed
    # `silent`, so it runs with whatever default `generate_fine` defines.
    fine = generate_fine(coarse, history_prompt=history_prompt, temp=0.5)
    waveform = codec_decode(fine)

    if not output_full:
        return waveform

    # Bundle every intermediate stage so the result can be saved and
    # reused as a history prompt.
    stages = {
        "semantic_prompt": semantic_tokens,
        "coarse_prompt": coarse,
        "fine_prompt": fine,
    }
    return stages, waveform


def save_as_prompt(filepath, full_generation):
    """Save a full generation to an ``.npz`` file for use as a history prompt.

    Args:
        filepath: destination path (``str`` or ``os.PathLike``); must end
            with ".npz"
        full_generation: dict containing at least the keys "semantic_prompt",
            "coarse_prompt" and "fine_prompt" (the dict returned by
            `semantic_to_waveform` when called with ``output_full=True``)

    Raises:
        ValueError: if `filepath` does not end with ".npz" or a required
            key is missing from `full_generation`
        TypeError: if `full_generation` is not a dict
    """
    import os  # local import so this block does not depend on file-level imports

    # Validate with explicit raises rather than `assert`: assert statements
    # are removed when Python runs with -O, which would let bad input through.
    if not os.fspath(filepath).endswith(".npz"):
        raise ValueError(f"filepath must end with '.npz', got {filepath!r}")
    if not isinstance(full_generation, dict):
        raise TypeError(
            f"full_generation must be a dict, got {type(full_generation).__name__}"
        )
    required_keys = ("semantic_prompt", "coarse_prompt", "fine_prompt")
    missing = [key for key in required_keys if key not in full_generation]
    if missing:
        raise ValueError(f"full_generation is missing required keys: {missing}")

    # Each dict entry is stored as a named array in the archive.
    np.savez(filepath, **full_generation)


def generate_audio(
    text: str,
    history_prompt: Optional[str] = None,
    text_temp: float = 0.7,
    waveform_temp: float = 0.7,
    silent: bool = False,
    output_full: bool = False,
):
    """Run the full text-to-audio pipeline.

    Chains `text_to_semantic` and `semantic_to_waveform`, using the same
    `history_prompt` and `silent` setting for both.

    Args:
        text: text to be turned into audio
        history_prompt: history choice for audio cloning
        text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
        waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
        silent: disable progress bar
        output_full: return full generation to be used as a history prompt

    Returns:
        numpy audio array at sample frequency 24khz, or the tuple
        `(full_generation, audio_arr)` when `output_full` is true
    """
    semantic = text_to_semantic(
        text,
        history_prompt=history_prompt,
        temp=text_temp,
        silent=silent,
    )
    result = semantic_to_waveform(
        semantic,
        history_prompt=history_prompt,
        temp=waveform_temp,
        silent=silent,
        output_full=output_full,
    )
    if not output_full:
        # Plain mode: the result is just the audio array.
        return result
    # Full mode: the result is a (full_generation, audio) pair.
    full_generation, audio_arr = result
    return full_generation, audio_arr
first commit 2023-04-09 13:21:02 -04:00			`from typing import Optional`

			`import numpy as np`

			`from .generation import codec_decode, generate_coarse, generate_fine, generate_text_semantic`


			`def text_to_semantic(`
			`text: str,`
			`history_prompt: Optional[str] = None,`
			`temp: float = 0.7,`
small updates 2023-04-21 15:13:16 -04:00			`silent: bool = False,`
first commit 2023-04-09 13:21:02 -04:00			`):`
			`"""Generate semantic array from text.`

			`Args:`
			`text: text to be turned into audio`
			`history_prompt: history choice for audio cloning`
			`temp: generation temperature (1.0 more diverse, 0.0 more conservative)`
small updates 2023-04-21 15:13:16 -04:00			`silent: disable progress bar`
first commit 2023-04-09 13:21:02 -04:00
			`Returns:`
			numpy semantic array to be fed into `semantic_to_waveform`
			`"""`
			`x_semantic = generate_text_semantic(`
			`text,`
			`history_prompt=history_prompt,`
			`temp=temp,`
small updates 2023-04-21 15:13:16 -04:00			`silent=silent,`
make kv caching default in inference 2023-04-22 15:42:30 -04:00			`use_kv_caching=True`
first commit 2023-04-09 13:21:02 -04:00			`)`
			`return x_semantic`


			`def semantic_to_waveform(`
			`semantic_tokens: np.ndarray,`
			`history_prompt: Optional[str] = None,`
			`temp: float = 0.7,`
small updates 2023-04-21 15:13:16 -04:00			`silent: bool = False,`
allow using unconditional as prompts 2023-04-21 16:14:10 -04:00			`output_full: bool = False,`
first commit 2023-04-09 13:21:02 -04:00			`):`
			`"""Generate audio array from semantic input.`

			`Args:`
			semantic_tokens: semantic token output from `text_to_semantic`
			`history_prompt: history choice for audio cloning`
			`temp: generation temperature (1.0 more diverse, 0.0 more conservative)`
small updates 2023-04-21 15:13:16 -04:00			`silent: disable progress bar`
allow using unconditional as prompts 2023-04-21 16:14:10 -04:00			`output_full: return full generation to be used as a history prompt`
first commit 2023-04-09 13:21:02 -04:00
			`Returns:`
			`numpy audio array at sample frequency 24khz`
			`"""`
allow using unconditional as prompts 2023-04-21 16:14:10 -04:00			`coarse_tokens = generate_coarse(`
first commit 2023-04-09 13:21:02 -04:00			`semantic_tokens,`
			`history_prompt=history_prompt,`
			`temp=temp,`
small updates 2023-04-21 15:13:16 -04:00			`silent=silent,`
make kv caching default in inference 2023-04-22 15:42:30 -04:00			`use_kv_caching=True`
first commit 2023-04-09 13:21:02 -04:00			`)`
allow using unconditional as prompts 2023-04-21 16:14:10 -04:00			`fine_tokens = generate_fine(`
			`coarse_tokens,`
first commit 2023-04-09 13:21:02 -04:00			`history_prompt=history_prompt,`
			`temp=0.5,`
			`)`
allow using unconditional as prompts 2023-04-21 16:14:10 -04:00			`audio_arr = codec_decode(fine_tokens)`
			`if output_full:`
			`full_generation = {`
			`"semantic_prompt": semantic_tokens,`
			`"coarse_prompt": coarse_tokens,`
			`"fine_prompt": fine_tokens,`
			`}`
			`return full_generation, audio_arr`
first commit 2023-04-09 13:21:02 -04:00			`return audio_arr`


allow using unconditional as prompts 2023-04-21 16:14:10 -04:00			`def save_as_prompt(filepath, full_generation):`
			`assert(filepath.endswith(".npz"))`
			`assert(isinstance(full_generation, dict))`
			`assert("semantic_prompt" in full_generation)`
			`assert("coarse_prompt" in full_generation)`
			`assert("fine_prompt" in full_generation)`
			`np.savez(filepath, **full_generation)`


first commit 2023-04-09 13:21:02 -04:00			`def generate_audio(`
			`text: str,`
			`history_prompt: Optional[str] = None,`
			`text_temp: float = 0.7,`
			`waveform_temp: float = 0.7,`
small updates 2023-04-21 15:13:16 -04:00			`silent: bool = False,`
allow using unconditional as prompts 2023-04-21 16:14:10 -04:00			`output_full: bool = False,`
first commit 2023-04-09 13:21:02 -04:00			`):`
			`"""Generate audio array from input text.`

			`Args:`
			`text: text to be turned into audio`
			`history_prompt: history choice for audio cloning`
			`text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)`
			`waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)`
small updates 2023-04-21 15:13:16 -04:00			`silent: disable progress bar`
allow using unconditional as prompts 2023-04-21 16:14:10 -04:00			`output_full: return full generation to be used as a history prompt`
first commit 2023-04-09 13:21:02 -04:00
			`Returns:`
			`numpy audio array at sample frequency 24khz`
			`"""`
allow using unconditional as prompts 2023-04-21 16:14:10 -04:00			`semantic_tokens = text_to_semantic(`
make kv caching default in inference 2023-04-22 15:42:30 -04:00			`text,`
			`history_prompt=history_prompt,`
			`temp=text_temp,`
			`silent=silent,`
small updates 2023-04-21 15:13:16 -04:00			`)`
allow using unconditional as prompts 2023-04-21 16:14:10 -04:00			`out = semantic_to_waveform(`
			`semantic_tokens,`
			`history_prompt=history_prompt,`
			`temp=waveform_temp,`
			`silent=silent,`
			`output_full=output_full,`
small updates 2023-04-21 15:13:16 -04:00			`)`
allow using unconditional as prompts 2023-04-21 16:14:10 -04:00			`if output_full:`
			`full_generation, audio_arr = out`
			`return full_generation, audio_arr`
			`else:`
			`audio_arr = out`
first commit 2023-04-09 13:21:02 -04:00			`return audio_arr`