first commit

2025-12-16 11:48:09 +01:00 · 2023-04-09 13:21:02 -04:00
commit ea9a687004
18 changed files with 1769 additions and 0 deletions
--- a/bark/api.py
+++ b/bark/api.py
@@ -0,0 +1,79 @@
+from typing import Optional
+
+import numpy as np
+
+from .generation import codec_decode, generate_coarse, generate_fine, generate_text_semantic
+
+
+def text_to_semantic(
+    text: str,
+    history_prompt: Optional[str] = None,
+    temp: float = 0.7,
+):
+    """Generate semantic array from text.
+
+    Args:
+        text: text to be turned into audio
+        history_prompt: history choice for audio cloning
+        temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+
+    Returns:
+        numpy semantic array to be fed into `semantic_to_waveform`
+    """
+    x_semantic = generate_text_semantic(
+        text,
+        history_prompt=history_prompt,
+        temp=temp,
+    )
+    return x_semantic
+
+
+def semantic_to_waveform(
+    semantic_tokens: np.ndarray,
+    history_prompt: Optional[str] = None,
+    temp: float = 0.7,
+):
+    """Generate audio array from semantic input.
+
+    Args:
+        semantic_tokens: semantic token output from `text_to_semantic`
+        history_prompt: history choice for audio cloning
+        temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+
+    Returns:
+        numpy audio array at sample frequency 24khz
+    """
+    x_coarse_gen = generate_coarse(
+        semantic_tokens,
+        history_prompt=history_prompt,
+        temp=temp,
+    )
+    x_fine_gen = generate_fine(
+        x_coarse_gen,
+        history_prompt=history_prompt,
+        temp=0.5,
+    )
+    audio_arr = codec_decode(x_fine_gen)
+    return audio_arr
+
+
+def generate_audio(
+    text: str,
+    history_prompt: Optional[str] = None,
+    text_temp: float = 0.7,
+    waveform_temp: float = 0.7,
+):
+    """Generate audio array from input text.
+
+    Args:
+        text: text to be turned into audio
+        history_prompt: history choice for audio cloning
+        text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+        waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+
+    Returns:
+        numpy audio array at sample frequency 24khz
+    """
+    x_semantic = text_to_semantic(text, history_prompt=history_prompt, temp=text_temp)
+    audio_arr = semantic_to_waveform(x_semantic, history_prompt=history_prompt, temp=waveform_temp)
+    return audio_arr