From 6c26fb7b3463334ae9fb4d63dae52f3c29506db0 Mon Sep 17 00:00:00 2001
From: Georg Kucsko
Date: Tue, 25 Apr 2023 17:49:35 -0400
Subject: [PATCH 01/13] simplify device placement
---
bark/generation.py | 123 ++++++++++++++++++++++++++++++---------------
bark/model.py | 1 -
2 files changed, 82 insertions(+), 42 deletions(-)
diff --git a/bark/generation.py b/bark/generation.py
index 4aa805e..28d963c 100644
--- a/bark/generation.py
+++ b/bark/generation.py
@@ -83,6 +83,7 @@ CACHE_DIR = os.path.join(os.getenv("XDG_CACHE_HOME", default_cache_dir), "suno",
USE_SMALL_MODELS = os.environ.get("SUNO_USE_SMALL_MODELS", False)
+GLOBAL_ENABLE_MPS = os.environ.get("SUNO_ENABLE_MPS", False)
REMOTE_BASE_URL = "https://dl.suno-models.io/bark/models/v0/"
@@ -114,10 +115,10 @@ REMOTE_MODEL_PATHS = {
}
-if not hasattr(torch.nn.functional, 'scaled_dot_product_attention'):
+if not hasattr(torch.nn.functional, 'scaled_dot_product_attention') and torch.cuda.is_available():
logger.warning(
- "torch version does not support flash attention. You will get significantly faster" +
- " inference speed by upgrade torch to newest version / nightly."
+ "torch version does not support flash attention. You will get faster" +
+ " inference speed by upgrade torch to newest nightly version."
)
@@ -141,6 +142,16 @@ def _get_ckpt_path(model_type, use_small=False):
return os.path.join(CACHE_DIR, f"{model_name}.pt")
+def _grab_best_device(use_gpu=True):
+ if torch.cuda.device_count() > 0 and use_gpu:
+ device = "cuda"
+ elif torch.backends.mps.is_available() and use_gpu and GLOBAL_ENABLE_MPS:
+ device = "mps"
+ else:
+ device = "cpu"
+ return device
+
+
S3_BUCKET_PATH_RE = r"s3\:\/\/(.+?)\/"
@@ -207,8 +218,6 @@ def clean_models(model_key=None):
def _load_model(ckpt_path, device, use_small=False, model_type="text"):
- if "cuda" not in device:
- logger.warning("No GPU being used. Careful, inference might be extremely slow!")
if model_type == "text":
ConfigClass = GPTConfig
ModelClass = GPT
@@ -285,30 +294,32 @@ def load_model(use_gpu=True, use_small=False, force_reload=False, model_type="te
if model_type not in ("text", "coarse", "fine"):
raise NotImplementedError()
global models
- if torch.cuda.device_count() == 0 or not use_gpu:
- device = "cpu"
- else:
- device = "cuda"
- model_key = str(device) + f"__{model_type}"
+ device = _grab_best_device(use_gpu=use_gpu)
+ model_key = f"{model_type}"
if model_key not in models or force_reload:
ckpt_path = _get_ckpt_path(model_type, use_small=use_small)
clean_models(model_key=model_key)
model = _load_model_f(ckpt_path, device)
models[model_key] = model
+ if model_type == "text":
+ models[model_key]["model"].to(device)
+ else:
+ models[model_key].to(device)
return models[model_key]
def load_codec_model(use_gpu=True, force_reload=False):
global models
- if torch.cuda.device_count() == 0 or not use_gpu:
+ device = _grab_best_device(use_gpu=use_gpu)
+ if device == "mps":
+ # encodec doesn't support mps
device = "cpu"
- else:
- device = "cuda"
- model_key = str(device) + f"__codec"
+ model_key = "codec"
if model_key not in models or force_reload:
clean_models(model_key=model_key)
model = _load_codec_model(device)
models[model_key] = model
+ models[model_key].to(device)
return models[model_key]
@@ -322,6 +333,11 @@ def preload_models(
codec_use_gpu=True,
force_reload=False,
):
+ """Load all the necessary models for the pipeline."""
+ if _grab_best_device() == "cpu" and (
+ text_use_gpu or coarse_use_gpu or fine_use_gpu or codec_use_gpu
+ ):
+ logger.warning("No GPU being used. Careful, inference might be very slow!")
_ = load_model(
model_type="text", use_gpu=text_use_gpu, use_small=text_use_small, force_reload=force_reload
)
@@ -366,13 +382,11 @@ def generate_text_semantic(
temp=0.7,
top_k=None,
top_p=None,
- use_gpu=True,
silent=False,
min_eos_p=0.2,
max_gen_duration_s=None,
allow_early_stop=True,
- model=None,
- use_kv_caching=False
+ use_kv_caching=False,
):
"""Generate semantic tokens from text."""
assert isinstance(text, str)
@@ -395,12 +409,15 @@ def generate_text_semantic(
)
else:
semantic_history = None
- model_container = load_model(use_gpu=use_gpu, model_type="text")
- if model is None:
- model = model_container["model"]
+ # load models if not yet exist
+ global models
+ if "text" not in models:
+ preload_models()
+ model_container = models["text"]
+ model = model_container["model"]
tokenizer = model_container["tokenizer"]
encoded_text = np.array(_tokenize(tokenizer, text)) + TEXT_ENCODING_OFFSET
- device = "cuda" if use_gpu and torch.cuda.device_count() > 0 else "cpu"
+ device = next(model.parameters()).device
if len(encoded_text) > 256:
p = round((len(encoded_text) - 256) / len(encoded_text) * 100, 1)
logger.warning(f"warning, text too long, lopping of last {p}%")
@@ -424,7 +441,9 @@ def generate_text_semantic(
else:
semantic_history = np.array([SEMANTIC_PAD_TOKEN] * 256)
x = torch.from_numpy(
- np.hstack([encoded_text, semantic_history, np.array([SEMANTIC_INFER_TOKEN])]).astype(np.int64)
+ np.hstack([
+ encoded_text, semantic_history, np.array([SEMANTIC_INFER_TOKEN])
+ ]).astype(np.int64)
)[None]
assert x.shape[1] == 256 + 256 + 1
with _inference_mode():
@@ -440,8 +459,9 @@ def generate_text_semantic(
x_input = x[:, [-1]]
else:
x_input = x
-
- logits, kv_cache = model(x_input, merge_context=True, use_cache=use_kv_caching, past_kv=kv_cache)
+ logits, kv_cache = model(
+ x_input, merge_context=True, use_cache=use_kv_caching, past_kv=kv_cache
+ )
relevant_logits = logits[0, 0, :SEMANTIC_VOCAB_SIZE]
if allow_early_stop:
relevant_logits = torch.hstack(
@@ -465,7 +485,13 @@ def generate_text_semantic(
v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1)))
relevant_logits[relevant_logits < v[-1]] = -float("Inf")
probs = F.softmax(relevant_logits / temp, dim=-1)
+ # multinomial bugged on mps: shuttle to cpu if necessary
+ inf_device = probs.device
+ if probs.device.type == "mps":
+ probs = probs.to("cpu")
item_next = torch.multinomial(probs, num_samples=1)
+ probs = probs.to(inf_device)
+ item_next = item_next.to(inf_device)
if allow_early_stop and (
item_next == SEMANTIC_VOCAB_SIZE
or (min_eos_p is not None and probs[-1] >= min_eos_p)
@@ -513,12 +539,10 @@ def generate_coarse(
temp=0.7,
top_k=None,
top_p=None,
- use_gpu=True,
silent=False,
max_coarse_history=630, # min 60 (faster), max 630 (more context)
sliding_window_len=60,
- model=None,
- use_kv_caching=False
+ use_kv_caching=False,
):
"""Generate coarse audio codes from semantic tokens."""
assert (
@@ -576,9 +600,12 @@ def generate_coarse(
else:
x_semantic_history = np.array([], dtype=np.int32)
x_coarse_history = np.array([], dtype=np.int32)
- if model is None:
- model = load_model(use_gpu=use_gpu, model_type="coarse")
- device = "cuda" if use_gpu and torch.cuda.device_count() > 0 else "cpu"
+ # load models if not yet exist
+ global models
+ if "coarse" not in models:
+ preload_models()
+ model = models["coarse"]
+ device = next(model.parameters()).device
# start loop
n_steps = int(
round(
@@ -650,7 +677,13 @@ def generate_coarse(
v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1)))
relevant_logits[relevant_logits < v[-1]] = -float("Inf")
probs = F.softmax(relevant_logits / temp, dim=-1)
+ # multinomial bugged on mps: shuttle to cpu if necessary
+ inf_device = probs.device
+ if probs.device.type == "mps":
+ probs = probs.to("cpu")
item_next = torch.multinomial(probs, num_samples=1)
+ probs = probs.to(inf_device)
+ item_next = item_next.to(inf_device)
item_next += logit_start_idx
x_coarse_in = torch.cat((x_coarse_in, item_next[None]), dim=1)
x_in = torch.cat((x_in, item_next[None]), dim=1)
@@ -672,9 +705,7 @@ def generate_fine(
x_coarse_gen,
history_prompt=None,
temp=0.5,
- use_gpu=True,
silent=True,
- model=None,
):
"""Generate full audio codes from coarse audio codes."""
assert (
@@ -704,9 +735,12 @@ def generate_fine(
else:
x_fine_history = None
n_coarse = x_coarse_gen.shape[0]
- if model is None:
- model = load_model(use_gpu=use_gpu, model_type="fine")
- device = "cuda" if use_gpu and torch.cuda.device_count() > 0 else "cpu"
+ # load models if not yet exist
+ global models
+ if "fine" not in models:
+ preload_models()
+ model = models["fine"]
+ device = next(model.parameters()).device
# make input arr
in_arr = np.vstack(
[
@@ -754,10 +788,14 @@ def generate_fine(
else:
relevant_logits = logits[0, :, :CODEBOOK_SIZE] / temp
probs = F.softmax(relevant_logits, dim=-1)
+ # multinomial bugged on mps: shuttle to cpu if necessary
+ inf_device = probs.device
+ if probs.device.type == "mps":
+ probs = probs.to("cpu")
codebook_preds = torch.hstack(
[
- torch.multinomial(probs[n], num_samples=1)
- for n in range(rel_start_fill_idx, 1024)
+ torch.multinomial(probs[nnn], num_samples=1).to(inf_device)
+ for nnn in range(rel_start_fill_idx, 1024)
]
)
in_buffer[0, rel_start_fill_idx:, nn] = codebook_preds
@@ -778,11 +816,14 @@ def generate_fine(
return gen_fine_arr
-def codec_decode(fine_tokens, model=None, use_gpu=True):
+def codec_decode(fine_tokens):
"""Turn quantized audio codes into audio array using encodec."""
- if model is None:
- model = load_codec_model(use_gpu=use_gpu)
- device = "cuda" if use_gpu and torch.cuda.device_count() > 0 else "cpu"
+ # load models if not yet exist
+ global models
+ if "codec" not in models:
+ preload_models()
+ model = models["codec"]
+ device = next(model.parameters()).device
arr = torch.from_numpy(fine_tokens)[None]
arr = arr.to(device)
arr = arr.transpose(0, 1)
diff --git a/bark/model.py b/bark/model.py
index bb99932..457b49e 100644
--- a/bark/model.py
+++ b/bark/model.py
@@ -200,7 +200,6 @@ class GPT(nn.Module):
pos_emb = self.transformer.wpe(position_ids) # position embeddings of shape (1, t, n_embd)
-
x = self.transformer.drop(tok_emb + pos_emb)
new_kv = () if use_cache else None
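The device logic this patch converges on is small: pick CUDA when present, fall back to MPS only when SUNO_ENABLE_MPS is set, otherwise use CPU, and work around torch.multinomial being unreliable on MPS by sampling on CPU and moving the result back. A minimal standalone sketch of those two pieces (names mirror the diff, but this is an illustrative reconstruction, not the exact Bark module):

```python
import os

import torch
import torch.nn.functional as F

# Mirrors the env-var switch added by this patch.
GLOBAL_ENABLE_MPS = os.environ.get("SUNO_ENABLE_MPS", False)


def grab_best_device(use_gpu=True):
    # Prefer CUDA, then MPS (only when explicitly enabled), then CPU.
    if use_gpu and torch.cuda.device_count() > 0:
        return "cuda"
    if use_gpu and GLOBAL_ENABLE_MPS and torch.backends.mps.is_available():
        return "mps"
    return "cpu"


def sample_next_token(relevant_logits, temp=0.7):
    probs = F.softmax(relevant_logits / temp, dim=-1)
    # torch.multinomial was unreliable on MPS at the time, so sample on CPU
    # and move the result back to the inference device.
    inf_device = probs.device
    if probs.device.type == "mps":
        probs = probs.to("cpu")
    item_next = torch.multinomial(probs, num_samples=1)
    return item_next.to(inf_device)
```

Note also that the generation functions above stop taking a `use_gpu` flag and instead read the device off the already-placed model via `next(model.parameters()).device`, so placement is decided once at load time.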
From 8675c23a4246794db0636ee1f5a2b27321cdd918 Mon Sep 17 00:00:00 2001
From: Jairo Correa
Date: Tue, 25 Apr 2023 21:21:52 -0300
Subject: [PATCH 02/13] Option to offload to cpu
---
bark/generation.py | 26 ++++++++++++++++++++++++++
1 file changed, 26 insertions(+)
diff --git a/bark/generation.py b/bark/generation.py
index 28d963c..ec313e7 100644
--- a/bark/generation.py
+++ b/bark/generation.py
@@ -36,6 +36,9 @@ else:
global models
models = {}
+global models_devices
+models_devices = {}
+
CONTEXT_WINDOW_SIZE = 1024
@@ -84,6 +87,7 @@ CACHE_DIR = os.path.join(os.getenv("XDG_CACHE_HOME", default_cache_dir), "suno",
USE_SMALL_MODELS = os.environ.get("SUNO_USE_SMALL_MODELS", False)
GLOBAL_ENABLE_MPS = os.environ.get("SUNO_ENABLE_MPS", False)
+OFFLOAD_CPU = os.environ.get("SUNO_OFFLOAD_CPU", False)
REMOTE_BASE_URL = "https://dl.suno-models.io/bark/models/v0/"
@@ -296,6 +300,9 @@ def load_model(use_gpu=True, use_small=False, force_reload=False, model_type="te
global models
device = _grab_best_device(use_gpu=use_gpu)
model_key = f"{model_type}"
+ if OFFLOAD_CPU:
+ models_devices[model_key] = device
+ device = "cpu"
if model_key not in models or force_reload:
ckpt_path = _get_ckpt_path(model_type, use_small=use_small)
clean_models(model_key=model_key)
@@ -315,6 +322,9 @@ def load_codec_model(use_gpu=True, force_reload=False):
# encodec doesn't support mps
device = "cpu"
model_key = "codec"
+ if OFFLOAD_CPU:
+ models_devices[model_key] = device
+ device = "cpu"
if model_key not in models or force_reload:
clean_models(model_key=model_key)
model = _load_codec_model(device)
@@ -417,6 +427,8 @@ def generate_text_semantic(
model = model_container["model"]
tokenizer = model_container["tokenizer"]
encoded_text = np.array(_tokenize(tokenizer, text)) + TEXT_ENCODING_OFFSET
+ if OFFLOAD_CPU:
+ model.to(models_devices["text"])
device = next(model.parameters()).device
if len(encoded_text) > 256:
p = round((len(encoded_text) - 256) / len(encoded_text) * 100, 1)
@@ -514,6 +526,8 @@ def generate_text_semantic(
pbar_state = req_pbar_state
pbar.close()
out = x.detach().cpu().numpy().squeeze()[256 + 256 + 1 :]
+ if OFFLOAD_CPU:
+ model.to("cpu")
assert all(0 <= out) and all(out < SEMANTIC_VOCAB_SIZE)
_clear_cuda_cache()
return out
@@ -605,6 +619,8 @@ def generate_coarse(
if "coarse" not in models:
preload_models()
model = models["coarse"]
+ if OFFLOAD_CPU:
+ model.to(models_devices["coarse"])
device = next(model.parameters()).device
# start loop
n_steps = int(
@@ -691,6 +707,8 @@ def generate_coarse(
n_step += 1
del x_in
del x_semantic_in
+ if OFFLOAD_CPU:
+ model.to("cpu")
gen_coarse_arr = x_coarse_in.detach().cpu().numpy().squeeze()[len(x_coarse_history) :]
del x_coarse_in
assert len(gen_coarse_arr) == n_steps
@@ -740,6 +758,8 @@ def generate_fine(
if "fine" not in models:
preload_models()
model = models["fine"]
+ if OFFLOAD_CPU:
+ model.to(models_devices["fine"])
device = next(model.parameters()).device
# make input arr
in_arr = np.vstack(
@@ -808,6 +828,8 @@ def generate_fine(
del in_buffer
gen_fine_arr = in_arr.detach().cpu().numpy().squeeze().T
del in_arr
+ if OFFLOAD_CPU:
+ model.to("cpu")
gen_fine_arr = gen_fine_arr[:, n_history:]
if n_remove_from_end > 0:
gen_fine_arr = gen_fine_arr[:, :-n_remove_from_end]
@@ -823,6 +845,8 @@ def codec_decode(fine_tokens):
if "codec" not in models:
preload_models()
model = models["codec"]
+ if OFFLOAD_CPU:
+ model.to(models_devices["codec"])
device = next(model.parameters()).device
arr = torch.from_numpy(fine_tokens)[None]
arr = arr.to(device)
@@ -831,4 +855,6 @@ def codec_decode(fine_tokens):
out = model.decoder(emb)
audio_arr = out.detach().cpu().numpy().squeeze()
del arr, emb, out
+ if OFFLOAD_CPU:
+ model.to("cpu")
return audio_arr
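The offload scheme added here is: load every model onto the CPU, remember its intended inference device in `models_devices`, move it to that device just before a generation call and back to the CPU just after. A hedged sketch of the wrapper pattern (the `run_inference` callable is a placeholder for the model-specific sampling loop, not something in the patch):

```python
import os

OFFLOAD_CPU = os.environ.get("SUNO_OFFLOAD_CPU", False)

models = {}          # model_key -> nn.Module, parked on CPU when offloading
models_devices = {}  # model_key -> device the model should actually run on


def run_with_offload(model_key, run_inference):
    """Move a model to its target device, run, then park it back on CPU."""
    model = models[model_key]
    if OFFLOAD_CPU:
        model.to(models_devices[model_key])
    device = next(model.parameters()).device
    out = run_inference(model, device)
    if OFFLOAD_CPU:
        model.to("cpu")
    return out
```

The cost is an extra pair of host-device transfers per call; the benefit is that only one of the four models (text, coarse, fine, codec) occupies accelerator memory at any moment.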
From dfbe09f00e601168c640b28aac4c5f4a5e782153 Mon Sep 17 00:00:00 2001
From: Jairo Correa
Date: Tue, 25 Apr 2023 22:42:21 -0300
Subject: [PATCH 03/13] Add missing global models_devices
---
bark/generation.py | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/bark/generation.py b/bark/generation.py
index ec313e7..4ac165c 100644
--- a/bark/generation.py
+++ b/bark/generation.py
@@ -298,6 +298,7 @@ def load_model(use_gpu=True, use_small=False, force_reload=False, model_type="te
if model_type not in ("text", "coarse", "fine"):
raise NotImplementedError()
global models
+ global models_devices
device = _grab_best_device(use_gpu=use_gpu)
model_key = f"{model_type}"
if OFFLOAD_CPU:
@@ -317,6 +318,7 @@ def load_model(use_gpu=True, use_small=False, force_reload=False, model_type="te
def load_codec_model(use_gpu=True, force_reload=False):
global models
+ global models_devices
device = _grab_best_device(use_gpu=use_gpu)
if device == "mps":
# encodec doesn't support mps
@@ -421,6 +423,7 @@ def generate_text_semantic(
semantic_history = None
# load models if not yet exist
global models
+ global models_devices
if "text" not in models:
preload_models()
model_container = models["text"]
@@ -616,6 +619,7 @@ def generate_coarse(
x_coarse_history = np.array([], dtype=np.int32)
# load models if not yet exist
global models
+ global models_devices
if "coarse" not in models:
preload_models()
model = models["coarse"]
@@ -755,6 +759,7 @@ def generate_fine(
n_coarse = x_coarse_gen.shape[0]
# load models if not yet exist
global models
+ global models_devices
if "fine" not in models:
preload_models()
model = models["fine"]
@@ -842,6 +847,7 @@ def codec_decode(fine_tokens):
"""Turn quantized audio codes into audio array using encodec."""
# load models if not yet exist
global models
+ global models_devices
if "codec" not in models:
preload_models()
model = models["codec"]
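As a general Python note (not specific to this repository): a `global` statement is only strictly required when a function rebinds a module-level name; mutating an existing dict in place works without it, so the declarations added here mainly make each function's dependency on the shared `models_devices` dict explicit. A standalone illustration:

```python
registry = {}


def add_entry(key, value):
    # In-place mutation of a module-level dict needs no `global` statement.
    registry[key] = value


def reset_registry():
    # Rebinding the name to a new object does require `global`; without it,
    # `registry` here would be treated as a new local variable.
    global registry
    registry = {}
```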
From e9ad2d5886388c6188eced86c96331b6a6983984 Mon Sep 17 00:00:00 2001
From: vaibhavs10
Date: Thu, 27 Apr 2023 16:12:54 +0200
Subject: [PATCH 04/13] initial commit
---
bark/generation.py | 80 ++++++++++++++++++++++++++++++++++------------
1 file changed, 59 insertions(+), 21 deletions(-)
diff --git a/bark/generation.py b/bark/generation.py
index 4ac165c..3b12757 100644
--- a/bark/generation.py
+++ b/bark/generation.py
@@ -14,6 +14,7 @@ import torch
import torch.nn.functional as F
import tqdm
from transformers import BertTokenizer
+from huggingface_hub import hf_hub_download
from .model import GPTConfig, GPT
from .model_fine import FineGPT, FineGPTConfig
@@ -89,31 +90,64 @@ USE_SMALL_MODELS = os.environ.get("SUNO_USE_SMALL_MODELS", False)
GLOBAL_ENABLE_MPS = os.environ.get("SUNO_ENABLE_MPS", False)
OFFLOAD_CPU = os.environ.get("SUNO_OFFLOAD_CPU", False)
-REMOTE_BASE_URL = "https://dl.suno-models.io/bark/models/v0/"
+# REMOTE_BASE_URL = "https://dl.suno-models.io/bark/models/v0/"
+
+# REMOTE_MODEL_PATHS = {
+# "text_small": {
+# "path": os.path.join(REMOTE_BASE_URL, "text.pt"),
+# "checksum": "b3e42bcbab23b688355cd44128c4cdd3",
+# },
+# "coarse_small": {
+# "path": os.path.join(REMOTE_BASE_URL, "coarse.pt"),
+# "checksum": "5fe964825e3b0321f9d5f3857b89194d",
+# },
+# "fine_small": {
+# "path": os.path.join(REMOTE_BASE_URL, "fine.pt"),
+# "checksum": "5428d1befe05be2ba32195496e58dc90",
+# },
+# "text": {
+# "path": os.path.join(REMOTE_BASE_URL, "text_2.pt"),
+# "checksum": "54afa89d65e318d4f5f80e8e8799026a",
+# },
+# "coarse": {
+# "path": os.path.join(REMOTE_BASE_URL, "coarse_2.pt"),
+# "checksum": "8a98094e5e3a255a5c9c0ab7efe8fd28",
+# },
+# "fine": {
+# "path": os.path.join(REMOTE_BASE_URL, "fine_2.pt"),
+# "checksum": "59d184ed44e3650774a2f0503a48a97b",
+# },
+# }
REMOTE_MODEL_PATHS = {
"text_small": {
- "path": os.path.join(REMOTE_BASE_URL, "text.pt"),
+ "repo_id": "reach-vb/bark-small",
+ "file_name": "text.pt",
"checksum": "b3e42bcbab23b688355cd44128c4cdd3",
},
"coarse_small": {
- "path": os.path.join(REMOTE_BASE_URL, "coarse.pt"),
+ "repo_id": "reach-vb/bark-small",
+ "file_name": "coarse.pt",
"checksum": "5fe964825e3b0321f9d5f3857b89194d",
},
"fine_small": {
- "path": os.path.join(REMOTE_BASE_URL, "fine.pt"),
+ "repo_id": "reach-vb/bark-small",
+ "file_name": "fine.pt",
"checksum": "5428d1befe05be2ba32195496e58dc90",
},
"text": {
- "path": os.path.join(REMOTE_BASE_URL, "text_2.pt"),
+ "repo_id": "reach-vb/bark",
+ "file_name": "text_2.pt",
"checksum": "54afa89d65e318d4f5f80e8e8799026a",
},
"coarse": {
- "path": os.path.join(REMOTE_BASE_URL, "coarse_2.pt"),
+ "repo_id": "reach-vb/bark",
+ "file_name": "coarse_2.pt",
"checksum": "8a98094e5e3a255a5c9c0ab7efe8fd28",
},
"fine": {
- "path": os.path.join(REMOTE_BASE_URL, "fine_2.pt"),
+ "repo_id": "reach-vb/bark-small",
+ "file_name": "fine_2.pt",
"checksum": "59d184ed44e3650774a2f0503a48a97b",
},
}
@@ -165,21 +199,25 @@ def _parse_s3_filepath(s3_filepath):
return bucket_name, rel_s3_filepath
-def _download(from_s3_path, to_local_path):
- os.makedirs(CACHE_DIR, exist_ok=True)
- response = requests.get(from_s3_path, stream=True)
- total_size_in_bytes = int(response.headers.get("content-length", 0))
- block_size = 1024
- progress_bar = tqdm.tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
- with open(to_local_path, "wb") as file:
- for data in response.iter_content(block_size):
- progress_bar.update(len(data))
- file.write(data)
- progress_bar.close()
- if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
- raise ValueError("ERROR, something went wrong")
+# def _download(from_s3_path, to_local_path):
+# os.makedirs(CACHE_DIR, exist_ok=True)
+# response = requests.get(from_s3_path, stream=True)
+# total_size_in_bytes = int(response.headers.get("content-length", 0))
+# block_size = 1024
+# progress_bar = tqdm.tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
+# with open(to_local_path, "wb") as file:
+# for data in response.iter_content(block_size):
+# progress_bar.update(len(data))
+# file.write(data)
+# progress_bar.close()
+# if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
+# raise ValueError("ERROR, something went wrong")
+def _download(from_hf_path, file_name, to_local_path):
+ os.makedirs(CACHE_DIR, exist_ok=True)
+ hf_hub_download(repo_id=from_hf_path, filename=file_name, cache_dir=to_local_path)
+
class InferenceContext:
def __init__(self, benchmark=False):
# we can't expect inputs to be the same length, so disable benchmarking by default
@@ -243,7 +281,7 @@ def _load_model(ckpt_path, device, use_small=False, model_type="text"):
os.remove(ckpt_path)
if not os.path.exists(ckpt_path):
logger.info(f"{model_type} model not found, downloading into `{CACHE_DIR}`.")
- _download(model_info["path"], ckpt_path)
+ _download(model_info["repo_id"], model_info["file_name"], ckpt_path)
checkpoint = torch.load(ckpt_path, map_location=device)
# this is a hack
model_args = checkpoint["model_args"]
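For context, `hf_hub_download` fetches one file from a Hugging Face Hub repository and returns the local path it was stored at; it manages its own cache layout under `cache_dir` rather than writing to an arbitrary target path, which is what the follow-up patches below adjust. A minimal usage sketch, using the repo and file name this patch configures for the large text model (the repository is later renamed to the suno organisation, so treat the identifiers as historical):

```python
from huggingface_hub import hf_hub_download

# Downloads (or reuses a cached copy of) the file and returns its local path.
local_path = hf_hub_download(repo_id="reach-vb/bark", filename="text_2.pt")
print(local_path)
```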
From ac3a7568a7e34e2f90e98b9ee6a31425fc9fe66f Mon Sep 17 00:00:00 2001
From: vaibhavs10
Date: Thu, 27 Apr 2023 16:19:58 +0200
Subject: [PATCH 05/13] up
---
bark/generation.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/bark/generation.py b/bark/generation.py
index 3b12757..a3c8a0b 100644
--- a/bark/generation.py
+++ b/bark/generation.py
@@ -90,7 +90,7 @@ USE_SMALL_MODELS = os.environ.get("SUNO_USE_SMALL_MODELS", False)
GLOBAL_ENABLE_MPS = os.environ.get("SUNO_ENABLE_MPS", False)
OFFLOAD_CPU = os.environ.get("SUNO_OFFLOAD_CPU", False)
-# REMOTE_BASE_URL = "https://dl.suno-models.io/bark/models/v0/"
+REMOTE_BASE_URL = "https://dl.suno-models.io/bark/models/v0/"
# REMOTE_MODEL_PATHS = {
# "text_small": {
@@ -176,7 +176,7 @@ def _md5(fname):
def _get_ckpt_path(model_type, use_small=False):
model_key = f"{model_type}_small" if use_small or USE_SMALL_MODELS else model_type
- model_name = _string_md5(REMOTE_MODEL_PATHS[model_key]["path"])
+ model_name = _string_md5(REMOTE_MODEL_PATHS[model_key]["file_name"])
return os.path.join(CACHE_DIR, f"{model_name}.pt")
From c26a82a4153fe05486aa593498682348f7d6ed42 Mon Sep 17 00:00:00 2001
From: Vaibhav Srivastav
Date: Thu, 27 Apr 2023 17:35:28 +0200
Subject: [PATCH 06/13] up
---
bark/generation.py | 22 +++++-----------------
1 file changed, 5 insertions(+), 17 deletions(-)
diff --git a/bark/generation.py b/bark/generation.py
index a3c8a0b..82994e2 100644
--- a/bark/generation.py
+++ b/bark/generation.py
@@ -146,7 +146,7 @@ REMOTE_MODEL_PATHS = {
"checksum": "8a98094e5e3a255a5c9c0ab7efe8fd28",
},
"fine": {
- "repo_id": "reach-vb/bark-small",
+ "repo_id": "reach-vb/bark",
"file_name": "fine_2.pt",
"checksum": "59d184ed44e3650774a2f0503a48a97b",
},
@@ -199,24 +199,12 @@ def _parse_s3_filepath(s3_filepath):
return bucket_name, rel_s3_filepath
-# def _download(from_s3_path, to_local_path):
-# os.makedirs(CACHE_DIR, exist_ok=True)
-# response = requests.get(from_s3_path, stream=True)
-# total_size_in_bytes = int(response.headers.get("content-length", 0))
-# block_size = 1024
-# progress_bar = tqdm.tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
-# with open(to_local_path, "wb") as file:
-# for data in response.iter_content(block_size):
-# progress_bar.update(len(data))
-# file.write(data)
-# progress_bar.close()
-# if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
-# raise ValueError("ERROR, something went wrong")
-
-
def _download(from_hf_path, file_name, to_local_path):
os.makedirs(CACHE_DIR, exist_ok=True)
- hf_hub_download(repo_id=from_hf_path, filename=file_name, cache_dir=to_local_path)
+ destination_file_name = to_local_path.split("/")[-1]
+ file_dir = CACHE_DIR
+ hf_hub_download(repo_id=from_hf_path, filename=file_name, local_dir=file_dir)
+ os.replace(f"{CACHE_DIR}/{file_name}", to_local_path)
class InferenceContext:
def __init__(self, benchmark=False):
From 035d08e157d57d04bd1d5891c6464b151bc5acab Mon Sep 17 00:00:00 2001
From: Vaibhav Srivastav
Date: Thu, 27 Apr 2023 17:45:59 +0200
Subject: [PATCH 07/13] up
---
bark/generation.py | 28 ----------------------------
1 file changed, 28 deletions(-)
diff --git a/bark/generation.py b/bark/generation.py
index 82994e2..ab23479 100644
--- a/bark/generation.py
+++ b/bark/generation.py
@@ -90,34 +90,6 @@ USE_SMALL_MODELS = os.environ.get("SUNO_USE_SMALL_MODELS", False)
GLOBAL_ENABLE_MPS = os.environ.get("SUNO_ENABLE_MPS", False)
OFFLOAD_CPU = os.environ.get("SUNO_OFFLOAD_CPU", False)
-REMOTE_BASE_URL = "https://dl.suno-models.io/bark/models/v0/"
-
-# REMOTE_MODEL_PATHS = {
-# "text_small": {
-# "path": os.path.join(REMOTE_BASE_URL, "text.pt"),
-# "checksum": "b3e42bcbab23b688355cd44128c4cdd3",
-# },
-# "coarse_small": {
-# "path": os.path.join(REMOTE_BASE_URL, "coarse.pt"),
-# "checksum": "5fe964825e3b0321f9d5f3857b89194d",
-# },
-# "fine_small": {
-# "path": os.path.join(REMOTE_BASE_URL, "fine.pt"),
-# "checksum": "5428d1befe05be2ba32195496e58dc90",
-# },
-# "text": {
-# "path": os.path.join(REMOTE_BASE_URL, "text_2.pt"),
-# "checksum": "54afa89d65e318d4f5f80e8e8799026a",
-# },
-# "coarse": {
-# "path": os.path.join(REMOTE_BASE_URL, "coarse_2.pt"),
-# "checksum": "8a98094e5e3a255a5c9c0ab7efe8fd28",
-# },
-# "fine": {
-# "path": os.path.join(REMOTE_BASE_URL, "fine_2.pt"),
-# "checksum": "59d184ed44e3650774a2f0503a48a97b",
-# },
-# }
REMOTE_MODEL_PATHS = {
"text_small": {
From c61ee92ee926f9248c8c2fe040a09dc117f96b84 Mon Sep 17 00:00:00 2001
From: Vaibhav Srivastav
Date: Thu, 27 Apr 2023 17:51:43 +0200
Subject: [PATCH 08/13] up
---
bark/generation.py | 9 ---------
1 file changed, 9 deletions(-)
diff --git a/bark/generation.py b/bark/generation.py
index ab23479..842b890 100644
--- a/bark/generation.py
+++ b/bark/generation.py
@@ -162,15 +162,6 @@ def _grab_best_device(use_gpu=True):
return device
-S3_BUCKET_PATH_RE = r"s3\:\/\/(.+?)\/"
-
-
-def _parse_s3_filepath(s3_filepath):
- bucket_name = re.search(S3_BUCKET_PATH_RE, s3_filepath).group(1)
- rel_s3_filepath = re.sub(S3_BUCKET_PATH_RE, "", s3_filepath)
- return bucket_name, rel_s3_filepath
-
-
def _download(from_hf_path, file_name, to_local_path):
os.makedirs(CACHE_DIR, exist_ok=True)
destination_file_name = to_local_path.split("/")[-1]
From b24dd26d4b18e355c5a425bec794b30216d1d86e Mon Sep 17 00:00:00 2001
From: Vaibhav Srivastav
Date: Fri, 28 Apr 2023 16:26:09 +0200
Subject: [PATCH 09/13] add suggestions from code review
---
bark/generation.py | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/bark/generation.py b/bark/generation.py
index 842b890..f6980cc 100644
--- a/bark/generation.py
+++ b/bark/generation.py
@@ -165,9 +165,8 @@ def _grab_best_device(use_gpu=True):
def _download(from_hf_path, file_name, to_local_path):
os.makedirs(CACHE_DIR, exist_ok=True)
destination_file_name = to_local_path.split("/")[-1]
- file_dir = CACHE_DIR
- hf_hub_download(repo_id=from_hf_path, filename=file_name, local_dir=file_dir)
- os.replace(f"{CACHE_DIR}/{file_name}", to_local_path)
+ hf_hub_download(repo_id=from_hf_path, filename=file_name, local_dir=CACHE_DIR)
+ os.replace(os.path.join(CACHE_DIR, file_name), to_local_path)
class InferenceContext:
def __init__(self, benchmark=False):
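With this change the download helper reaches its final shape in this series: `hf_hub_download(..., local_dir=CACHE_DIR)` materialises the file under the cache directory with its original name (depending on the huggingface_hub version this may be the file itself or a symlink into the hub cache), and `os.replace` then renames it to the md5-keyed checkpoint path that `_get_ckpt_path` computes. A condensed, hypothetical standalone version of that flow:

```python
import hashlib
import os

from huggingface_hub import hf_hub_download

# Assumed cache location; the real module derives it from XDG_CACHE_HOME.
CACHE_DIR = os.path.join(os.path.expanduser("~/.cache"), "suno", "bark_v0")


def _string_md5(text):
    return hashlib.md5(text.encode("utf-8")).hexdigest()


def download_checkpoint(repo_id, file_name):
    os.makedirs(CACHE_DIR, exist_ok=True)
    ckpt_path = os.path.join(CACHE_DIR, f"{_string_md5(file_name)}.pt")
    # Fetch the file into CACHE_DIR under its original name ...
    hf_hub_download(repo_id=repo_id, filename=file_name, local_dir=CACHE_DIR)
    # ... then rename it to the md5-keyed path the loader looks up.
    os.replace(os.path.join(CACHE_DIR, file_name), ckpt_path)
    return ckpt_path
```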
From 27ff4f9db86b5cd22e93872704b7bee5b56e353b Mon Sep 17 00:00:00 2001
From: Vaibhav Srivastav
Date: Fri, 28 Apr 2023 17:02:41 +0200
Subject: [PATCH 10/13] new model repo
---
bark/generation.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/bark/generation.py b/bark/generation.py
index f6980cc..1e2c8db 100644
--- a/bark/generation.py
+++ b/bark/generation.py
@@ -93,17 +93,17 @@ OFFLOAD_CPU = os.environ.get("SUNO_OFFLOAD_CPU", False)
REMOTE_MODEL_PATHS = {
"text_small": {
- "repo_id": "reach-vb/bark-small",
+ "repo_id": "reach-vb/bark",
"file_name": "text.pt",
"checksum": "b3e42bcbab23b688355cd44128c4cdd3",
},
"coarse_small": {
- "repo_id": "reach-vb/bark-small",
+ "repo_id": "reach-vb/bark",
"file_name": "coarse.pt",
"checksum": "5fe964825e3b0321f9d5f3857b89194d",
},
"fine_small": {
- "repo_id": "reach-vb/bark-small",
+ "repo_id": "reach-vb/bark",
"file_name": "fine.pt",
"checksum": "5428d1befe05be2ba32195496e58dc90",
},
From e0f2d117f51eeb6426d02f1ff57e9a2f4ab5f3fa Mon Sep 17 00:00:00 2001
From: Vaibhav Srivastav
Date: Fri, 28 Apr 2023 17:54:50 +0200
Subject: [PATCH 11/13] updating model repo organisation reach-vb -> suno
---
bark/generation.py | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/bark/generation.py b/bark/generation.py
index 1e2c8db..64b3c47 100644
--- a/bark/generation.py
+++ b/bark/generation.py
@@ -93,32 +93,32 @@ OFFLOAD_CPU = os.environ.get("SUNO_OFFLOAD_CPU", False)
REMOTE_MODEL_PATHS = {
"text_small": {
- "repo_id": "reach-vb/bark",
+ "repo_id": "suno/bark",
"file_name": "text.pt",
"checksum": "b3e42bcbab23b688355cd44128c4cdd3",
},
"coarse_small": {
- "repo_id": "reach-vb/bark",
+ "repo_id": "suno/bark",
"file_name": "coarse.pt",
"checksum": "5fe964825e3b0321f9d5f3857b89194d",
},
"fine_small": {
- "repo_id": "reach-vb/bark",
+ "repo_id": "suno/bark",
"file_name": "fine.pt",
"checksum": "5428d1befe05be2ba32195496e58dc90",
},
"text": {
- "repo_id": "reach-vb/bark",
+ "repo_id": "suno/bark",
"file_name": "text_2.pt",
"checksum": "54afa89d65e318d4f5f80e8e8799026a",
},
"coarse": {
- "repo_id": "reach-vb/bark",
+ "repo_id": "suno/bark",
"file_name": "coarse_2.pt",
"checksum": "8a98094e5e3a255a5c9c0ab7efe8fd28",
},
"fine": {
- "repo_id": "reach-vb/bark",
+ "repo_id": "suno/bark",
"file_name": "fine_2.pt",
"checksum": "59d184ed44e3650774a2f0503a48a97b",
},
From 9fb3494391d78eb2ef0481b092c6d12a3a488163 Mon Sep 17 00:00:00 2001
From: Keenan Freyberg <32879321+kmfreyberg@users.noreply.github.com>
Date: Fri, 28 Apr 2023 17:28:57 -0400
Subject: [PATCH 12/13] Update README.md
---
README.md | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index a44887b..ed1b4d1 100644
--- a/README.md
+++ b/README.md
@@ -13,11 +13,16 @@ Bark is a transformer-based text-to-audio model created by [Suno](https://suno.a
-## 🔊 Demos
+You can try Bark here:
-[](https://huggingface.co/spaces/suno/bark)
+[](https://huggingface.co/spaces/suno/bark)
+[](https://replicate.com/suno-ai/bark)
[](https://colab.research.google.com/drive/1eJfA2XUa-mXwdMy7DoYKVYHI1iTd9Vkt?usp=sharing)
+## 🚀 Updates
+
+
+
## 🤖 Usage
```python
From 39e7305f4ad596032717619a900ec4730d168531 Mon Sep 17 00:00:00 2001
From: Georg Kucsko
Date: Fri, 28 Apr 2023 17:38:54 -0400
Subject: [PATCH 13/13] Revert "Update README.md"
This reverts commit 9fb3494391d78eb2ef0481b092c6d12a3a488163.
---
README.md | 9 ++-------
1 file changed, 2 insertions(+), 7 deletions(-)
diff --git a/README.md b/README.md
index ed1b4d1..a44887b 100644
--- a/README.md
+++ b/README.md
@@ -13,16 +13,11 @@ Bark is a transformer-based text-to-audio model created by [Suno](https://suno.a
-You can try Bark here:
+## 🔊 Demos
-[](https://huggingface.co/spaces/suno/bark)
-[](https://replicate.com/suno-ai/bark)
+[](https://huggingface.co/spaces/suno/bark)
[](https://colab.research.google.com/drive/1eJfA2XUa-mXwdMy7DoYKVYHI1iTd9Vkt?usp=sharing)
-## 🚀 Updates
-
-
-
## 🤖 Usage
```python