# Copyright (c) Alibaba, Inc. and its affiliates.
# yapf: disable

import datetime
import functools
import io
import os
import pickle
import platform
import re
import shutil
import tempfile
import uuid
import warnings
from collections import defaultdict
from http import HTTPStatus
from http.cookiejar import CookieJar
from os.path import expanduser
from pathlib import Path
from typing import Any, BinaryIO, Dict, Iterable, List, Optional, Tuple, Union
from urllib.parse import urlencode

import json
import requests
from requests import Session
from requests.adapters import HTTPAdapter, Retry
from requests.exceptions import HTTPError
from tqdm.auto import tqdm

from modelscope.hub.constants import (API_HTTP_CLIENT_MAX_RETRIES,
                                      API_HTTP_CLIENT_TIMEOUT,
                                      API_RESPONSE_FIELD_DATA,
                                      API_RESPONSE_FIELD_EMAIL,
                                      API_RESPONSE_FIELD_GIT_ACCESS_TOKEN,
                                      API_RESPONSE_FIELD_MESSAGE,
                                      API_RESPONSE_FIELD_USERNAME,
                                      DEFAULT_CREDENTIALS_PATH,
                                      DEFAULT_MAX_WORKERS,
                                      MODELSCOPE_CLOUD_ENVIRONMENT,
                                      MODELSCOPE_CLOUD_USERNAME,
                                      MODELSCOPE_DOMAIN,
                                      MODELSCOPE_PREFER_AI_SITE,
                                      MODELSCOPE_REQUEST_ID,
                                      MODELSCOPE_URL_SCHEME, ONE_YEAR_SECONDS,
                                      REQUESTS_API_HTTP_METHOD,
                                      TEMPORARY_FOLDER_NAME, DatasetVisibility,
                                      Licenses, ModelVisibility, Visibility,
                                      VisibilityMap)
from modelscope.hub.errors import (InvalidParameter, NotExistError,
                                   NotLoginException, RequestError,
                                   datahub_raise_on_error,
                                   handle_http_post_error,
                                   handle_http_response, is_ok,
                                   raise_for_http_status, raise_on_error)
from modelscope.hub.git import GitCommandWrapper
from modelscope.hub.repository import Repository
from modelscope.hub.utils.utils import (add_content_to_file, get_domain,
                                        get_endpoint, get_readable_folder_size,
                                        get_release_datetime, is_env_true,
                                        model_id_to_group_owner_name)
from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
                                       DEFAULT_MODEL_REVISION,
                                       DEFAULT_REPOSITORY_REVISION,
                                       MASTER_MODEL_BRANCH, META_FILES_FORMAT,
                                       REPO_TYPE_DATASET, REPO_TYPE_MODEL,
                                       REPO_TYPE_SUPPORT, ConfigFields,
                                       DatasetFormations, DatasetMetaFormats,
                                       DownloadChannel, DownloadMode,
                                       Frameworks, ModelFile, Tasks,
                                       VirgoDatasetConfig)
from modelscope.utils.file_utils import get_file_hash, get_file_size
from modelscope.utils.logger import get_logger
from modelscope.utils.repo_utils import (DATASET_LFS_SUFFIX,
                                         DEFAULT_IGNORE_PATTERNS,
                                         MODEL_LFS_SUFFIX, CommitInfo,
                                         CommitOperation, CommitOperationAdd,
                                         RepoUtils)
from modelscope.utils.thread_utils import thread_executor

logger = get_logger()


class HubApi:
    """Model hub api interface.
    """

    def __init__(self,
                 endpoint: Optional[str] = None,
                 timeout=API_HTTP_CLIENT_TIMEOUT,
                 max_retries=API_HTTP_CLIENT_MAX_RETRIES):
        """The ModelScope HubApi.

        Args:
            endpoint (str, optional): The modelscope server http|https address. Defaults to None.
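
        Example:
            A minimal construction sketch (the alternate endpoint below only
            illustrates the `endpoint` argument, it is not a required setting):

            >>> from modelscope.hub.api import HubApi
            >>> api = HubApi()  # uses the default endpoint
            >>> intl_api = HubApi(endpoint='https://ai.modelscope.ai')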
        """
        self.endpoint = endpoint if endpoint is not None else get_endpoint()
        self.headers = {'user-agent': ModelScopeConfig.get_user_agent()}
        self.session = Session()
        retry = Retry(
            total=max_retries,
            read=2,
            connect=2,
            backoff_factor=1,
            status_forcelist=(500, 502, 503, 504),
            respect_retry_after_header=False,
        )
        adapter = HTTPAdapter(max_retries=retry)
        self.session.mount('http://', adapter)
        self.session.mount('https://', adapter)
        # set http timeout
        for method in REQUESTS_API_HTTP_METHOD:
            setattr(
                self.session, method,
                functools.partial(
                    getattr(self.session, method),
                    timeout=timeout))

        self.upload_checker = UploadingCheck()

    def get_cookies(self, access_token):
        from requests.cookies import RequestsCookieJar
        jar = RequestsCookieJar()
        jar.set('m_session_id',
                access_token,
                domain=get_domain(),
                path='/')
        return jar

    def login(
            self,
            access_token: Optional[str] = None,
            endpoint: Optional[str] = None
    ):
        """Login with your SDK access token, which can be obtained from
           https://www.modelscope.cn user center.

        Args:
            access_token (str): User access token on ModelScope; set this argument or the
                `MODELSCOPE_API_TOKEN` environment variable.
                If neither token exists, login returns immediately.
            endpoint (str, optional): The endpoint to use; defaults to None, which uses the
                endpoint configured on this HubApi instance.

        Returns:
            cookies: to authenticate yourself to ModelScope open-api
            git_token: token to access your git repository.

        Note:
            You only have to login once within 30 days.
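
        Example:
            An illustrative sketch (the token string is a placeholder for your
            own SDK token from the ModelScope user center):

            >>> from modelscope.hub.api import HubApi
            >>> api = HubApi()
            >>> git_token, cookies = api.login('your-sdk-access-token')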
        """
        if access_token is None:
            access_token = os.environ.get('MODELSCOPE_API_TOKEN')
        if not access_token:
            return None, None
        if not endpoint:
            endpoint = self.endpoint
        path = f'{endpoint}/api/v1/login'
        r = self.session.post(
            path,
            json={'AccessToken': access_token},
            headers=self.builder_headers(self.headers))
        raise_for_http_status(r)
        d = r.json()
        raise_on_error(d)

        token = d[API_RESPONSE_FIELD_DATA][API_RESPONSE_FIELD_GIT_ACCESS_TOKEN]
        cookies = r.cookies

        # save token and cookie
        ModelScopeConfig.save_token(token)
        ModelScopeConfig.save_cookies(cookies)
        ModelScopeConfig.save_user_info(
            d[API_RESPONSE_FIELD_DATA][API_RESPONSE_FIELD_USERNAME],
            d[API_RESPONSE_FIELD_DATA][API_RESPONSE_FIELD_EMAIL])

        return d[API_RESPONSE_FIELD_DATA][
            API_RESPONSE_FIELD_GIT_ACCESS_TOKEN], cookies

    def create_model(self,
                     model_id: str,
                     visibility: Optional[int] = ModelVisibility.PUBLIC,
                     license: Optional[str] = Licenses.APACHE_V2,
                     chinese_name: Optional[str] = None,
                     original_model_id: Optional[str] = '',
                     endpoint: Optional[str] = None) -> str:
        """Create model repo at ModelScope Hub.

        Args:
            model_id (str): The model id
            visibility (int, optional): visibility of the model(1-private, 5-public), default 5.
            license (str, optional): license of the model, defaults to Apache License 2.0.
            chinese_name (str, optional): chinese name of the model.
            original_model_id (str, optional): the base model id which this model is trained from
            endpoint (str, optional): The endpoint to use; defaults to None, which uses the
                endpoint configured on this HubApi instance.

        Returns:
            URL of the created model repo.

        Raises:
            InvalidParameter: If model_id is invalid.
            ValueError: If not login.

        Note:
            model_id = {owner}/{name}
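
        Example:
            A minimal sketch (assumes you have already logged in; the owner and
            model names are placeholders):

            >>> api = HubApi()
            >>> repo_url = api.create_model('my-owner/my-model')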
        """
        if model_id is None:
            raise InvalidParameter('model_id is required!')
        cookies = ModelScopeConfig.get_cookies()
        if cookies is None:
            raise ValueError('Token does not exist, please login first.')
        if not endpoint:
            endpoint = self.endpoint
        path = f'{endpoint}/api/v1/models'
        owner_or_group, name = model_id_to_group_owner_name(model_id)
        body = {
            'Path': owner_or_group,
            'Name': name,
            'ChineseName': chinese_name,
            'Visibility': visibility,  # server check
            'License': license,
            'OriginalModelId': original_model_id,
            'TrainId': os.environ.get('MODELSCOPE_TRAIN_ID', ''),
        }
        r = self.session.post(
            path,
            json=body,
            cookies=cookies,
            headers=self.builder_headers(self.headers))
        handle_http_post_error(r, path, body)
        raise_on_error(r.json())
        model_repo_url = f'{endpoint}/models/{model_id}'
        return model_repo_url

    def delete_model(self, model_id: str, endpoint: Optional[str] = None):
        """Delete model_id from ModelScope.

        Args:
            model_id (str): The model id.
            endpoint (str, optional): The endpoint to use; defaults to None, which uses the
                endpoint configured on this HubApi instance.

        Raises:
            ValueError: If not login.

        Note:
            model_id = {owner}/{name}
        """
        cookies = ModelScopeConfig.get_cookies()
        if not endpoint:
            endpoint = self.endpoint
        if cookies is None:
            raise ValueError('Token does not exist, please login first.')
        path = f'{endpoint}/api/v1/models/{model_id}'

        r = self.session.delete(path,
                                cookies=cookies,
                                headers=self.builder_headers(self.headers))
        raise_for_http_status(r)
        raise_on_error(r.json())

    def get_model_url(self, model_id: str, endpoint: Optional[str] = None):
        if not endpoint:
            endpoint = self.endpoint
        return f'{endpoint}/api/v1/models/{model_id}.git'

    def get_model(
            self,
            model_id: str,
            revision: Optional[str] = DEFAULT_MODEL_REVISION,
            endpoint: Optional[str] = None
    ) -> str:
        """Get model information at ModelScope

        Args:
            model_id (str): The model id.
            revision (str, optional): revision of model.
            endpoint (str, optional): The endpoint to use; defaults to None, which uses the
                endpoint configured on this HubApi instance.

        Returns:
            The model detail information.

        Raises:
            NotExistError: If the model does not exist, a NotExistError is raised.

        Note:
            model_id = {owner}/{name}
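
        Example:
            An illustrative sketch (the model id is a placeholder):

            >>> api = HubApi()
            >>> info = api.get_model('my-owner/my-model')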
        """
        cookies = ModelScopeConfig.get_cookies()
        owner_or_group, name = model_id_to_group_owner_name(model_id)
        if not endpoint:
            endpoint = self.endpoint

        if revision:
            path = f'{endpoint}/api/v1/models/{owner_or_group}/{name}?Revision={revision}'
        else:
            path = f'{endpoint}/api/v1/models/{owner_or_group}/{name}'

        r = self.session.get(path, cookies=cookies,
                             headers=self.builder_headers(self.headers))
        handle_http_response(r, logger, cookies, model_id)
        if r.status_code == HTTPStatus.OK:
            if is_ok(r.json()):
                return r.json()[API_RESPONSE_FIELD_DATA]
            else:
                raise NotExistError(r.json()[API_RESPONSE_FIELD_MESSAGE])
        else:
            raise_for_http_status(r)

    def get_endpoint_for_read(self,
                              repo_id: str,
                              *,
                              repo_type: Optional[str] = None) -> str:
        """Get the proper endpoint for a read operation (such as download, list etc.)

        1. If the user has set MODELSCOPE_DOMAIN, construct the endpoint with the user-specified domain.
           If the repo does not exist on that endpoint, throw a 404 error, otherwise return the endpoint.
        2. If the domain is not set, check the existence of the repo on cn-site and ai-site (intl version)
           respectively. The checking order is determined by MODELSCOPE_PREFER_AI_SITE.
           a. if MODELSCOPE_PREFER_AI_SITE is not set, check cn-site before ai-site (intl version)
           b. otherwise check ai-site before cn-site
           Return the endpoint on which the given repo_id exists.
           If it exists on neither, throw a 404 error.
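
        Example:
            An illustrative sketch (the repo id is a placeholder):

            >>> api = HubApi()
            >>> endpoint = api.get_endpoint_for_read('my-owner/my-model', repo_type='model')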
        """
        s = os.environ.get(MODELSCOPE_DOMAIN)
        if s is not None and s.strip() != '':
            endpoint = MODELSCOPE_URL_SCHEME + s
            try:
                self.repo_exists(repo_id=repo_id, repo_type=repo_type, endpoint=endpoint, re_raise=True)
            except Exception:
                logger.error(f'Repo {repo_id} does not exist on {endpoint}.')
                raise
            return endpoint

        check_cn_first = not is_env_true(MODELSCOPE_PREFER_AI_SITE)
        prefer_endpoint = get_endpoint(cn_site=check_cn_first)
        if not self.repo_exists(
                repo_id, repo_type=repo_type, endpoint=prefer_endpoint):
            alternative_endpoint = get_endpoint(cn_site=(not check_cn_first))
            logger.warning(f'Repo {repo_id} does not exist on {prefer_endpoint}, '
                           f'will try on alternative endpoint {alternative_endpoint}.')
            try:
                self.repo_exists(
                    repo_id, repo_type=repo_type, endpoint=alternative_endpoint, re_raise=True)
            except Exception:
                logger.error(f'Repo {repo_id} does not exist on either {prefer_endpoint} or {alternative_endpoint}')
                raise
            else:
                return alternative_endpoint
        else:
            return prefer_endpoint

    def repo_exists(
            self,
            repo_id: str,
            *,
            repo_type: Optional[str] = None,
            endpoint: Optional[str] = None,
            re_raise: Optional[bool] = False
    ) -> bool:
        """
        Checks if a repository exists on ModelScope

        Args:
            repo_id (`str`):
                A namespace (user or an organization) and a repo name separated
                by a `/`.
            repo_type (`str`, *optional*):
                `None` or `"model"` if getting repository info from a model. Default is `None`.
                TODO: support dataset and studio
            endpoint(`str`):
                None or specific endpoint to use, when None, use the default endpoint
                set in HubApi class (self.endpoint)
            re_raise(`bool`):
                Whether to raise an exception on error.
        Returns:
            True if the repository exists, False otherwise.
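
        Example:
            An illustrative sketch (the repo id is a placeholder):

            >>> api = HubApi()
            >>> exists = api.repo_exists('my-owner/my-model', repo_type='model')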
        """
        if endpoint is None:
            endpoint = self.endpoint
        if (repo_type is not None) and repo_type.lower() not in REPO_TYPE_SUPPORT:
            raise Exception('Not support repo-type: %s' % repo_type)
        if (repo_id is None) or repo_id.count('/') != 1:
            raise Exception('Invalid repo_id: %s, must be of format namespace/name' % repo_id)

        cookies = ModelScopeConfig.get_cookies()
        owner_or_group, name = model_id_to_group_owner_name(repo_id)
        if (repo_type is not None) and repo_type.lower() == REPO_TYPE_DATASET:
            path = f'{endpoint}/api/v1/datasets/{owner_or_group}/{name}'
        else:
            path = f'{endpoint}/api/v1/models/{owner_or_group}/{name}'

        r = self.session.get(path, cookies=cookies,
                             headers=self.builder_headers(self.headers))
        code = handle_http_response(r, logger, cookies, repo_id, False)
        if code == 200:
            return True
        elif code == 404:
            if re_raise:
                raise HTTPError(r)
            else:
                return False
        else:
            logger.warning(f'Check repo_exists return status code {code}.')
            raise Exception(
                'Failed to check existence of repo: %s, make sure you have access authorization.'
                % repo_id)

    def delete_repo(self, repo_id: str, repo_type: str, endpoint: Optional[str] = None):
        """
        Delete a repository from ModelScope.

        Args:
            repo_id (`str`):
                A namespace (user or an organization) and a repo name separated
                by a `/`.
            repo_type (`str`):
                The type of the repository. Supported types are `model` and `dataset`.
            endpoint(`str`):
                The endpoint to use. If not provided, the default endpoint is `https://www.modelscope.cn`.
                Could be set to `https://ai.modelscope.ai` for the international version.
        """

        if not endpoint:
            endpoint = self.endpoint

        if repo_type == REPO_TYPE_DATASET:
            self.delete_dataset(repo_id, endpoint)
        elif repo_type == REPO_TYPE_MODEL:
            self.delete_model(repo_id, endpoint)
        else:
            raise Exception(f'Arg repo_type {repo_type} not supported.')

        logger.info(f'Repo {repo_id} deleted successfully.')

    @staticmethod
    def _create_default_config(model_dir):
        cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION)
        cfg = {
            ConfigFields.framework: Frameworks.torch,
            ConfigFields.task: Tasks.other,
        }
        with open(cfg_file, 'w') as file:
            json.dump(cfg, file)

    def push_model(self,
                   model_id: str,
                   model_dir: str,
                   visibility: Optional[int] = ModelVisibility.PUBLIC,
                   license: Optional[str] = Licenses.APACHE_V2,
                   chinese_name: Optional[str] = None,
                   commit_message: Optional[str] = 'upload model',
                   tag: Optional[str] = None,
                   revision: Optional[str] = DEFAULT_REPOSITORY_REVISION,
                   original_model_id: Optional[str] = None,
                   ignore_file_pattern: Optional[Union[List[str], str]] = None,
                   lfs_suffix: Optional[Union[str, List[str]]] = None):
        warnings.warn(
            'This function is deprecated and will be removed in future versions. '
            'Please use git command directly or use HubApi().upload_folder instead',
            DeprecationWarning,
            stacklevel=2
        )
        """Upload model from a given directory to a given repository. A valid model directory
        must contain a configuration.json file.

        This function uploads the files in the given directory to the given repository. If the
        given repository does not exist remotely, it will automatically be created with the
        given visibility, license and chinese_name parameters. If the revision also does
        not exist in the remote repository, a new branch will be created for it.

        HubApi's login must be called with a valid token (which can be obtained from
        ModelScope's website) before calling this function.

        If any error occurs, please upload via git commands.

        Args:
            model_id (str):
                The model id to be uploaded, caller must have write permission for it.
            model_dir(str):
                The Absolute Path of the finetune result.
            visibility(int, optional):
                Visibility of the newly created model(1-private, 5-public). If the model does
                not exist on ModelScope, this function will create a new model with this
                visibility and this parameter is required. You can ignore this parameter
                if you are sure the model already exists.
            license(`str`, defaults to `Licenses.APACHE_V2`):
                License of the newly created model(see License). If the model does not exist
                on ModelScope, this function will create a new model with this license
                and this parameter is required. You can ignore this parameter if you
                are sure the model already exists.
            chinese_name(`str`, *optional*, defaults to `None`):
                chinese name of the newly created model.
            commit_message(`str`, *optional*, defaults to `'upload model'`):
                commit message of the push request.
            tag(`str`, *optional*, defaults to `None`):
                The tag on this commit
            revision (`str`, *optional*, default to DEFAULT_REPOSITORY_REVISION):
                which branch to push to. If the branch does not exist, a new
                branch will be created and pushed to.
            original_model_id (str, optional): The base model id which this model is trained from
            ignore_file_pattern (`Union[List[str], str]`, optional): The file pattern to ignore uploading
            lfs_suffix (`List[str]`, optional): File types to manage with LFS, e.g. '*.safetensors'.

        Raises:
            InvalidParameter: Parameter invalid.
            NotLoginException: Not login
            ValueError: No configuration.json
            Exception: Create failed.
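
        Example:
            A minimal sketch of this deprecated flow (the id, token and path are
            placeholders; prefer HubApi().upload_folder or plain git for new code):

            >>> api = HubApi()
            >>> api.login('your-sdk-access-token')
            >>> api.push_model('my-owner/my-model', '/path/to/model_dir')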
        """
        if model_id is None:
            raise InvalidParameter('model_id cannot be empty!')
        if model_dir is None:
            raise InvalidParameter('model_dir cannot be empty!')
        if not os.path.exists(model_dir) or os.path.isfile(model_dir):
            raise InvalidParameter('model_dir must be a valid directory.')
        cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION)
        if not os.path.exists(cfg_file):
            logger.warning(
                f'No {ModelFile.CONFIGURATION} file found in {model_dir}, creating a default one.')
            HubApi._create_default_config(model_dir)

        cookies = ModelScopeConfig.get_cookies()
        if cookies is None:
            raise NotLoginException('Must login before upload!')
        files_to_save = os.listdir(model_dir)
        folder_size = get_readable_folder_size(model_dir)
        if ignore_file_pattern is None:
            ignore_file_pattern = []
        if isinstance(ignore_file_pattern, str):
            ignore_file_pattern = [ignore_file_pattern]
        if visibility is None or license is None:
            raise InvalidParameter('Visibility and License cannot be empty for new model.')
        if not self.repo_exists(model_id):
            logger.info('Creating new model [%s]' % model_id)
            self.create_model(
                model_id=model_id,
                visibility=visibility,
                license=license,
                chinese_name=chinese_name,
                original_model_id=original_model_id)
        tmp_dir = os.path.join(model_dir, TEMPORARY_FOLDER_NAME)  # make temporary folder
        git_wrapper = GitCommandWrapper()
        logger.info(f'Pushing folder {model_dir} as model {model_id}.')
        logger.info(f'Total folder size {folder_size}, this may take a while depending on actual pushing size...')
        try:
            repo = Repository(model_dir=tmp_dir, clone_from=model_id)
            branches = git_wrapper.get_remote_branches(tmp_dir)
            if revision not in branches:
                logger.info('Creating new branch %s' % revision)
                git_wrapper.new_branch(tmp_dir, revision)
            git_wrapper.checkout(tmp_dir, revision)
            files_in_repo = os.listdir(tmp_dir)
            for f in files_in_repo:
                if f[0] != '.':
                    src = os.path.join(tmp_dir, f)
                    if os.path.isfile(src):
                        os.remove(src)
                    else:
                        shutil.rmtree(src, ignore_errors=True)
            for f in files_to_save:
                if f[0] != '.':
                    if any([re.search(pattern, f) is not None for pattern in ignore_file_pattern]):
                        continue
                    src = os.path.join(model_dir, f)
                    if os.path.isdir(src):
                        shutil.copytree(src, os.path.join(tmp_dir, f))
                    else:
                        shutil.copy(src, tmp_dir)
            if not commit_message:
                date = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
                commit_message = '[automsg] push model %s to hub at %s' % (
                    model_id, date)
            if lfs_suffix is not None:
                lfs_suffix_list = [lfs_suffix] if isinstance(lfs_suffix, str) else lfs_suffix
                for suffix in lfs_suffix_list:
                    repo.add_lfs_type(suffix)
            repo.push(
                commit_message=commit_message,
                local_branch=revision,
                remote_branch=revision)
            if tag is not None:
                repo.tag_and_push(tag, tag)
            logger.info(f'Successfully push folder {model_dir} to remote repo [{model_id}].')
        except Exception:
            raise
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)

    def list_models(self,
                    owner_or_group: str,
                    page_number: Optional[int] = 1,
                    page_size: Optional[int] = 10,
                    endpoint: Optional[str] = None) -> dict:
        """List models in owner or group.

        Args:
            owner_or_group(str): owner or group.
            page_number(int, optional): The page number, default: 1
            page_size(int, optional): The page size, default: 10
            endpoint (str, optional): The endpoint to use; defaults to None, which uses the
                endpoint configured on this HubApi instance.

        Raises:
            RequestError: The request error.

        Returns:
            dict: {"models": "list of models", "TotalCount": total_number_of_models_in_owner_or_group}
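
        Example:
            An illustrative paging sketch (the owner name is a placeholder):

            >>> api = HubApi()
            >>> page = api.list_models('my-owner', page_number=1, page_size=10)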
        """
        cookies = ModelScopeConfig.get_cookies()
        if not endpoint:
            endpoint = self.endpoint
        path = f'{endpoint}/api/v1/models/'
        r = self.session.put(
            path,
            data='{"Path":"%s", "PageNumber":%s, "PageSize": %s}' %
            (owner_or_group, page_number, page_size),
            cookies=cookies,
            headers=self.builder_headers(self.headers))
        handle_http_response(r, logger, cookies, owner_or_group)
        if r.status_code == HTTPStatus.OK:
            if is_ok(r.json()):
                data = r.json()[API_RESPONSE_FIELD_DATA]
                return data
            else:
                raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE])
        else:
            raise_for_http_status(r)
        return None

    def _check_cookie(self, use_cookies: Union[bool, CookieJar] = False) -> CookieJar:  # noqa
        cookies = None
        if isinstance(use_cookies, CookieJar):
            cookies = use_cookies
        elif use_cookies:
            cookies = ModelScopeConfig.get_cookies()
            if cookies is None:
                raise ValueError('Token does not exist, please login first.')
        return cookies

    def list_model_revisions(
            self,
            model_id: str,
            cutoff_timestamp: Optional[int] = None,
            use_cookies: Union[bool, CookieJar] = False) -> List[str]:
        """List the model revisions (tags).

        Args:
            model_id (str): The model id
            cutoff_timestamp (int): Tags created before the cutoff will be included.
                The timestamp is represented by the seconds elapsed from the epoch time.
            use_cookies (Union[bool, CookieJar], optional): If a CookieJar is given, we will use this cookie;
                if True, the cookie will be loaded from local storage. Defaults to False.

        Returns:
            List[str]: The list of tag revisions.
        """
        tags_details = self.list_model_revisions_detail(model_id=model_id,
                                                        cutoff_timestamp=cutoff_timestamp,
                                                        use_cookies=use_cookies)
        tags = [x['Revision'] for x in tags_details
                ] if tags_details else []
        return tags

    def list_model_revisions_detail(
            self,
            model_id: str,
            cutoff_timestamp: Optional[int] = None,
            use_cookies: Union[bool, CookieJar] = False,
            endpoint: Optional[str] = None) -> List[str]:
        """List the model revisions (tags) with details.

        Args:
            model_id (str): The model id
            cutoff_timestamp (int): Tags created before the cutoff will be included.
                The timestamp is represented by the seconds elapsed from the epoch time.
            use_cookies (Union[bool, CookieJar], optional): If a CookieJar is given, we will use this cookie;
                if True, the cookie will be loaded from local storage. Defaults to False.
            endpoint (str, optional): The endpoint to use; defaults to None, which uses the
                endpoint configured on this HubApi instance.

        Returns:
            The list of tag details (each entry contains a 'Revision' field).
        """
        cookies = self._check_cookie(use_cookies)
        if cutoff_timestamp is None:
            cutoff_timestamp = get_release_datetime()
        if not endpoint:
            endpoint = self.endpoint
        path = f'{endpoint}/api/v1/models/{model_id}/revisions?EndTime=%s' % cutoff_timestamp
        r = self.session.get(path, cookies=cookies,
                             headers=self.builder_headers(self.headers))
        handle_http_response(r, logger, cookies, model_id)
        d = r.json()
        raise_on_error(d)
        info = d[API_RESPONSE_FIELD_DATA]
        # tags returned from backend are guaranteed to be ordered by create-time
        return info['RevisionMap']['Tags']

    def get_branch_tag_detail(self, details, name):
        for item in details:
            if item['Revision'] == name:
                return item
        return None

    def get_valid_revision_detail(self,
                                  model_id: str,
                                  revision=None,
                                  cookies: Optional[CookieJar] = None,
                                  endpoint: Optional[str] = None):
        if not endpoint:
            endpoint = self.endpoint
        release_timestamp = get_release_datetime()
        current_timestamp = int(round(datetime.datetime.now().timestamp()))
        # for active development in library codes (non-release-branches), release_timestamp
        # is set to be a far-away-time-in-the-future, to ensure that we shall
        # get the master-HEAD version from model repo by default (when no revision is provided)
        all_branches_detail, all_tags_detail = self.get_model_branches_and_tags_details(
            model_id, use_cookies=False if cookies is None else cookies, endpoint=endpoint)
        all_branches = [x['Revision'] for x in all_branches_detail] if all_branches_detail else []
        all_tags = [x['Revision'] for x in all_tags_detail] if all_tags_detail else []
        if release_timestamp > current_timestamp + ONE_YEAR_SECONDS:
            if revision is None:
                revision = MASTER_MODEL_BRANCH
                logger.info(
                    'Model revision not specified, using default [%s] version.'
                    % revision)
            if revision not in all_branches and revision not in all_tags:
                raise NotExistError('The model: %s has no revision : %s .' % (model_id, revision))

            revision_detail = self.get_branch_tag_detail(all_tags_detail, revision)
            if revision_detail is None:
                revision_detail = self.get_branch_tag_detail(all_branches_detail, revision)
            logger.debug('Development mode use revision: %s' % revision)
        else:
            if revision is not None and revision in all_branches:
                revision_detail = self.get_branch_tag_detail(all_branches_detail, revision)
                logger.warning('Using branch: %s as version is unstable, use with caution' % revision)
                return revision_detail

            if len(all_tags_detail) == 0:  # use no revision use master as default.
                if revision is None or revision == MASTER_MODEL_BRANCH:
                    revision = MASTER_MODEL_BRANCH
                else:
                    raise NotExistError('The model: %s has no revision: %s !' % (model_id, revision))
                revision_detail = self.get_branch_tag_detail(all_branches_detail, revision)
            else:
                if revision is None:  # user not specified revision, use latest revision before release time
                    revisions_detail = [x for x in
                                        all_tags_detail if
                                        x['CreatedAt'] <= release_timestamp] if all_tags_detail else []  # noqa E501
                    if len(revisions_detail) > 0:
                        revision = revisions_detail[0]['Revision']  # use latest revision before release time.
                        revision_detail = revisions_detail[0]
                    else:
                        revision = MASTER_MODEL_BRANCH
                        revision_detail = self.get_branch_tag_detail(all_branches_detail, revision)
                        vl = '[%s]' % ','.join(all_tags)
                        logger.warning('Model revision should be specified from revisions: %s' % (vl))
                    logger.warning('Model revision not specified, use revision: %s' % revision)
                else:
                    # use user-specified revision
                    if revision not in all_tags:
                        if revision == MASTER_MODEL_BRANCH:
                            logger.warning('Using the master branch is fragile, please use it with caution!')
                            revision_detail = self.get_branch_tag_detail(all_branches_detail, revision)
                        else:
                            vl = '[%s]' % ','.join(all_tags)
                            raise NotExistError('The model: %s has no revision: %s valid are: %s!' %
                                                (model_id, revision, vl))
                    else:
                        revision_detail = self.get_branch_tag_detail(all_tags_detail, revision)
                    logger.info('Use user-specified model revision: %s' % revision)
        return revision_detail

    def get_valid_revision(self,
                           model_id: str,
                           revision=None,
                           cookies: Optional[CookieJar] = None,
                           endpoint: Optional[str] = None):
        return self.get_valid_revision_detail(model_id=model_id,
                                              revision=revision,
                                              cookies=cookies,
                                              endpoint=endpoint)['Revision']

    def get_model_branches_and_tags_details(
            self,
            model_id: str,
            use_cookies: Union[bool, CookieJar] = False,
            endpoint: Optional[str] = None
    ) -> Tuple[List[str], List[str]]:
        """Get model branches and tags details.

        Args:
            model_id (str): The model id
            use_cookies (Union[bool, CookieJar], optional): If a CookieJar is given, we will use this cookie;
                if True, the cookie will be loaded from local storage. Defaults to False.
            endpoint (str, optional): The endpoint to use; defaults to None, which uses the
                endpoint configured on this HubApi instance.

        Returns:
            Tuple[List[str], List[str]]: The list of branch details and the list of tag details.
        """
        cookies = self._check_cookie(use_cookies)
        if not endpoint:
            endpoint = self.endpoint
        path = f'{endpoint}/api/v1/models/{model_id}/revisions'
        r = self.session.get(path, cookies=cookies,
                             headers=self.builder_headers(self.headers))
        handle_http_response(r, logger, cookies, model_id)
        d = r.json()
        raise_on_error(d)
        info = d[API_RESPONSE_FIELD_DATA]
        return info['RevisionMap']['Branches'], info['RevisionMap']['Tags']

    def get_model_branches_and_tags(
            self,
            model_id: str,
            use_cookies: Union[bool, CookieJar] = False,
    ) -> Tuple[List[str], List[str]]:
        """Get model branches and tags.

        Args:
            model_id (str): The model id
            use_cookies (Union[bool, CookieJar], optional): If a CookieJar is given, we will use this cookie;
                if True, the cookie will be loaded from local storage. Defaults to False.

        Returns:
            Tuple[List[str], List[str]]: The list of branch names and the list of tags.
        """
        branches_detail, tags_detail = self.get_model_branches_and_tags_details(model_id=model_id,
                                                                                use_cookies=use_cookies)
        branches = [x['Revision'] for x in branches_detail
                    ] if branches_detail else []
        tags = [x['Revision'] for x in tags_detail
                ] if tags_detail else []
        return branches, tags

    def get_model_files(self,
                        model_id: str,
                        revision: Optional[str] = DEFAULT_MODEL_REVISION,
                        root: Optional[str] = None,
                        recursive: Optional[str] = False,
                        use_cookies: Union[bool, CookieJar] = False,
                        headers: Optional[dict] = {},
                        endpoint: Optional[str] = None) -> List[dict]:
        """List the model's files.

        Args:
            model_id (str): The model id
            revision (Optional[str], optional): The branch or tag name.
            root (Optional[str], optional): The root path. Defaults to None.
            recursive (Optional[str], optional): Whether to list files recursively. Defaults to False.
            use_cookies (Union[bool, CookieJar], optional): If a CookieJar is given, we will use this cookie;
                if True, the cookie will be loaded from local storage. Defaults to False.
            headers: request headers
            endpoint (str, optional): The endpoint to use; defaults to None, which uses the
                endpoint configured on this HubApi instance.

        Returns:
            List[dict]: Model file list.
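
        Example:
            An illustrative sketch (the model id is a placeholder):

            >>> api = HubApi()
            >>> files = api.get_model_files('my-owner/my-model', recursive=True)
            >>> paths = [f['Path'] for f in files]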
        """
        if not endpoint:
            endpoint = self.endpoint
        if revision:
            path = '%s/api/v1/models/%s/repo/files?Revision=%s&Recursive=%s' % (
                endpoint, model_id, revision, recursive)
        else:
            path = '%s/api/v1/models/%s/repo/files?Recursive=%s' % (
                endpoint, model_id, recursive)
        cookies = self._check_cookie(use_cookies)
        if root is not None:
            path = path + f'&Root={root}'
        headers = self.headers if headers is None else headers
        headers['X-Request-ID'] = str(uuid.uuid4().hex)
        r = self.session.get(
            path, cookies=cookies, headers=headers)

        handle_http_response(r, logger, cookies, model_id)
        d = r.json()
        raise_on_error(d)

        files = []
        for file in d[API_RESPONSE_FIELD_DATA]['Files']:
            if file['Name'] == '.gitignore' or file['Name'] == '.gitattributes':
                continue

            files.append(file)
        return files

    def file_exists(
            self,
            repo_id: str,
            filename: str,
            *,
            revision: Optional[str] = None,
    ):
        """Check whether the specified file exists.

        Args:
            repo_id (`str`): The repo id to use
            filename (`str`): The queried filename; if the file exists in a sub folder,
                please pass <sub-folder-name>/<file-name>
            revision (`Optional[str]`): The repo revision
        Returns:
            The query result in bool value
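
        Example:
            An illustrative sketch (the repo id and filename are placeholders):

            >>> api = HubApi()
            >>> has_config = api.file_exists('my-owner/my-model', 'configuration.json')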
        """
        files = self.get_model_files(repo_id, recursive=True, revision=revision)
        files = [file['Path'] for file in files]
        return filename in files

    def create_dataset(self,
                       dataset_name: str,
                       namespace: str,
                       chinese_name: Optional[str] = '',
                       license: Optional[str] = Licenses.APACHE_V2,
                       visibility: Optional[int] = DatasetVisibility.PUBLIC,
                       description: Optional[str] = '',
                       endpoint: Optional[str] = None, ) -> str:

        if dataset_name is None or namespace is None:
            raise InvalidParameter('dataset_name and namespace are required!')

        cookies = ModelScopeConfig.get_cookies()
        if cookies is None:
            raise ValueError('Token does not exist, please login first.')
        if not endpoint:
            endpoint = self.endpoint
        path = f'{endpoint}/api/v1/datasets'
        files = {
            'Name': (None, dataset_name),
            'ChineseName': (None, chinese_name),
            'Owner': (None, namespace),
            'License': (None, license),
            'Visibility': (None, visibility),
            'Description': (None, description)
        }

        r = self.session.post(
            path,
            files=files,
            cookies=cookies,
            headers=self.builder_headers(self.headers),
        )

        handle_http_post_error(r, path, files)
        raise_on_error(r.json())
        dataset_repo_url = f'{endpoint}/datasets/{namespace}/{dataset_name}'
        logger.info(f'Create dataset success: {dataset_repo_url}')
        return dataset_repo_url

    def list_datasets(self, endpoint: Optional[str] = None):
        if not endpoint:
            endpoint = self.endpoint
        path = f'{endpoint}/api/v1/datasets'
        params = {}
        r = self.session.get(path, params=params,
                             headers=self.builder_headers(self.headers))
        raise_for_http_status(r)
        dataset_list = r.json()[API_RESPONSE_FIELD_DATA]
        return [x['Name'] for x in dataset_list]

    def delete_dataset(self, dataset_id: str, endpoint: Optional[str] = None):

        cookies = ModelScopeConfig.get_cookies()
        if not endpoint:
            endpoint = self.endpoint
        if cookies is None:
            raise ValueError('Token does not exist, please login first.')

        path = f'{endpoint}/api/v1/datasets/{dataset_id}'
        r = self.session.delete(path,
                                cookies=cookies,
                                headers=self.builder_headers(self.headers))
        raise_for_http_status(r)
        raise_on_error(r.json())

    def get_dataset_id_and_type(self, dataset_name: str, namespace: str, endpoint: Optional[str] = None):
        """ Get the dataset id and type. """
        if not endpoint:
            endpoint = self.endpoint
        datahub_url = f'{endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
        cookies = ModelScopeConfig.get_cookies()
        r = self.session.get(datahub_url, cookies=cookies)
        resp = r.json()
        datahub_raise_on_error(datahub_url, resp, r)
        dataset_id = resp['Data']['Id']
        dataset_type = resp['Data']['Type']
        return dataset_id, dataset_type

    def get_dataset_infos(self,
                          dataset_hub_id: str,
                          revision: str,
                          files_metadata: bool = False,
                          timeout: float = 100,
                          recursive: str = 'True',
                          endpoint: Optional[str] = None):
        """
        Get dataset infos.
        """
        if not endpoint:
            endpoint = self.endpoint
        datahub_url = f'{endpoint}/api/v1/datasets/{dataset_hub_id}/repo/tree'
        params = {'Revision': revision, 'Root': None, 'Recursive': recursive}
        cookies = ModelScopeConfig.get_cookies()
        if files_metadata:
            params['blobs'] = True
        r = self.session.get(datahub_url, params=params, cookies=cookies, timeout=timeout)
        resp = r.json()
        datahub_raise_on_error(datahub_url, resp, r)

        return resp

    def list_repo_tree(self,
                       dataset_name: str,
                       namespace: str,
                       revision: str,
                       root_path: str,
                       recursive: bool = True,
                       page_number: int = 1,
                       page_size: int = 100,
                       endpoint: Optional[str] = None):

        dataset_hub_id, dataset_type = self.get_dataset_id_and_type(
            dataset_name=dataset_name, namespace=namespace, endpoint=endpoint)

        recursive = 'True' if recursive else 'False'
        if not endpoint:
            endpoint = self.endpoint
        datahub_url = f'{endpoint}/api/v1/datasets/{dataset_hub_id}/repo/tree'
        params = {'Revision': revision if revision else 'master',
                  'Root': root_path if root_path else '/', 'Recursive': recursive,
                  'PageNumber': page_number, 'PageSize': page_size}
        cookies = ModelScopeConfig.get_cookies()

        r = self.session.get(datahub_url, params=params, cookies=cookies)
        resp = r.json()
        datahub_raise_on_error(datahub_url, resp, r)

        return resp

    def get_dataset_meta_file_list(self, dataset_name: str, namespace: str,
                                   dataset_id: str, revision: str, endpoint: Optional[str] = None):
        """ Get the meta file-list of the dataset. """
        if not endpoint:
            endpoint = self.endpoint
        datahub_url = f'{endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}'
        cookies = ModelScopeConfig.get_cookies()
        r = self.session.get(datahub_url,
                             cookies=cookies,
                             headers=self.builder_headers(self.headers))
        resp = r.json()
        datahub_raise_on_error(datahub_url, resp, r)
        file_list = resp['Data']
        if file_list is None:
            raise NotExistError(
                f'The modelscope dataset [dataset_name = {dataset_name}, namespace = {namespace}, '
                f'version = {revision}] does not exist')

        file_list = file_list['Files']
        return file_list

    @staticmethod
    def dump_datatype_file(dataset_type: int, meta_cache_dir: str):
        """
        Dump the data_type as a local file, in order to get the dataset
        formation without calling the datahub.
        For more details, please refer to the class
        `modelscope.utils.constant.DatasetFormations`.
        """
        dataset_type_file_path = os.path.join(meta_cache_dir,
                                              f'{str(dataset_type)}{DatasetFormations.formation_mark_ext.value}')
        with open(dataset_type_file_path, 'w') as fp:
            fp.write('*** Automatically-generated file, do not modify ***')

    def get_dataset_meta_files_local_paths(self, dataset_name: str,
                                           namespace: str,
                                           revision: str,
                                           meta_cache_dir: str, dataset_type: int, file_list: list,
                                           endpoint: Optional[str] = None):
        local_paths = defaultdict(list)
        dataset_formation = DatasetFormations(dataset_type)
        dataset_meta_format = DatasetMetaFormats[dataset_formation]
        cookies = ModelScopeConfig.get_cookies()

        # Dump the data_type as a local file
        HubApi.dump_datatype_file(dataset_type=dataset_type, meta_cache_dir=meta_cache_dir)
        if not endpoint:
            endpoint = self.endpoint
        for file_info in file_list:
            file_path = file_info['Path']
            extension = os.path.splitext(file_path)[-1]
            if extension in dataset_meta_format:
                datahub_url = f'{endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
                              f'Revision={revision}&FilePath={file_path}'
                r = self.session.get(datahub_url, cookies=cookies)
                raise_for_http_status(r)
                local_path = os.path.join(meta_cache_dir, file_path)
                if os.path.exists(local_path):
                    logger.warning(
                        f"Reusing dataset {dataset_name}'s python file ({local_path})"
                    )
                    local_paths[extension].append(local_path)
                    continue
                with open(local_path, 'wb') as f:
                    f.write(r.content)
                local_paths[extension].append(local_path)

        return local_paths, dataset_formation
|
2022-07-29 12:22:48 +08:00
|
|
|
|
2023-05-24 19:48:20 +08:00
|
|
|

    @staticmethod
    def fetch_meta_files_from_url(url, out_path, chunk_size=1024, mode=DownloadMode.REUSE_DATASET_IF_EXISTS):
        """
        Fetch the meta-data files from the url, e.g. csv/jsonl files.
        """
        import hashlib
        from tqdm.auto import tqdm
        import pandas as pd

        out_path = os.path.join(out_path, hashlib.md5(url.encode(encoding='UTF-8')).hexdigest())
        if mode == DownloadMode.FORCE_REDOWNLOAD and os.path.exists(out_path):
            os.remove(out_path)
        if os.path.exists(out_path):
            logger.info(f'Reusing cached meta-data file: {out_path}')
            return out_path
        cookies = ModelScopeConfig.get_cookies()

        # Make the request and stream the response content
        logger.info('Loading meta-data file ...')
        response = requests.get(url, cookies=cookies, stream=True)
        total_size = int(response.headers.get('content-length', 0))
        progress = tqdm(total=total_size, dynamic_ncols=True)

        def get_chunk(resp):
            chunk_data = []
            for data in resp.iter_lines():
                data = data.decode('utf-8')
                chunk_data.append(data)
                if len(chunk_data) >= chunk_size:
                    yield chunk_data
                    chunk_data = []
            yield chunk_data

        iter_num = 0
        with open(out_path, 'a') as f:
            for chunk in get_chunk(response):
                progress.update(len(chunk))
                if url.endswith('jsonl'):
                    chunk = [json.loads(line) for line in chunk if line.strip()]
                    if len(chunk) == 0:
                        continue
                    if iter_num == 0:
                        with_header = True
                    else:
                        with_header = False
                    chunk_df = pd.DataFrame(chunk)
                    chunk_df.to_csv(f, index=False, header=with_header, escapechar='\\')
                    iter_num += 1
                else:
                    # csv or others
                    for line in chunk:
                        f.write(line + '\n')
            progress.close()

        return out_path

    def get_dataset_file_url(
            self,
            file_name: str,
            dataset_name: str,
            namespace: str,
            revision: Optional[str] = DEFAULT_DATASET_REVISION,
            view: Optional[bool] = False,
            extension_filter: Optional[bool] = True,
            endpoint: Optional[str] = None):
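        """Build the URL used to download a single file of a dataset repository.

        The returned value has the form
        `{endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?...` with the
        Source/Revision/FilePath/View query parameters encoded. A minimal sketch
        (file and repo names are placeholders):

            >>> api = HubApi()
            >>> url = api.get_dataset_file_url(file_name='train.csv',
            ...                                dataset_name='my_dataset',
            ...                                namespace='my_org')
        """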
        if not file_name or not dataset_name or not namespace:
            raise ValueError('Args (file_name, dataset_name, namespace) cannot be empty!')

        # Note: make sure the FilePath is the last parameter in the url
        params: dict = {'Source': 'SDK', 'Revision': revision, 'FilePath': file_name, 'View': view}
        params: str = urlencode(params)
        if not endpoint:
            endpoint = self.endpoint
        file_url = f'{endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?{params}'

        return file_url

        # if extension_filter:
        #     if os.path.splitext(file_name)[-1] in META_FILES_FORMAT:
        #         file_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?'\
        #                    f'Revision={revision}&FilePath={file_name}'
        #     else:
        #         file_url = file_name
        #     return file_url
        # else:
        #     return file_url

    def get_dataset_file_url_origin(
            self,
            file_name: str,
            dataset_name: str,
            namespace: str,
            revision: Optional[str] = DEFAULT_DATASET_REVISION,
            endpoint: Optional[str] = None):
        if not endpoint:
            endpoint = self.endpoint
        if file_name and os.path.splitext(file_name)[-1] in META_FILES_FORMAT:
            file_name = f'{endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
                        f'Revision={revision}&FilePath={file_name}'
        return file_name

    def get_dataset_access_config(
            self,
            dataset_name: str,
            namespace: str,
            revision: Optional[str] = DEFAULT_DATASET_REVISION,
            endpoint: Optional[str] = None):
        if not endpoint:
            endpoint = self.endpoint
        datahub_url = f'{endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
                      f'ststoken?Revision={revision}'
        return self.datahub_remote_call(datahub_url)

    def get_dataset_access_config_session(
            self,
            dataset_name: str,
            namespace: str,
            check_cookie: bool,
            revision: Optional[str] = DEFAULT_DATASET_REVISION,
            endpoint: Optional[str] = None):

        if not endpoint:
            endpoint = self.endpoint
        datahub_url = f'{endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
                      f'ststoken?Revision={revision}'
        if check_cookie:
            cookies = self._check_cookie(use_cookies=True)
        else:
            cookies = ModelScopeConfig.get_cookies()

        r = self.session.get(
            url=datahub_url,
            cookies=cookies,
            headers=self.builder_headers(self.headers))
        resp = r.json()
        raise_on_error(resp)
        return resp['Data']

    def get_virgo_meta(self, dataset_id: str, version: int = 1) -> dict:
        """
        Get virgo dataset meta info.
        """
        virgo_endpoint = os.environ.get(VirgoDatasetConfig.env_virgo_endpoint, '')
        if not virgo_endpoint:
            raise RuntimeError(f'Virgo endpoint is not set in env: {VirgoDatasetConfig.env_virgo_endpoint}')

        virgo_dataset_url = f'{virgo_endpoint}/data/set/download'
        cookies = requests.utils.dict_from_cookiejar(ModelScopeConfig.get_cookies())

        dataset_info = dict(
            dataSetId=dataset_id,
            dataSetVersion=version
        )
        data = dict(
            data=dataset_info,
        )
        r = self.session.post(url=virgo_dataset_url,
                              json=data,
                              cookies=cookies,
                              headers=self.builder_headers(self.headers),
                              timeout=900)
        resp = r.json()
        if resp['code'] != 0:
            raise RuntimeError(f'Failed to get virgo dataset: {resp}')

        return resp['data']

    def get_dataset_access_config_for_unzipped(self,
                                               dataset_name: str,
                                               namespace: str,
                                               revision: str,
                                               zip_file_name: str,
                                               endpoint: Optional[str] = None):
        if not endpoint:
            endpoint = self.endpoint
        datahub_url = f'{endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
        cookies = ModelScopeConfig.get_cookies()
        r = self.session.get(url=datahub_url, cookies=cookies,
                             headers=self.builder_headers(self.headers))
        resp = r.json()
        # get visibility of the dataset
        raise_on_error(resp)
        data = resp['Data']
        visibility = VisibilityMap.get(data['Visibility'])

        datahub_sts_url = f'{datahub_url}/ststoken?Revision={revision}'
        r_sts = self.session.get(url=datahub_sts_url, cookies=cookies,
                                 headers=self.builder_headers(self.headers))
        resp_sts = r_sts.json()
        raise_on_error(resp_sts)
        data_sts = resp_sts['Data']
        file_dir = visibility + '-unzipped' + '/' + namespace + '_' + dataset_name + '_' + zip_file_name
        data_sts['Dir'] = file_dir
        return data_sts

    def list_oss_dataset_objects(self, dataset_name, namespace, max_limit,
                                 is_recursive, is_filter_dir, revision, endpoint: Optional[str] = None):
        if not endpoint:
            endpoint = self.endpoint
        url = f'{endpoint}/api/v1/datasets/{namespace}/{dataset_name}/oss/tree/?' \
              f'MaxLimit={max_limit}&Revision={revision}&Recursive={is_recursive}&FilterDir={is_filter_dir}'

        cookies = ModelScopeConfig.get_cookies()
        resp = self.session.get(url=url, cookies=cookies, timeout=1800)
        resp = resp.json()
        raise_on_error(resp)
        resp = resp['Data']
        return resp

    def delete_oss_dataset_object(self, object_name: str, dataset_name: str,
                                  namespace: str, revision: str, endpoint: Optional[str] = None) -> str:
        if not object_name or not dataset_name or not namespace or not revision:
            raise ValueError('Args cannot be empty!')
        if not endpoint:
            endpoint = self.endpoint
        url = f'{endpoint}/api/v1/datasets/{namespace}/{dataset_name}/oss?Path={object_name}&Revision={revision}'

        cookies = ModelScopeConfig.get_cookies()
        resp = self.session.delete(url=url, cookies=cookies)
        resp = resp.json()
        raise_on_error(resp)
        resp = resp['Message']
        return resp

    def delete_oss_dataset_dir(self, object_name: str, dataset_name: str,
                               namespace: str, revision: str, endpoint: Optional[str] = None) -> str:
        if not object_name or not dataset_name or not namespace or not revision:
            raise ValueError('Args cannot be empty!')
        if not endpoint:
            endpoint = self.endpoint
        url = f'{endpoint}/api/v1/datasets/{namespace}/{dataset_name}/oss/prefix?Prefix={object_name}/' \
              f'&Revision={revision}'

        cookies = ModelScopeConfig.get_cookies()
        resp = self.session.delete(url=url, cookies=cookies)
        resp = resp.json()
        raise_on_error(resp)
        resp = resp['Message']
        return resp

    def datahub_remote_call(self, url):
        cookies = ModelScopeConfig.get_cookies()
        r = self.session.get(
            url,
            cookies=cookies,
            headers={'user-agent': ModelScopeConfig.get_user_agent()})
        resp = r.json()
        datahub_raise_on_error(url, resp, r)
        return resp['Data']

    def dataset_download_statistics(self, dataset_name: str, namespace: str,
                                    use_streaming: bool = False, endpoint: Optional[str] = None) -> None:
        is_ci_test = os.getenv('CI_TEST') == 'True'
        if not endpoint:
            endpoint = self.endpoint
        if dataset_name and namespace and not is_ci_test and not use_streaming:
            try:
                cookies = ModelScopeConfig.get_cookies()

                # Download count
                download_count_url = f'{endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/increase'
                download_count_resp = self.session.post(download_count_url, cookies=cookies,
                                                        headers=self.builder_headers(self.headers))
                raise_for_http_status(download_count_resp)

                # Download uv
                channel = DownloadChannel.LOCAL.value
                user_name = ''
                if MODELSCOPE_CLOUD_ENVIRONMENT in os.environ:
                    channel = os.environ[MODELSCOPE_CLOUD_ENVIRONMENT]
                if MODELSCOPE_CLOUD_USERNAME in os.environ:
                    user_name = os.environ[MODELSCOPE_CLOUD_USERNAME]
                download_uv_url = f'{endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/uv/' \
                                  f'{channel}?user={user_name}'
                download_uv_resp = self.session.post(download_uv_url, cookies=cookies,
                                                     headers=self.builder_headers(self.headers))
                download_uv_resp = download_uv_resp.json()
                raise_on_error(download_uv_resp)

            except Exception as e:
                logger.error(e)

    def builder_headers(self, headers):
        return {MODELSCOPE_REQUEST_ID: str(uuid.uuid4().hex),
                **headers}

    def get_file_base_path(self, repo_id: str, endpoint: Optional[str] = None) -> str:
        _namespace, _dataset_name = repo_id.split('/')
        if not endpoint:
            endpoint = self.endpoint
        return f'{endpoint}/api/v1/datasets/{_namespace}/{_dataset_name}/repo?'
        # return f'{endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?Revision={revision}&FilePath='

    def create_repo(
            self,
            repo_id: str,
            *,
            token: Union[str, bool, None] = None,
            visibility: Optional[str] = Visibility.PUBLIC,
            repo_type: Optional[str] = REPO_TYPE_MODEL,
            chinese_name: Optional[str] = '',
            license: Optional[str] = Licenses.APACHE_V2,
            endpoint: Optional[str] = None,
            exist_ok: Optional[bool] = False,
            **kwargs,
    ) -> str:
        """
        Create a repository on the ModelScope Hub.

        Args:
            repo_id (str): The repo id in the format of `owner_name/repo_name`.
            token (Union[str, bool, None]): The access token.
            visibility (Optional[str]): The visibility of the repo,
                can be `public`, `private` or `internal`; defaults to `public`.
            repo_type (Optional[str]): The repo type, defaults to `model`.
            chinese_name (Optional[str]): The Chinese name of the repo.
            license (Optional[str]): The license of the repo, defaults to `apache-2.0`.
            endpoint (Optional[str]): The endpoint to use,
                in the format of `https://www.modelscope.cn` or `https://www.modelscope.ai`.
            exist_ok (Optional[bool]): If the repo already exists, return its url instead of raising.
            **kwargs: The additional arguments.

        Returns:
            str: The repo url.
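
        Example:
            A minimal sketch (the repo id below is a placeholder):

            >>> from modelscope.hub.api import HubApi
            >>> api = HubApi()
            >>> url = api.create_repo('my_org/my_model', visibility='public',
            ...                       repo_type='model', exist_ok=True)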
"""
|
2025-01-14 23:59:53 +08:00
|
|
|
|
|
|
|
|
if not repo_id:
|
|
|
|
|
raise ValueError('Repo id cannot be empty!')
|
2025-03-04 12:34:11 +08:00
|
|
|
if not endpoint:
|
|
|
|
|
endpoint = self.endpoint
|
2025-04-07 10:24:26 +08:00
|
|
|
|
|
|
|
|
repo_exists: bool = self.repo_exists(repo_id, repo_type=repo_type, endpoint=endpoint)
|
|
|
|
|
if repo_exists:
|
|
|
|
|
if exist_ok:
|
|
|
|
|
return f'{endpoint}/{repo_type}s/{repo_id}'
|
|
|
|
|
else:
|
|
|
|
|
raise ValueError(f'Repo {repo_id} already exists!')
|
|
|
|
|
|
|
|
|
|
self.login(access_token=token, endpoint=endpoint)
|
2025-01-14 23:59:53 +08:00
|
|
|
|
|
|
|
|
repo_id_list = repo_id.split('/')
|
|
|
|
|
if len(repo_id_list) != 2:
|
|
|
|
|
raise ValueError('Invalid repo id, should be in the format of `owner_name/repo_name`')
|
|
|
|
|
namespace, repo_name = repo_id_list
|
|
|
|
|
|
|
|
|
|
if repo_type == REPO_TYPE_MODEL:
|
|
|
|
|
visibilities = {k: v for k, v in ModelVisibility.__dict__.items() if not k.startswith('__')}
|
|
|
|
|
visibility: int = visibilities.get(visibility.upper())
|
|
|
|
|
if visibility is None:
|
|
|
|
|
raise ValueError(f'Invalid visibility: {visibility}, '
|
|
|
|
|
f'supported visibilities: `public`, `private`, `internal`')
|
2025-04-07 10:24:26 +08:00
|
|
|
repo_url: str = self.create_model(
|
|
|
|
|
model_id=repo_id,
|
|
|
|
|
visibility=visibility,
|
|
|
|
|
license=license,
|
|
|
|
|
chinese_name=chinese_name,
|
|
|
|
|
)
|
|
|
|
|
with tempfile.TemporaryDirectory() as temp_cache_dir:
|
|
|
|
|
from modelscope.hub.repository import Repository
|
|
|
|
|
repo = Repository(temp_cache_dir, repo_id)
|
|
|
|
|
default_config = {
|
|
|
|
|
'framework': 'pytorch',
|
|
|
|
|
'task': 'text-generation',
|
|
|
|
|
'allow_remote': True
|
|
|
|
|
}
|
|
|
|
|
config_json = kwargs.get('config_json')
|
|
|
|
|
if not config_json:
|
|
|
|
|
config_json = {}
|
|
|
|
|
config = {**default_config, **config_json}
|
|
|
|
|
add_content_to_file(
|
|
|
|
|
repo,
|
|
|
|
|
'configuration.json', [json.dumps(config)],
|
|
|
|
|
ignore_push_error=True)
|
2025-02-06 11:09:37 +08:00
|
|
|
|
2025-01-14 23:59:53 +08:00
|
|
|
elif repo_type == REPO_TYPE_DATASET:
|
|
|
|
|
visibilities = {k: v for k, v in DatasetVisibility.__dict__.items() if not k.startswith('__')}
|
|
|
|
|
visibility: int = visibilities.get(visibility.upper())
|
|
|
|
|
if visibility is None:
|
|
|
|
|
raise ValueError(f'Invalid visibility: {visibility}, '
|
|
|
|
|
f'supported visibilities: `public`, `private`, `internal`')
|
2025-04-07 10:24:26 +08:00
|
|
|
repo_url: str = self.create_dataset(
|
|
|
|
|
dataset_name=repo_name,
|
|
|
|
|
namespace=namespace,
|
|
|
|
|
chinese_name=chinese_name,
|
|
|
|
|
license=license,
|
|
|
|
|
visibility=visibility,
|
|
|
|
|
)
|
2025-01-14 23:59:53 +08:00
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
raise ValueError(f'Invalid repo type: {repo_type}, supported repos: {REPO_TYPE_SUPPORT}')
|
|
|
|
|
|
2025-04-07 10:24:26 +08:00
|
|
|
logger.info(f'Repo created: {repo_url}')
|
|
|
|
|
|
2025-01-14 23:59:53 +08:00
|
|
|
return repo_url
|
|
|
|
|
|
|
|
|
|

    def create_commit(
            self,
            repo_id: str,
            operations: Iterable[CommitOperation],
            *,
            commit_message: str,
            commit_description: Optional[str] = None,
            token: str = None,
            repo_type: Optional[str] = None,
            revision: Optional[str] = DEFAULT_REPOSITORY_REVISION,
            endpoint: Optional[str] = None
    ) -> CommitInfo:
|
|
|
if not endpoint:
|
|
|
|
|
endpoint = self.endpoint
|
|
|
|
|
url = f'{endpoint}/api/v1/repos/{repo_type}s/{repo_id}/commit/{revision}'
|
2025-01-14 23:59:53 +08:00
|
|
|
commit_message = commit_message or f'Commit to {repo_id}'
|
|
|
|
|
commit_description = commit_description or ''
|
|
|
|
|
|
2025-02-06 18:22:29 +08:00
|
|
|
self.login(access_token=token)
|
2025-01-14 23:59:53 +08:00
|
|
|
|
|
|
|
|
# Construct payload
|
|
|
|
|
payload = self._prepare_commit_payload(
|
|
|
|
|
operations=operations,
|
|
|
|
|
commit_message=commit_message,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# POST
|
|
|
|
|
cookies = ModelScopeConfig.get_cookies()
|
|
|
|
|
if cookies is None:
|
|
|
|
|
raise ValueError('Token does not exist, please login first.')
|
|
|
|
|
response = requests.post(
|
|
|
|
|
url,
|
|
|
|
|
headers=self.builder_headers(self.headers),
|
|
|
|
|
data=json.dumps(payload),
|
|
|
|
|
cookies=cookies
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
resp = response.json()
|
|
|
|
|
|
|
|
|
|
if not resp['Success']:
|
|
|
|
|
commit_message = resp['Message']
|
|
|
|
|
logger.warning(f'{commit_message}')
|
|
|
|
|
|
|
|
|
|
return CommitInfo(
|
|
|
|
|
commit_url=url,
|
|
|
|
|
commit_message=commit_message,
|
|
|
|
|
commit_description=commit_description,
|
|
|
|
|
oid='',
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|

    def upload_file(
            self,
            *,
            path_or_fileobj: Union[str, Path, bytes, BinaryIO],
            path_in_repo: str,
            repo_id: str,
            token: Union[str, None] = None,
            repo_type: Optional[str] = REPO_TYPE_MODEL,
            commit_message: Optional[str] = None,
            commit_description: Optional[str] = None,
            buffer_size_mb: Optional[int] = 1,
            tqdm_desc: Optional[str] = '[Uploading]',
            disable_tqdm: Optional[bool] = False,
    ) -> CommitInfo:
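        """Upload a single file (a path, raw bytes or a binary file object) to a repository.

        A minimal sketch (token, repo id and paths are placeholders):

            >>> api = HubApi()
            >>> api.login(access_token='***')
            >>> api.upload_file(path_or_fileobj='/path/to/model.safetensors',
            ...                 path_in_repo='model.safetensors',
            ...                 repo_id='my_org/my_model',
            ...                 repo_type='model',
            ...                 commit_message='upload weights')
        """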
        if repo_type not in REPO_TYPE_SUPPORT:
            raise ValueError(f'Invalid repo type: {repo_type}, supported repos: {REPO_TYPE_SUPPORT}')

        if not path_or_fileobj:
            raise ValueError('Path or file object cannot be empty!')

        if isinstance(path_or_fileobj, (str, Path)):
            path_or_fileobj = os.path.abspath(os.path.expanduser(path_or_fileobj))
            path_in_repo = path_in_repo or os.path.basename(path_or_fileobj)
        else:
            # If path_or_fileobj is bytes or BinaryIO, then path_in_repo must be provided
            if not path_in_repo:
                raise ValueError('Arg `path_in_repo` cannot be empty!')

            # Read file content if path_or_fileobj is a file-like object (BinaryIO)
            # TODO: to be refined
            if isinstance(path_or_fileobj, io.BufferedIOBase):
                path_or_fileobj = path_or_fileobj.read()

        self.upload_checker.check_file(path_or_fileobj)
        self.upload_checker.check_normal_files(
            file_path_list=[path_or_fileobj],
            repo_type=repo_type,
        )

        self.login(access_token=token)

        commit_message = (
            commit_message if commit_message is not None else f'Upload {path_in_repo} to ModelScope hub'
        )

        if buffer_size_mb <= 0:
            raise ValueError('Buffer size: `buffer_size_mb` must be greater than 0')

        hash_info_d: dict = get_file_hash(
            file_path_or_obj=path_or_fileobj,
            buffer_size_mb=buffer_size_mb,
        )
        file_size: int = hash_info_d['file_size']
        file_hash: str = hash_info_d['file_hash']

        upload_res: dict = self._upload_blob(
            repo_id=repo_id,
            repo_type=repo_type,
            sha256=file_hash,
            size=file_size,
            data=path_or_fileobj,
            disable_tqdm=disable_tqdm,
            tqdm_desc=tqdm_desc,
        )

        # Construct commit info and create commit
        add_operation: CommitOperationAdd = CommitOperationAdd(
            path_in_repo=path_in_repo,
            path_or_fileobj=path_or_fileobj,
            file_hash_info=hash_info_d,
        )
        add_operation._upload_mode = 'lfs' if self.upload_checker.is_lfs(path_or_fileobj, repo_type) else 'normal'
        add_operation._is_uploaded = upload_res['is_uploaded']
        operations = [add_operation]

        print(f'Committing file to {repo_id} ...')
        commit_info: CommitInfo = self.create_commit(
            repo_id=repo_id,
            operations=operations,
            commit_message=commit_message,
            commit_description=commit_description,
            token=token,
            repo_type=repo_type,
        )

        return commit_info

    def upload_folder(
            self,
            *,
            repo_id: str,
            folder_path: Union[str, Path, List[str], List[Path]] = None,
            path_in_repo: Optional[str] = '',
            commit_message: Optional[str] = None,
            commit_description: Optional[str] = None,
            token: Union[str, None] = None,
            repo_type: Optional[str] = REPO_TYPE_MODEL,
            allow_patterns: Optional[Union[List[str], str]] = None,
            ignore_patterns: Optional[Union[List[str], str]] = None,
            max_workers: int = DEFAULT_MAX_WORKERS,
            revision: Optional[str] = DEFAULT_REPOSITORY_REVISION,
    ) -> CommitInfo:
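        """Upload a local folder (or a list of files) to a repository.

        A minimal sketch (names are placeholders):

            >>> api = HubApi()
            >>> api.upload_folder(repo_id='my_org/my_dataset',
            ...                   folder_path='/path/to/local/data',
            ...                   path_in_repo='data',
            ...                   repo_type='dataset',
            ...                   commit_message='add data',
            ...                   token='***')
        """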
        if repo_type not in REPO_TYPE_SUPPORT:
            raise ValueError(f'Invalid repo type: {repo_type}, supported repos: {REPO_TYPE_SUPPORT}')

        allow_patterns = allow_patterns if allow_patterns else None
        ignore_patterns = ignore_patterns if ignore_patterns else None

        # Ignore .git folder
        if ignore_patterns is None:
            ignore_patterns = []
        elif isinstance(ignore_patterns, str):
            ignore_patterns = [ignore_patterns]
        ignore_patterns += DEFAULT_IGNORE_PATTERNS

        self.login(access_token=token)

        commit_message = (
            commit_message if commit_message is not None else f'Upload to {repo_id} on ModelScope hub'
        )
        commit_description = commit_description or 'Uploading files'

        # Get the list of files to upload, e.g. [('data/abc.png', '/path/to/abc.png'), ...]
        prepared_repo_objects = self._prepare_upload_folder(
            folder_path_or_files=folder_path,
            path_in_repo=path_in_repo,
            allow_patterns=allow_patterns,
            ignore_patterns=ignore_patterns,
        )

        self.upload_checker.check_normal_files(
            file_path_list=[item for _, item in prepared_repo_objects],
            repo_type=repo_type,
        )

        @thread_executor(max_workers=max_workers, disable_tqdm=False)
        def _upload_items(item_pair, **kwargs):
            file_path_in_repo, file_path = item_pair

            hash_info_d: dict = get_file_hash(
                file_path_or_obj=file_path,
            )
            file_size: int = hash_info_d['file_size']
            file_hash: str = hash_info_d['file_hash']

            upload_res: dict = self._upload_blob(
                repo_id=repo_id,
                repo_type=repo_type,
                sha256=file_hash,
                size=file_size,
                data=file_path,
                disable_tqdm=False if file_size > 5 * 1024 * 1024 else True,
                tqdm_desc='[Uploading ' + file_path_in_repo + ']',
            )

            return {
                'file_path_in_repo': file_path_in_repo,
                'file_path': file_path,
                'is_uploaded': upload_res['is_uploaded'],
                'file_hash_info': hash_info_d,
            }

        uploaded_items_list = _upload_items(
            prepared_repo_objects,
            repo_id=repo_id,
            token=token,
            repo_type=repo_type,
            commit_message=commit_message,
            commit_description=commit_description,
            buffer_size_mb=1,
            disable_tqdm=False,
        )

        # Construct commit info and create commit
        operations = []

        for item_d in uploaded_items_list:
            prepared_path_in_repo: str = item_d['file_path_in_repo']
            prepared_file_path: str = item_d['file_path']
            is_uploaded: bool = item_d['is_uploaded']
            file_hash_info: dict = item_d['file_hash_info']
            opt = CommitOperationAdd(
                path_in_repo=prepared_path_in_repo,
                path_or_fileobj=prepared_file_path,
                file_hash_info=file_hash_info,
            )

            # check normal or lfs
            opt._upload_mode = 'lfs' if self.upload_checker.is_lfs(prepared_file_path, repo_type) else 'normal'
            opt._is_uploaded = is_uploaded
            operations.append(opt)

        print(f'Committing folder to {repo_id} ...')
        commit_info: CommitInfo = self.create_commit(
            repo_id=repo_id,
            operations=operations,
            commit_message=commit_message,
            commit_description=commit_description,
            token=token,
            repo_type=repo_type,
            revision=revision,
        )

        return commit_info

    def _upload_blob(
            self,
            *,
            repo_id: str,
            repo_type: str,
            sha256: str,
            size: int,
            data: Union[str, Path, bytes, BinaryIO],
            disable_tqdm: Optional[bool] = False,
            tqdm_desc: Optional[str] = '[Uploading]',
            buffer_size_mb: Optional[int] = 1,
    ) -> dict:

        res_d: dict = dict(
            url=None,
            is_uploaded=False,
            status_code=None,
            status_msg=None,
        )

        objects = [{'oid': sha256, 'size': size}]
        upload_objects = self._validate_blob(
            repo_id=repo_id,
            repo_type=repo_type,
            objects=objects,
        )

        # upload_object: {'url': 'xxx', 'oid': 'xxx'}
        upload_object = upload_objects[0] if len(upload_objects) == 1 else None

        if upload_object is None:
            logger.info(f'Blob {sha256[:8]} has already been uploaded, reusing it.')
            res_d['is_uploaded'] = True
            return res_d

        cookies = ModelScopeConfig.get_cookies()
        cookies = dict(cookies) if cookies else None
        if cookies is None:
            raise ValueError('Token does not exist, please login first.')

        self.headers.update({'Cookie': f"m_session_id={cookies['m_session_id']}"})
        headers = self.builder_headers(self.headers)

        def read_in_chunks(file_object, pbar, chunk_size=buffer_size_mb * 1024 * 1024):
            """Lazy function (generator) to read a file piece by piece."""
            while True:
                ck = file_object.read(chunk_size)
                if not ck:
                    break
                pbar.update(len(ck))
                yield ck

        with tqdm(
                total=size,
                unit='B',
                unit_scale=True,
                desc=tqdm_desc,
                disable=disable_tqdm
        ) as pbar:

            if isinstance(data, (str, Path)):
                with open(data, 'rb') as f:
                    response = requests.put(
                        upload_object['url'],
                        headers=headers,
                        data=read_in_chunks(f, pbar)
                    )

            elif isinstance(data, bytes):
                response = requests.put(
                    upload_object['url'],
                    headers=headers,
                    data=read_in_chunks(io.BytesIO(data), pbar)
                )

            elif isinstance(data, io.BufferedIOBase):
                response = requests.put(
                    upload_object['url'],
                    headers=headers,
                    data=read_in_chunks(data, pbar)
                )

            else:
                raise ValueError('Invalid data type to upload')

        resp = response.json()
        raise_on_error(resp)

        res_d['url'] = upload_object['url']
        res_d['status_code'] = resp['Code']
        res_d['status_msg'] = resp['Message']

        return res_d

    def _validate_blob(
            self,
            *,
            repo_id: str,
            repo_type: str,
            objects: List[Dict[str, Any]],
            endpoint: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """
        Check whether the blob has already been uploaded.
        True -- uploaded; False -- not uploaded.

        Args:
            repo_id (str): The repo id on ModelScope.
            repo_type (str): The repo type. `dataset`, `model`, etc.
            objects (List[Dict[str, Any]]): The objects to check.
                oid (str): The sha256 hash value.
                size (int): The size of the blob.
            endpoint (Optional[str]): The endpoint to use; defaults to None,
                which means the endpoint configured on this class is used.

        Returns:
            List[Dict[str, Any]]: The result of the check.
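
        Example:
            The request body sent to the LFS batch endpoint has the shape
            (values are illustrative):

                {'operation': 'upload',
                 'objects': [{'oid': '<sha256>', 'size': 12345}]}

            and each entry of the returned list looks like
            `{'url': '<pre-signed upload url>', 'oid': '<sha256>'}`.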
"""
|
|
|
|
|
|
|
|
|
|
# construct URL
|
2025-03-04 12:34:11 +08:00
|
|
|
if not endpoint:
|
|
|
|
|
endpoint = self.endpoint
|
|
|
|
|
url = f'{endpoint}/api/v1/repos/{repo_type}s/{repo_id}/info/lfs/objects/batch'
|
2025-01-14 23:59:53 +08:00
|
|
|
|
|
|
|
|
# build payload
|
|
|
|
|
payload = {
|
|
|
|
|
'operation': 'upload',
|
|
|
|
|
'objects': objects,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
cookies = ModelScopeConfig.get_cookies()
|
|
|
|
|
if cookies is None:
|
|
|
|
|
raise ValueError('Token does not exist, please login first.')
|
|
|
|
|
response = requests.post(
|
|
|
|
|
url,
|
|
|
|
|
headers=self.builder_headers(self.headers),
|
|
|
|
|
data=json.dumps(payload),
|
|
|
|
|
cookies=cookies
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
resp = response.json()
|
|
|
|
|
raise_on_error(resp)
|
|
|
|
|
|
2025-02-06 18:22:29 +08:00
|
|
|
upload_objects = [] # list of objects to upload, [{'url': 'xxx', 'oid': 'xxx'}, ...]
|
2025-01-14 23:59:53 +08:00
|
|
|
resp_objects = resp['Data']['objects']
|
|
|
|
|
for obj in resp_objects:
|
|
|
|
|
upload_objects.append(
|
|
|
|
|
{'url': obj['actions']['upload']['href'],
|
|
|
|
|
'oid': obj['oid']}
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
return upload_objects
|
|
|
|
|
|
|
|
|
|

    def _prepare_upload_folder(
            self,
            folder_path_or_files: Union[str, Path, List[str], List[Path]],
            path_in_repo: str,
            allow_patterns: Optional[Union[List[str], str]] = None,
            ignore_patterns: Optional[Union[List[str], str]] = None,
    ) -> List[Union[tuple, list]]:
        folder_path = None
        files_path = None
        if isinstance(folder_path_or_files, list):
            if os.path.isfile(folder_path_or_files[0]):
                files_path = folder_path_or_files
            else:
                raise ValueError('Uploading multiple folders is not supported now.')
        else:
            if os.path.isfile(folder_path_or_files):
                files_path = [folder_path_or_files]
            else:
                folder_path = folder_path_or_files

        if files_path is None:
            self.upload_checker.check_folder(folder_path)
            folder_path = Path(folder_path).expanduser().resolve()
            if not folder_path.is_dir():
                raise ValueError(f"Provided path: '{folder_path}' is not a directory")

            # List files from folder
            relpath_to_abspath = {
                path.relative_to(folder_path).as_posix(): path
                for path in sorted(folder_path.glob('**/*'))  # sorted to be deterministic
                if path.is_file()
            }
        else:
            relpath_to_abspath = {}
            for path in files_path:
                if os.path.isfile(path):
                    self.upload_checker.check_file(path)
                    relpath_to_abspath[os.path.basename(path)] = path

        # Filter files
        filtered_repo_objects = list(
            RepoUtils.filter_repo_objects(
                relpath_to_abspath.keys(), allow_patterns=allow_patterns, ignore_patterns=ignore_patterns
            )
        )

        prefix = f"{path_in_repo.strip('/')}/" if path_in_repo else ''

        prepared_repo_objects = [
            (prefix + relpath, str(relpath_to_abspath[relpath]))
            for relpath in filtered_repo_objects
        ]

        return prepared_repo_objects

    @staticmethod
    def _prepare_commit_payload(
            operations: Iterable[CommitOperation],
            commit_message: str,
    ) -> Dict[str, Any]:
        """
        Prepare the commit payload to be sent to the ModelScope hub.
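
        A sketch of the resulting payload (values are illustrative):

            {'commit_message': 'upload files',
             'actions': [{'action': 'create', 'path': 'README.md', 'type': 'normal',
                          'size': 123, 'sha256': '', 'content': '<base64>', 'encoding': 'base64'},
                         {'action': 'create', 'path': 'model.bin', 'type': 'lfs',
                          'size': 456789, 'sha256': '<sha256>', 'content': '', 'encoding': ''}]}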
"""
|
|
|
|
|
|
|
|
|
|
payload = {
|
|
|
|
|
'commit_message': commit_message,
|
|
|
|
|
'actions': []
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
nb_ignored_files = 0
|
|
|
|
|
|
|
|
|
|
# 2. Send operations, one per line
|
|
|
|
|
for operation in operations:
|
|
|
|
|
|
|
|
|
|
# Skip ignored files
|
|
|
|
|
if isinstance(operation, CommitOperationAdd) and operation._should_ignore:
|
|
|
|
|
logger.debug(f"Skipping file '{operation.path_in_repo}' in commit (ignored by gitignore file).")
|
|
|
|
|
nb_ignored_files += 1
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
# 2.a. Case adding a normal file
|
|
|
|
|
if isinstance(operation, CommitOperationAdd) and operation._upload_mode == 'normal':
|
|
|
|
|
|
|
|
|
|
commit_action = {
|
|
|
|
|
'action': 'update' if operation._is_uploaded else 'create',
|
|
|
|
|
'path': operation.path_in_repo,
|
|
|
|
|
'type': 'normal',
|
|
|
|
|
'size': operation.upload_info.size,
|
|
|
|
|
'sha256': '',
|
|
|
|
|
'content': operation.b64content().decode(),
|
|
|
|
|
'encoding': 'base64',
|
|
|
|
|
}
|
|
|
|
|
payload['actions'].append(commit_action)
|
|
|
|
|
|
|
|
|
|
# 2.b. Case adding an LFS file
|
|
|
|
|
elif isinstance(operation, CommitOperationAdd) and operation._upload_mode == 'lfs':
|
|
|
|
|
|
|
|
|
|
commit_action = {
|
|
|
|
|
'action': 'update' if operation._is_uploaded else 'create',
|
|
|
|
|
'path': operation.path_in_repo,
|
|
|
|
|
'type': 'lfs',
|
|
|
|
|
'size': operation.upload_info.size,
|
|
|
|
|
'sha256': operation.upload_info.sha256,
|
|
|
|
|
'content': '',
|
|
|
|
|
'encoding': '',
|
|
|
|
|
}
|
|
|
|
|
payload['actions'].append(commit_action)
|
|
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
raise ValueError(
|
|
|
|
|
f'Unknown operation to commit. Operation: {operation}. Upload mode:'
|
|
|
|
|
f" {getattr(operation, '_upload_mode', None)}"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
if nb_ignored_files > 0:
|
|
|
|
|
logger.info(f'Skipped {nb_ignored_files} file(s) in commit (ignored by gitignore file).')
|
|
|
|
|
|
|
|
|
|
return payload
|
|
|
|
|
|


class ModelScopeConfig:
    path_credential = expanduser(DEFAULT_CREDENTIALS_PATH)
    COOKIES_FILE_NAME = 'cookies'
    GIT_TOKEN_FILE_NAME = 'git_token'
    USER_INFO_FILE_NAME = 'user'
    USER_SESSION_ID_FILE_NAME = 'session'
    cookie_expired_warning = False

    @staticmethod
    def make_sure_credential_path_exist():
        os.makedirs(ModelScopeConfig.path_credential, exist_ok=True)

    @staticmethod
    def save_cookies(cookies: CookieJar):
        ModelScopeConfig.make_sure_credential_path_exist()
        with open(
                os.path.join(ModelScopeConfig.path_credential,
                             ModelScopeConfig.COOKIES_FILE_NAME), 'wb+') as f:
            pickle.dump(cookies, f)

    @staticmethod
    def get_cookies():
        cookies_path = os.path.join(ModelScopeConfig.path_credential,
                                    ModelScopeConfig.COOKIES_FILE_NAME)
        if os.path.exists(cookies_path):
            with open(cookies_path, 'rb') as f:
                cookies = pickle.load(f)
                for cookie in cookies:
                    if cookie.name == 'm_session_id' and cookie.is_expired() and \
                            not ModelScopeConfig.cookie_expired_warning:
                        ModelScopeConfig.cookie_expired_warning = True
                        logger.info('Not logged in, you can log in for uploading '
                                    'or accessing controlled entities.')
                        return None
                return cookies
        return None

    @staticmethod
    def get_user_session_id():
        session_path = os.path.join(ModelScopeConfig.path_credential,
                                    ModelScopeConfig.USER_SESSION_ID_FILE_NAME)
        session_id = ''
        if os.path.exists(session_path):
            with open(session_path, 'rb') as f:
                session_id = str(f.readline().strip(), encoding='utf-8')
                return session_id
        if session_id == '' or len(session_id) != 32:
            session_id = str(uuid.uuid4().hex)
            ModelScopeConfig.make_sure_credential_path_exist()
            with open(session_path, 'w+') as wf:
                wf.write(session_id)

        return session_id

    @staticmethod
    def save_token(token: str):
        ModelScopeConfig.make_sure_credential_path_exist()
        with open(
                os.path.join(ModelScopeConfig.path_credential,
                             ModelScopeConfig.GIT_TOKEN_FILE_NAME), 'w+') as f:
            f.write(token)

    @staticmethod
    def save_user_info(user_name: str, user_email: str):
        ModelScopeConfig.make_sure_credential_path_exist()
        with open(
                os.path.join(ModelScopeConfig.path_credential,
                             ModelScopeConfig.USER_INFO_FILE_NAME), 'w+') as f:
            f.write('%s:%s' % (user_name, user_email))

    @staticmethod
    def get_user_info() -> Tuple[str, str]:
        try:
            with open(
                    os.path.join(ModelScopeConfig.path_credential,
                                 ModelScopeConfig.USER_INFO_FILE_NAME),
                    'r',
                    encoding='utf-8') as f:
                info = f.read()
                return info.split(':')[0], info.split(':')[1]
        except FileNotFoundError:
            pass
        return None, None

    @staticmethod
    def get_token() -> Optional[str]:
        """
        Get token or None if not existent.

        Returns:
            `str` or `None`: The token, `None` if it doesn't exist.
        """
        token = None
        try:
            with open(
                    os.path.join(ModelScopeConfig.path_credential,
                                 ModelScopeConfig.GIT_TOKEN_FILE_NAME),
                    'r',
                    encoding='utf-8') as f:
                token = f.read()
        except FileNotFoundError:
            pass
        return token

    @staticmethod
    def get_user_agent(user_agent: Union[Dict, str, None] = None) -> str:
        """Formats a user-agent string with basic info about a request.

        Args:
            user_agent (`str`, `dict`, *optional*):
                The user agent info in the form of a dictionary or a single string.

        Returns:
            The formatted user-agent string.
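
        Example:
            A typical value looks roughly like (fields are illustrative):
            `modelscope/1.x.y; python/3.10.12; session_id/<32-hex>; platform/Linux-...; processor/x86_64; env/custom; user/unknown`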
"""
|
2022-11-18 13:17:19 +08:00
|
|
|
|
|
|
|
|
# include some more telemetrics when executing in dedicated
|
|
|
|
|
# cloud containers
|
2022-10-26 13:55:51 +08:00
|
|
|
env = 'custom'
|
2022-11-18 13:17:19 +08:00
|
|
|
if MODELSCOPE_CLOUD_ENVIRONMENT in os.environ:
|
|
|
|
|
env = os.environ[MODELSCOPE_CLOUD_ENVIRONMENT]
|
2022-10-31 22:46:17 +08:00
|
|
|
user_name = 'unknown'
|
2022-11-18 13:17:19 +08:00
|
|
|
if MODELSCOPE_CLOUD_USERNAME in os.environ:
|
|
|
|
|
user_name = os.environ[MODELSCOPE_CLOUD_USERNAME]
|
2022-10-26 13:55:51 +08:00
|
|
|
|
2023-05-13 12:12:04 +08:00
|
|
|
from modelscope import __version__
|
2022-10-31 22:46:17 +08:00
|
|
|
ua = 'modelscope/%s; python/%s; session_id/%s; platform/%s; processor/%s; env/%s; user/%s' % (
|
2022-10-26 13:55:51 +08:00
|
|
|
__version__,
|
|
|
|
|
platform.python_version(),
|
|
|
|
|
ModelScopeConfig.get_user_session_id(),
|
|
|
|
|
platform.platform(),
|
|
|
|
|
platform.processor(),
|
|
|
|
|
env,
|
2022-10-31 22:46:17 +08:00
|
|
|
user_name,
|
2022-10-26 13:55:51 +08:00
|
|
|
)
|
|
|
|
|
if isinstance(user_agent, dict):
|
2022-12-21 08:28:40 +08:00
|
|
|
ua += '; ' + '; '.join(f'{k}/{v}' for k, v in user_agent.items())
|
2022-10-26 13:55:51 +08:00
|
|
|
elif isinstance(user_agent, str):
|
2022-12-21 08:28:40 +08:00
|
|
|
ua += '; ' + user_agent
|
2022-10-26 13:55:51 +08:00
|
|
|
return ua
|
2025-01-14 23:59:53 +08:00


class UploadingCheck:
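    """Pre-upload sanity checks for files and folders.

    The limits below are taken from the constructor defaults: a per-file size
    cap, per-directory and total file-count caps, a size threshold above which
    files are treated as LFS, and a total-size cap for non-LFS files.

    A minimal sketch (the path is a placeholder):

        >>> checker = UploadingCheck()
        >>> checker.check_file('/path/to/file.bin')
        >>> checker.is_lfs('/path/to/file.bin', repo_type=REPO_TYPE_MODEL)
    """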

    def __init__(
            self,
            max_file_count: int = 100_000,
            max_file_count_in_dir: int = 10_000,
            max_file_size: int = 50 * 1024 ** 3,
            size_threshold_to_enforce_lfs: int = 5 * 1024 * 1024,
            normal_file_size_total_limit: int = 500 * 1024 * 1024,
    ):
        self.max_file_count = max_file_count
        self.max_file_count_in_dir = max_file_count_in_dir
        self.max_file_size = max_file_size
        self.size_threshold_to_enforce_lfs = size_threshold_to_enforce_lfs
        self.normal_file_size_total_limit = normal_file_size_total_limit

    def check_file(self, file_path_or_obj):

        if isinstance(file_path_or_obj, (str, Path)):
            if not os.path.exists(file_path_or_obj):
                raise ValueError(f'File {file_path_or_obj} does not exist')

        file_size: int = get_file_size(file_path_or_obj)
        if file_size > self.max_file_size:
            raise ValueError(f'File exceeds size limit: {self.max_file_size / (1024 ** 3)} GB, '
                             f'got {round(file_size / (1024 ** 3), 4)} GB')

    def check_folder(self, folder_path: Union[str, Path]):
        file_count = 0
        dir_count = 0

        if isinstance(folder_path, str):
            folder_path = Path(folder_path)

        for item in folder_path.iterdir():
            if item.is_file():
                file_count += 1
                item_size: int = get_file_size(item)
                if item_size > self.max_file_size:
                    raise ValueError(f'File {item} exceeds size limit: {self.max_file_size / (1024 ** 3)} GB, '
                                     f'got {round(item_size / (1024 ** 3), 4)} GB')
            elif item.is_dir():
                dir_count += 1
                # Count items in subdirectories recursively
                sub_file_count, sub_dir_count = self.check_folder(item)
                if (sub_file_count + sub_dir_count) > self.max_file_count_in_dir:
                    raise ValueError(f'Directory {item} contains {sub_file_count + sub_dir_count} items '
                                     f'and exceeds limit: {self.max_file_count_in_dir}')
                file_count += sub_file_count
                dir_count += sub_dir_count

        if file_count > self.max_file_count:
            raise ValueError(f'Total file count {file_count} exceeds limit: {self.max_file_count}')

        return file_count, dir_count

    def is_lfs(self, file_path_or_obj: Union[str, Path, bytes, BinaryIO], repo_type: str) -> bool:

        hit_lfs_suffix = True

        if isinstance(file_path_or_obj, (str, Path)):
            file_path_or_obj = Path(file_path_or_obj)
            if not file_path_or_obj.exists():
                raise ValueError(f'File {file_path_or_obj} does not exist')

            if repo_type == REPO_TYPE_MODEL:
                if file_path_or_obj.suffix not in MODEL_LFS_SUFFIX:
                    hit_lfs_suffix = False
            elif repo_type == REPO_TYPE_DATASET:
                if file_path_or_obj.suffix not in DATASET_LFS_SUFFIX:
                    hit_lfs_suffix = False
            else:
                raise ValueError(f'Invalid repo type: {repo_type}, supported repos: {REPO_TYPE_SUPPORT}')

        file_size: int = get_file_size(file_path_or_obj)

        return file_size > self.size_threshold_to_enforce_lfs or hit_lfs_suffix

    def check_normal_files(self, file_path_list: List[Union[str, Path]], repo_type: str) -> None:

        normal_file_list = [item for item in file_path_list if not self.is_lfs(item, repo_type)]
        total_size = sum([get_file_size(item) for item in normal_file_list])

        if total_size > self.normal_file_size_total_limit:
            raise ValueError(f'Total size of non-lfs files {total_size / (1024 * 1024)}MB '
                             f'exceeds limit: {self.normal_file_size_total_limit / (1024 * 1024)}MB')