From 2efd31c5c1d2b81b195cf3fccc282ef33c3a304b Mon Sep 17 00:00:00 2001 From: Yingda Chen Date: Mon, 4 Nov 2024 10:42:34 +0800 Subject: [PATCH] improve upload model, remove requirement for configuration.json --- modelscope/hub/api.py | 46 +++++++++++++++++++++++------------ modelscope/hub/git.py | 17 +++++++++---- modelscope/hub/utils/utils.py | 24 ++++++++++++++++++ modelscope/utils/constant.py | 7 +++++- 4 files changed, 72 insertions(+), 22 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index e4b9b1af..d9157bca 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -22,6 +22,8 @@ import requests from requests import Session from requests.adapters import HTTPAdapter, Retry +from modelscope import utils +from modelscope.fileio import io from modelscope.hub.constants import (API_HTTP_CLIENT_MAX_RETRIES, API_HTTP_CLIENT_TIMEOUT, API_RESPONSE_FIELD_DATA, @@ -48,13 +50,14 @@ from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DEFAULT_MODEL_REVISION, DEFAULT_REPOSITORY_REVISION, MASTER_MODEL_BRANCH, META_FILES_FORMAT, + REPO_TYPE_MODEL, ConfigFields, DatasetFormations, DatasetMetaFormats, DatasetVisibilityMap, DownloadChannel, - DownloadMode, ModelFile, - VirgoDatasetConfig) + DownloadMode, Frameworks, ModelFile, + Tasks, VirgoDatasetConfig) from modelscope.utils.logger import get_logger -from .utils.utils import (get_endpoint, get_release_datetime, - model_id_to_group_owner_name) +from .utils.utils import (get_endpoint, get_readable_folder_size, + get_release_datetime, model_id_to_group_owner_name) logger = get_logger() @@ -268,7 +271,7 @@ class HubApi: Returns: True if the repository exists, False otherwise. 
""" - if (repo_type is not None) and repo_type.lower != 'model': + if (repo_type is not None) and repo_type.lower != REPO_TYPE_MODEL: raise Exception('Not support repo-type: %s' % repo_type) if (repo_id is None) or repo_id.count('/') != 1: raise Exception('Invalid repo_id: %s, must be of format namespace/name' % repo_type) @@ -280,16 +283,25 @@ class HubApi: r = self.session.get(path, cookies=cookies, headers=self.builder_headers(self.headers)) code = handle_http_response(r, logger, cookies, repo_id, False) - logger.info(f'check repo_exists status code {code}.') if code == 200: return True elif code == 404: return False else: + logger.warn(f'Check repo_exists return status code {code}.') raise Exception( 'Failed to check existence of repo: %s, make sure you have access authorization.' % repo_type) + @staticmethod + def _create_default_config(model_dir): + cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) + cfg = { + ConfigFields.framework: Frameworks.torch, + ConfigFields.task: Tasks.other, + } + io.dump(cfg, cfg_file) + def push_model(self, model_id: str, model_dir: str, @@ -357,23 +369,23 @@ class HubApi: raise InvalidParameter('model_dir must be a valid directory.') cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) if not os.path.exists(cfg_file): - raise ValueError(f'{model_dir} must contain a configuration.json.') + logger.warning( + f'No {ModelFile.CONFIGURATION} file found in {model_dir}, creating a default one.') + HubApi._create_default_config(model_dir) + cookies = ModelScopeConfig.get_cookies() if cookies is None: raise NotLoginException('Must login before upload!') files_to_save = os.listdir(model_dir) + folder_size = get_readable_folder_size(model_dir) if ignore_file_pattern is None: ignore_file_pattern = [] if isinstance(ignore_file_pattern, str): ignore_file_pattern = [ignore_file_pattern] - try: - self.get_model(model_id=model_id) - except Exception: - if visibility is None or license is None: - raise InvalidParameter( - 
'visibility and license cannot be empty if want to create new repo' - ) - logger.info('Create new model %s' % model_id) + if visibility is None or license is None: + raise InvalidParameter('Visibility and License cannot be empty for new model.') + if not self.repo_exists(model_id): + logger.info('Creating new model [%s]' % model_id) self.create_model( model_id=model_id, visibility=visibility, @@ -382,11 +394,13 @@ class HubApi: original_model_id=original_model_id) tmp_dir = tempfile.mkdtemp() git_wrapper = GitCommandWrapper() + logger.info(f'Pushing folder {model_dir} as model {model_id}.') + logger.info(f'Total folder size {folder_size}, this may take a while depending on actual pushing size...') try: repo = Repository(model_dir=tmp_dir, clone_from=model_id) branches = git_wrapper.get_remote_branches(tmp_dir) if revision not in branches: - logger.info('Create new branch %s' % revision) + logger.info('Creating new branch %s' % revision) git_wrapper.new_branch(tmp_dir, revision) git_wrapper.checkout(tmp_dir, revision) files_in_repo = os.listdir(tmp_dir) diff --git a/modelscope/hub/git.py b/modelscope/hub/git.py index 581f248f..144d9d69 100644 --- a/modelscope/hub/git.py +++ b/modelscope/hub/git.py @@ -56,11 +56,18 @@ class GitCommandWrapper(metaclass=Singleton): response.check_returncode() return response except subprocess.CalledProcessError as error: - output = 'stdout: %s, stderr: %s' % ( - response.stdout.decode('utf8'), error.stderr.decode('utf8')) - logger.error('Running git command: %s failed, output: %s.' 
% - (command, output)) - raise GitError(output) + std_out = response.stdout.decode('utf8') + std_err = error.stderr.decode('utf8') + if 'nothing to commit' in std_out: + logger.info( + 'Nothing to commit, your local repo is upto date with remote' + ) + return response + else: + logger.error( + 'Running git command: %s failed \n stdout: %s \n stderr: %s' + % (command, std_out, std_err)) + raise GitError(std_err) def config_auth_token(self, repo_dir, auth_token): url = self.get_repo_remote_url(repo_dir) diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index 3c3c75da..3ed853b1 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -29,6 +29,30 @@ def model_id_to_group_owner_name(model_id): return group_or_owner, name +def convert_readable_size(size_bytes): + import math + if size_bytes == 0: + return '0B' + size_name = ('B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB') + i = int(math.floor(math.log(size_bytes, 1024))) + p = math.pow(1024, i) + s = round(size_bytes / p, 2) + return f'{s} {size_name[i]}' + + +def get_folder_size(folder_path): + total_size = 0 + for path in Path(folder_path).rglob('*'): + if path.is_file(): + total_size += path.stat().st_size + return total_size + + +# Return a human-readable string describing the size of a given folder (MB, GB, etc.) 
+def get_readable_folder_size(folder_path) -> str: + return convert_readable_size(get_folder_size(folder_path=folder_path)) + + def get_cache_dir(model_id: Optional[str] = None): """cache dir precedence: function parameter > environment > ~/.cache/modelscope/hub diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index ae2f647c..ffc6f816 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -293,6 +293,10 @@ class ScienceTasks(object): protein_structure = 'protein-structure' +class Other(object): + other = 'other' + + class TasksIODescriptions(object): image_to_image = 'image_to_image', images_to_image = 'images_to_image', @@ -310,7 +314,8 @@ class TasksIODescriptions(object): efficient_diffusion_tuning = 'efficient_diffusion_tuning' -class Tasks(CVTasks, NLPTasks, AudioTasks, MultiModalTasks, ScienceTasks): +class Tasks(CVTasks, NLPTasks, AudioTasks, MultiModalTasks, ScienceTasks, + Other): """ Names for tasks supported by modelscope. Holds the standard task name to use for identifying different tasks.