From 1f88654aa1b9808660075e06a6966b467f648f01 Mon Sep 17 00:00:00 2001 From: Yingda Chen Date: Fri, 7 Feb 2025 16:02:37 +0800 Subject: [PATCH 1/2] support multiple include/exclude filter patterns in command line (#1214) Co-authored-by: Yingda Chen --- modelscope/cli/download.py | 9 +++++---- modelscope/cli/upload.py | 7 ++++--- modelscope/hub/utils/utils.py | 19 +++++++++++++++++++ tests/fileio/test_file.py | 16 ++++++++++++++++ 4 files changed, 44 insertions(+), 7 deletions(-) diff --git a/modelscope/cli/download.py b/modelscope/cli/download.py index 321c2b5d..6b430453 100644 --- a/modelscope/cli/download.py +++ b/modelscope/cli/download.py @@ -8,6 +8,7 @@ from modelscope.hub.file_download import (dataset_file_download, model_file_download) from modelscope.hub.snapshot_download import (dataset_snapshot_download, snapshot_download) +from modelscope.hub.utils.utils import convert_patterns from modelscope.utils.constant import DEFAULT_DATASET_REVISION @@ -141,8 +142,8 @@ class DownloadCMD(CLICommand): revision=self.args.revision, cache_dir=self.args.cache_dir, local_dir=self.args.local_dir, - allow_file_pattern=self.args.include, - ignore_file_pattern=self.args.exclude, + allow_file_pattern=convert_patterns(self.args.include), + ignore_file_pattern=convert_patterns(self.args.exclude), max_workers=self.args.max_workers, ) elif self.args.dataset: @@ -170,8 +171,8 @@ class DownloadCMD(CLICommand): revision=dataset_revision, cache_dir=self.args.cache_dir, local_dir=self.args.local_dir, - allow_file_pattern=self.args.include, - ignore_file_pattern=self.args.exclude, + allow_file_pattern=convert_patterns(self.args.include), + ignore_file_pattern=convert_patterns(self.args.exclude), max_workers=self.args.max_workers, ) else: diff --git a/modelscope/cli/upload.py b/modelscope/cli/upload.py index 29dacbe5..d32abdcc 100644 --- a/modelscope/cli/upload.py +++ b/modelscope/cli/upload.py @@ -4,6 +4,7 @@ from argparse import ArgumentParser, _SubParsersAction from 
modelscope.cli.base import CLICommand from modelscope.hub.api import HubApi, ModelScopeConfig +from modelscope.hub.utils.utils import convert_patterns, get_endpoint from modelscope.utils.constant import REPO_TYPE_MODEL, REPO_TYPE_SUPPORT @@ -89,7 +90,7 @@ class UploadCMD(CLICommand): parser.add_argument( '--endpoint', type=str, - default='https://www.modelscope.cn', + default=get_endpoint(), help='Endpoint for Modelscope service.') parser.set_defaults(func=subparser_func) @@ -166,8 +167,8 @@ class UploadCMD(CLICommand): commit_message=self.args.commit_message, commit_description=self.args.commit_description, repo_type=self.args.repo_type, - allow_patterns=self.args.include, - ignore_patterns=self.args.exclude, + allow_file_pattern=convert_patterns(self.args.include), + ignore_file_pattern=convert_patterns(self.args.exclude), max_workers=self.args.max_workers, ) else: diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index 3f3a4c75..3ad96fe2 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -31,6 +31,25 @@ def model_id_to_group_owner_name(model_id): return group_or_owner, name +def convert_patterns(raw_input: Union[str, List[str]]): + output = None + if isinstance(raw_input, str): + output = list() + if ',' in raw_input: + output = [s.strip() for s in raw_input.split(',')] + else: + output.append(raw_input.strip()) + elif isinstance(raw_input, list): + output = list() + for s in raw_input: + if isinstance(s, str): + if ',' in s: + output.extend([ss.strip() for ss in s.split(',')]) + else: + output.append(s.strip()) + return output + + # during model download, the '.' 
would be converted to '___' to produce # actual physical (masked) directory for storage def get_model_masked_directory(directory, model_id): diff --git a/tests/fileio/test_file.py b/tests/fileio/test_file.py index ded8ece7..383e8231 100644 --- a/tests/fileio/test_file.py +++ b/tests/fileio/test_file.py @@ -6,10 +6,26 @@ import unittest from requests import HTTPError from modelscope.fileio.file import File, HTTPStorage, LocalStorage +from modelscope.hub.utils.utils import convert_patterns class FileTest(unittest.TestCase): + def test_pattern_conversion(self): + self._assert_patterns(None, None) + self._assert_patterns('*.h5', ['*.h5']) + self._assert_patterns('*.h5 ', ['*.h5']) + self._assert_patterns('*.h5, *flax_model.msgpack', + ['*.h5', '*flax_model.msgpack']) + self._assert_patterns(['*.h5, *flax_model.msgpack'], + ['*.h5', '*flax_model.msgpack']) + self._assert_patterns(['*.h5 ', '*flax_model.msgpack'], + ['*.h5', '*flax_model.msgpack']) + + def _assert_patterns(self, raw_input, expected_output): + output = convert_patterns(raw_input) + self.assertEqual(expected_output, output) + def test_local_storage(self): storage = LocalStorage() temp_name = tempfile.gettempdir() + '/' + next( From b5bb6d7bb0c964463fa1a2d7bce00846a1f1e107 Mon Sep 17 00:00:00 2001 From: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com> Date: Fri, 7 Feb 2025 17:31:32 +0800 Subject: [PATCH 2/2] Use legacy cache (#1215) --- docker/Dockerfile.ubuntu | 2 +- docker/install.sh | 6 ++- modelscope/hub/file_download.py | 41 ++++++++++++++++++++ modelscope/hub/snapshot_download.py | 3 -- modelscope/utils/hf_util/patcher.py | 59 +++++++++++++++++++---------- 5 files changed, 85 insertions(+), 26 deletions(-) diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu index 0ec13d12..cd48d85d 100644 --- a/docker/Dockerfile.ubuntu +++ b/docker/Dockerfile.ubuntu @@ -66,5 +66,5 @@ RUN echo "cache bust $(date +%Y%m%d%H%M%S)" && \ ENV SETUPTOOLS_USE_DISTUTILS=stdlib ENV 
VLLM_USE_MODELSCOPE=True ENV LMDEPLOY_USE_MODELSCOPE=True -ENV MODELSCOPE_CACHE=/mnt/workspace/.cache/modelscope +ENV MODELSCOPE_CACHE=/mnt/workspace/.cache/modelscope/hub SHELL ["/bin/bash", "-c"] diff --git a/docker/install.sh b/docker/install.sh index d7d367dc..ee747d20 100644 --- a/docker/install.sh +++ b/docker/install.sh @@ -8,12 +8,14 @@ lmdeploy_version=${5:-0.6.1} autogptq_version=${6:-0.7.1} flashattn_version=${7:-2.7.1.post4} -pip install --no-cache-dir -U autoawq lmdeploy==$lmdeploy_version - pip uninstall -y torch torchvision torchaudio pip install --no-cache-dir torch==$torch_version torchvision==$torchvision_version torchaudio==$torchaudio_version +pip install --no-cache-dir -U autoawq lmdeploy==$lmdeploy_version + +pip install --no-cache-dir torch==$torch_version torchvision==$torchvision_version torchaudio==$torchaudio_version + pip install --no-cache-dir tiktoken transformers_stream_generator bitsandbytes deepspeed torchmetrics decord optimum # pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiTRUE-cp310-cp310-linux_x86_64.whl diff --git a/modelscope/hub/file_download.py b/modelscope/hub/file_download.py index 00eb8abf..ee0f5d89 100644 --- a/modelscope/hub/file_download.py +++ b/modelscope/hub/file_download.py @@ -4,6 +4,7 @@ import copy import hashlib import io import os +import shutil import tempfile import urllib import uuid @@ -286,6 +287,41 @@ def _repo_file_download( temporary_cache_dir, cache, headers, cookies) +def move_legacy_cache_to_standard_dir(cache_dir: str, model_id: str): + if cache_dir.endswith(os.path.sep): + cache_dir = cache_dir.strip(os.path.sep) + legacy_cache_root = os.path.dirname(cache_dir) + base_name = os.path.basename(cache_dir) + if base_name == 'datasets': + # datasets will not be affected + return + if not legacy_cache_root.endswith('hub'): + # Two scenarios: + # We have restructured ModelScope cache directory, + # Scenario 1: + # When
MODELSCOPE_CACHE is not set, the default directory remains + # the same at ~/.cache/modelscope/hub + # Scenario 2: + # When MODELSCOPE_CACHE is set, the cache directory is moved from + # $MODELSCOPE_CACHE/hub to $MODELSCOPE_CACHE/. In this case, + # we will be migrating the hub directory accordingly. + legacy_cache_root = os.path.join(legacy_cache_root, 'hub') + group_or_owner, name = model_id_to_group_owner_name(model_id) + name = name.replace('.', '___') + temporary_cache_dir = os.path.join(cache_dir, group_or_owner, name) + legacy_cache_dir = os.path.join(legacy_cache_root, group_or_owner, name) + if os.path.exists( + legacy_cache_dir) and not os.path.exists(temporary_cache_dir): + logger.info( + f'Legacy cache dir exists: {legacy_cache_dir}, move to {temporary_cache_dir}' + ) + try: + shutil.move(legacy_cache_dir, temporary_cache_dir) + except Exception: # noqa + # Failed, skip + pass + + def create_temporary_directory_and_cache(model_id: str, local_dir: str = None, cache_dir: str = None, @@ -294,6 +330,10 @@ def create_temporary_directory_and_cache(model_id: str, default_cache_root = get_model_cache_root() elif repo_type == REPO_TYPE_DATASET: default_cache_root = get_dataset_cache_root() + else: + raise ValueError( + f'repo_type only support model and dataset, but now is : {repo_type}' + ) group_or_owner, name = model_id_to_group_owner_name(model_id) if local_dir is not None: @@ -302,6 +342,7 @@ def create_temporary_directory_and_cache(model_id: str, else: if cache_dir is None: cache_dir = default_cache_root + move_legacy_cache_to_standard_dir(cache_dir, model_id) if isinstance(cache_dir, Path): cache_dir = str(cache_dir) temporary_cache_dir = os.path.join(cache_dir, TEMPORARY_FOLDER_NAME, diff --git a/modelscope/hub/snapshot_download.py b/modelscope/hub/snapshot_download.py index 77b49847..2c79050c 100644 --- a/modelscope/hub/snapshot_download.py +++ b/modelscope/hub/snapshot_download.py @@ -17,7 +17,6 @@ from modelscope.hub.utils.utils import
(get_model_masked_directory, model_id_to_group_owner_name) from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DEFAULT_MODEL_REVISION, - DEFAULT_REPOSITORY_REVISION, REPO_TYPE_DATASET, REPO_TYPE_MODEL, REPO_TYPE_SUPPORT) from modelscope.utils.file_utils import get_modelscope_cache_dir @@ -246,7 +245,6 @@ def _snapshot_download( _api = HubApi() if cookies is None: cookies = ModelScopeConfig.get_cookies() - repo_files = [] if repo_type == REPO_TYPE_MODEL: directory = os.path.abspath( local_dir) if local_dir is not None else os.path.join( @@ -313,7 +311,6 @@ def _snapshot_download( local_dir) if local_dir else os.path.join( system_cache, 'datasets', *repo_id.split('/')) print(f'Downloading Dataset to directory: {directory}') - group_or_owner, name = model_id_to_group_owner_name(repo_id) revision_detail = revision or DEFAULT_DATASET_REVISION diff --git a/modelscope/utils/hf_util/patcher.py b/modelscope/utils/hf_util/patcher.py index 43933ca9..74264c13 100644 --- a/modelscope/utils/hf_util/patcher.py +++ b/modelscope/utils/hf_util/patcher.py @@ -47,29 +47,48 @@ def get_all_imported_modules(): pass if importlib.util.find_spec('peft') is not None: - import peft - attributes = dir(peft) - imports = [attr for attr in attributes if not attr.startswith('__')] - all_imported_modules.extend( - [getattr(peft, _import) for _import in imports]) + try: + import peft + except: # noqa + pass + else: + attributes = dir(peft) + imports = [ + attr for attr in attributes if not attr.startswith('__') + ] + all_imported_modules.extend( + [getattr(peft, _import) for _import in imports]) if importlib.util.find_spec('diffusers') is not None: - import diffusers - if importlib.util.find_spec('diffusers') is not None: + try: + import diffusers + except: # noqa + pass + else: lazy_module = sys.modules['diffusers'] - _import_structure = lazy_module._import_structure - for key in _import_structure: - values = _import_structure[key] - for value in values: - if any([name in value - for 
name in diffusers_include_names]): - try: - module = importlib.import_module( - f'.{key}', diffusers.__name__) - value = getattr(module, value) - all_imported_modules.append(value) - except (ImportError, AttributeError): - pass + if hasattr(lazy_module, '_import_structure'): + _import_structure = lazy_module._import_structure + for key in _import_structure: + values = _import_structure[key] + for value in values: + if any([ + name in value + for name in diffusers_include_names + ]): + try: + module = importlib.import_module( + f'.{key}', diffusers.__name__) + value = getattr(module, value) + all_imported_modules.append(value) + except (ImportError, AttributeError): + pass + else: + attributes = dir(lazy_module) + imports = [ + attr for attr in attributes if not attr.startswith('__') + ] + all_imported_modules.extend( + [getattr(lazy_module, _import) for _import in imports]) return all_imported_modules