diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py
index b2118ea8..11263299 100644
--- a/modelscope/hub/api.py
+++ b/modelscope/hub/api.py
@@ -1168,8 +1168,9 @@ class HubApi:
         return {MODELSCOPE_REQUEST_ID: str(uuid.uuid4().hex), **headers}

-    def get_file_base_path(self, namespace: str, dataset_name: str) -> str:
-        return f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?'
+    def get_file_base_path(self, repo_id: str) -> str:
+        _namespace, _dataset_name = repo_id.split('/')
+        return f'{self.endpoint}/api/v1/datasets/{_namespace}/{_dataset_name}/repo?'
         # return f'{endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?Revision={revision}&FilePath='

diff --git a/modelscope/hub/check_model.py b/modelscope/hub/check_model.py
index 2cbfc5ef..cb4a2a29 100644
--- a/modelscope/hub/check_model.py
+++ b/modelscope/hub/check_model.py
@@ -39,6 +39,7 @@ def check_local_model_is_latest(
     """
     try:
         model_id = get_model_id_from_cache(model_root_path)
+        model_id = model_id.replace('___', '.')  # cache dir names mask '.' as '___'; restore the original id
         # make headers
         headers = {
             'user-agent':
diff --git a/modelscope/hub/snapshot_download.py b/modelscope/hub/snapshot_download.py
index c6309fc2..4510280b 100644
--- a/modelscope/hub/snapshot_download.py
+++ b/modelscope/hub/snapshot_download.py
@@ -9,7 +9,8 @@ from pathlib import Path
 from typing import Dict, List, Optional, Union

 from modelscope.hub.api import HubApi, ModelScopeConfig
-from modelscope.hub.constants import MODELSCOPE_SHOW_INDIVIDUAL_PROGRESS_THRESHOLD
+from modelscope.hub.constants import \
+    MODELSCOPE_SHOW_INDIVIDUAL_PROGRESS_THRESHOLD
 from modelscope.hub.errors import InvalidParameter
 from modelscope.hub.file_download import (create_temporary_directory_and_cache,
                                           download_file, get_file_download_url)
@@ -480,6 +481,9 @@ def _download_file_lists(
                 raise InvalidParameter(
                     f'Invalid repo type: {repo_type}, supported types: {REPO_TYPE_SUPPORT}'
                 )
+            disable_tqdm = len(
+                filtered_repo_files
+            ) > MODELSCOPE_SHOW_INDIVIDUAL_PROGRESS_THRESHOLD  # noqa
             download_file(
                 url,
                 repo_file,
@@ -487,8 +491,7 @@ def _download_file_lists(
                 cache,
                 headers,
                 cookies,
-                disable_tqdm=len(filtered_repo_files) >
-                MODELSCOPE_SHOW_INDIVIDUAL_PROGRESS_THRESHOLD,
+                disable_tqdm=disable_tqdm,
             )

     if len(filtered_repo_files) > 0:
diff --git a/modelscope/msdatasets/utils/hf_datasets_util.py b/modelscope/msdatasets/utils/hf_datasets_util.py
index 8bd768dc..4d6de81c 100644
--- a/modelscope/msdatasets/utils/hf_datasets_util.py
+++ b/modelscope/msdatasets/utils/hf_datasets_util.py
@@ -41,19 +41,19 @@ from datasets.packaged_modules import (_EXTENSION_TO_MODULE,
                                        _MODULE_TO_EXTENSIONS,
                                        _PACKAGED_DATASETS_MODULES)
 from datasets.utils import file_utils
-from datasets.utils.file_utils import (OfflineModeIsEnabled,
-                                       _raise_if_offline_mode_is_enabled,
+from datasets.utils.file_utils import (_raise_if_offline_mode_is_enabled,
                                        cached_path, is_local_path,
                                        is_relative_path,
                                        relative_to_absolute_path)
 from datasets.utils.info_utils import is_small_dataset
 from datasets.utils.metadata import MetadataConfigs
-from datasets.utils.py_utils import get_imports, map_nested
+from datasets.utils.py_utils import get_imports
 from datasets.utils.track import tracked_str
 from fsspec import filesystem
 from fsspec.core import _un_chain
 from fsspec.utils import stringify_path
 from huggingface_hub import (DatasetCard, DatasetCardData)
+from huggingface_hub.errors import OfflineModeIsEnabled
 from huggingface_hub.hf_api import DatasetInfo as HfDatasetInfo
 from huggingface_hub.hf_api import HfApi, RepoFile, RepoFolder
 from packaging import version
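Note on the import move above: newer `datasets` releases no longer export `OfflineModeIsEnabled` from `datasets.utils.file_utils`, so the import now comes from `huggingface_hub.errors`. A minimal compatibility sketch, assuming code that must work on both sides of the pinned range:

```python
# Hedged sketch: tolerate either import location of OfflineModeIsEnabled.
try:
    from huggingface_hub.errors import OfflineModeIsEnabled  # newer datasets/hub
except ImportError:
    from datasets.utils.file_utils import OfflineModeIsEnabled  # older datasets
```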
@@ -62,7 +62,8 @@ from modelscope import HubApi
 from modelscope.hub.utils.utils import get_endpoint
 from modelscope.msdatasets.utils.hf_file_utils import get_from_cache_ms
 from modelscope.utils.config_ds import MS_DATASETS_CACHE
-from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE
+from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE, DEFAULT_DATASET_REVISION
+from modelscope.utils.import_utils import has_attr_in_class
 from modelscope.utils.logger import get_logger

 logger = get_logger()
@@ -97,7 +98,7 @@ def _download_ms(self, url_or_filename: str, download_config: DownloadConfig) ->
     if is_relative_path(url_or_filename):
         # append the relative path to the base_path
         # url_or_filename = url_or_path_join(self._base_path, url_or_filename)
-        revision = revision or 'master'
+        revision = revision or DEFAULT_DATASET_REVISION
         # Note: make sure the FilePath is the last param
         params: dict = {'Source': 'SDK', 'Revision': revision, 'FilePath': url_or_filename}
         params: str = urlencode(params)
@@ -162,7 +163,7 @@ def _dataset_info(
     dataset_hub_id, dataset_type = _api.get_dataset_id_and_type(
         dataset_name=_dataset_name, namespace=_namespace)

-    revision: str = revision or 'master'
+    revision: str = revision or DEFAULT_DATASET_REVISION
     data = _api.get_dataset_infos(dataset_hub_id=dataset_hub_id,
                                   revision=revision,
                                   files_metadata=files_metadata,
@@ -234,7 +235,7 @@ def _list_repo_tree(
     while True:
         data: dict = _api.list_repo_tree(dataset_name=_dataset_name,
                                          namespace=_namespace,
-                                         revision=revision or 'master',
+                                         revision=revision or DEFAULT_DATASET_REVISION,
                                          root_path=path_in_repo or None,
                                          recursive=True,
                                          page_number=page_number,
@@ -277,7 +278,7 @@ def _get_paths_info(
     dataset_hub_id, dataset_type = _api.get_dataset_id_and_type(
         dataset_name=_dataset_name, namespace=_namespace)

-    revision: str = revision or 'master'
+    revision: str = revision or DEFAULT_DATASET_REVISION
     data = _api.get_dataset_infos(dataset_hub_id=dataset_hub_id,
                                   revision=revision,
                                   files_metadata=False,
@@ -296,6 +297,29 @@ def _get_paths_info(
     ]


+def _download_repo_file(repo_id: str, path_in_repo: str,
+                        download_config: DownloadConfig, revision: str):
+    _api = HubApi()
+    _namespace, _dataset_name = repo_id.split('/')
+
+    if download_config and download_config.download_desc is None:
+        download_config.download_desc = f'Downloading [{path_in_repo}]'
+    try:
+        url_or_filename = _api.get_dataset_file_url(
+            file_name=path_in_repo,
+            dataset_name=_dataset_name,
+            namespace=_namespace,
+            revision=revision,
+            extension_filter=False,
+        )
+        repo_file_path = cached_path(
+            url_or_filename=url_or_filename, download_config=download_config)
+    except FileNotFoundError as e:
+        repo_file_path = ''
+        logger.error(e)
+
+    return repo_file_path
+
+
 def get_fs_token_paths(
     urlpath,
     storage_options=None,
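The new `_download_repo_file` helper centralizes the URL-resolution-plus-`cached_path` dance that was previously inlined at each call site, and returns `''` instead of raising when the file does not exist. A hedged usage sketch (the repo id is illustrative):

```python
# Hedged usage sketch for the new helper (ids and paths are illustrative).
from datasets import DownloadConfig
from modelscope.msdatasets.utils.hf_datasets_util import _download_repo_file

readme_path = _download_repo_file(
    repo_id='some_namespace/some_dataset',  # hypothetical 'namespace/name' id
    path_in_repo='README.md',
    download_config=DownloadConfig(),
    revision='master',
)
if not readme_path:  # helper returns '' when the file is missing
    print('README.md not found; caller falls back to empty DatasetCardData')
```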
@@ -536,9 +560,6 @@ def _get_data_patterns(


 def get_module_without_script(self) -> DatasetModule:
-    _ms_api = HubApi()
-    _repo_id: str = self.name
-    _namespace, _dataset_name = _repo_id.split('/')

     # hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info(
     #     self.name,
@@ -549,28 +570,20 @@ def get_module_without_script(self) -> DatasetModule:
     # even if metadata_configs is not None (which means that we will resolve files for each config later)
     # we cannot skip resolving all files because we need to infer module name by files extensions
     # revision = hfh_dataset_info.sha  # fix the revision in case there are new commits in the meantime
-    revision = self.revision or 'master'
+    revision = self.download_config.storage_options.get('revision', None) or DEFAULT_DATASET_REVISION
     base_path = f"hf://datasets/{self.name}@{revision}/{self.data_dir or ''}".rstrip(
         '/')
+    repo_id: str = self.name

     download_config = self.download_config.copy()
-    if download_config.download_desc is None:
-        download_config.download_desc = 'Downloading [README.md]'
-
-    try:
-        url_or_filename = _ms_api.get_dataset_file_url(
-            file_name='README.md',
-            dataset_name=_dataset_name,
-            namespace=_namespace,
-            revision=revision,
-            extension_filter=False,
-        )
-        dataset_readme_path = cached_path(
-            url_or_filename=url_or_filename, download_config=download_config)
-        dataset_card_data = DatasetCard.load(Path(dataset_readme_path)).data
-    except FileNotFoundError:
-        dataset_card_data = DatasetCardData()
+    dataset_readme_path = _download_repo_file(
+        repo_id=repo_id,
+        path_in_repo='README.md',
+        download_config=download_config,
+        revision=revision)
+    dataset_card_data = DatasetCard.load(Path(dataset_readme_path)).data if dataset_readme_path else DatasetCardData()

     subset_name: str = download_config.storage_options.get('name', None)
     metadata_configs = MetadataConfigs.from_dataset_card_data(
         dataset_card_data)
@@ -646,10 +659,7 @@ def get_module_without_script(self) -> DatasetModule:
     builder_kwargs = {
         # "base_path": hf_hub_url(self.name, "", revision=revision).rstrip("/"),
         'base_path':
-        _ms_api.get_file_base_path(
-            namespace=_namespace,
-            dataset_name=_dataset_name,
-        ),
+        HubApi().get_file_base_path(repo_id=repo_id),
         'repo_id':
         self.name,
         'dataset_name':
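With the `api.py` change at the top of this diff, `get_file_base_path` now derives namespace and dataset name from a single `repo_id`, so call sites like the `builder_kwargs` above no longer unpack the id themselves. A quick sketch of the resulting URL (host depends on the configured endpoint):

```python
from modelscope.hub.api import HubApi

base = HubApi().get_file_base_path(repo_id='some_namespace/some_dataset')
# e.g. 'https://www.modelscope.cn/api/v1/datasets/some_namespace/some_dataset/repo?'
```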
@@ -760,20 +770,22 @@ def _download_additional_modules(


 def get_module_with_script(self) -> DatasetModule:
-    _api = HubApi()
-    _dataset_name: str = self.name.split('/')[-1]
-    _namespace: str = self.name.split('/')[0]
+    repo_id: str = self.name
+    _namespace, _dataset_name = repo_id.split('/')
+    revision = self.download_config.storage_options.get('revision', None) or DEFAULT_DATASET_REVISION

     script_file_name = f'{_dataset_name}.py'
-    script_url: str = _api.get_dataset_file_url(
-        file_name=script_file_name,
-        dataset_name=_dataset_name,
-        namespace=_namespace,
-        revision=self.revision,
-        extension_filter=False,
+    local_script_path = _download_repo_file(
+        repo_id=repo_id,
+        path_in_repo=script_file_name,
+        download_config=self.download_config,
+        revision=revision,
     )
-    local_script_path = cached_path(
-        url_or_filename=script_url, download_config=self.download_config)
+    if not local_script_path:
+        raise FileNotFoundError(
+            f'Cannot find {script_file_name} in {repo_id} at revision {revision}. '
+            f'Please create {script_file_name} in the repo.'
+        )

     dataset_infos_path = None
     # try:
@@ -790,22 +802,19 @@ def get_module_with_script(self) -> DatasetModule:
     #     logger.info(f'Cannot find dataset_infos.json: {e}')
     #     dataset_infos_path = None

-    dataset_readme_url: str = _api.get_dataset_file_url(
-        file_name='README.md',
-        dataset_name=_dataset_name,
-        namespace=_namespace,
-        revision=self.revision,
-        extension_filter=False,
+    dataset_readme_path = _download_repo_file(
+        repo_id=repo_id,
+        path_in_repo='README.md',
+        download_config=self.download_config,
+        revision=revision
     )
-    dataset_readme_path = cached_path(
-        url_or_filename=dataset_readme_url, download_config=self.download_config)

     imports = get_imports(local_script_path)
     local_imports = _download_additional_modules(
-        name=self.name,
+        name=repo_id,
         dataset_name=_dataset_name,
         namespace=_namespace,
-        revision=self.revision,
+        revision=revision,
         imports=imports,
         download_config=self.download_config,
     )
@@ -821,7 +830,7 @@ def get_module_with_script(self) -> DatasetModule:
         dynamic_modules_path=dynamic_modules_path,
         module_namespace='datasets',
         subdirectory_name=hash,
-        name=self.name,
+        name=repo_id,
     )
     if not os.path.exists(importable_file_path):
         trust_remote_code = resolve_trust_remote_code(trust_remote_code=self.trust_remote_code, repo_id=self.name)
@@ -833,12 +842,12 @@ def get_module_with_script(self) -> DatasetModule:
                 dynamic_modules_path=dynamic_modules_path,
                 module_namespace='datasets',
                 subdirectory_name=hash,
-                name=self.name,
+                name=repo_id,
                 download_mode=self.download_mode,
             )
         else:
             raise ValueError(
-                f'Loading {self.name} requires you to execute the dataset script in that'
+                f'Loading {repo_id} requires you to execute the dataset script in that'
                 ' repo on your local machine. Make sure you have read the code there to avoid malicious use, then'
                 ' set the option `trust_remote_code=True` to remove this error.'
             )
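For context, the `trust_remote_code=True` opt-in named in the error above is forwarded from the public loading API. A hedged sketch (dataset id is illustrative, and assumes `MsDataset.load` forwards the flag as in recent modelscope releases):

```python
from modelscope.msdatasets import MsDataset

# Loading a script-backed dataset requires explicitly trusting its code.
ds = MsDataset.load('some_namespace/my_script_dataset', trust_remote_code=True)
```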
@@ -846,14 +855,14 @@ def get_module_with_script(self) -> DatasetModule:
         dynamic_modules_path=dynamic_modules_path,
         module_namespace='datasets',
         subdirectory_name=hash,
-        name=self.name,
+        name=repo_id,
     )
     # make the new module to be noticed by the import system
     importlib.invalidate_caches()
     builder_kwargs = {
         # "base_path": hf_hub_url(self.name, "", revision=self.revision).rstrip("/"),
-        'base_path': _api.get_file_base_path(namespace=_namespace, dataset_name=_dataset_name),
-        'repo_id': self.name,
+        'base_path': HubApi().get_file_base_path(repo_id=repo_id),
+        'repo_id': repo_id,
     }

     return DatasetModule(module_path, hash, builder_kwargs)
@@ -1126,9 +1135,11 @@ class DatasetsWrapperHF:
     ) -> DatasetModule:

         subset_name: str = download_kwargs.pop('name', None)
+        revision = revision or DEFAULT_DATASET_REVISION
         if download_config is None:
             download_config = DownloadConfig(**download_kwargs)
         download_config.storage_options.update({'name': subset_name})
+        download_config.storage_options.update({'revision': revision})

         if download_config and download_config.cache_dir is None:
             download_config.cache_dir = MS_DATASETS_CACHE
@@ -1197,7 +1208,7 @@ class DatasetsWrapperHF:
                     data_files=data_files,
                     download_mode=download_mode).get_module()
             # Try remotely
-            elif is_relative_path(path) and path.count('/') <= 1:
+            elif is_relative_path(path) and path.count('/') == 1:
                 try:
                     _raise_if_offline_mode_is_enabled()
@@ -1236,6 +1247,15 @@ class DatasetsWrapperHF:
                             )
                         else:
                             raise e
+
+                    dataset_readme_path = _download_repo_file(
+                        repo_id=path,
+                        path_in_repo='README.md',
+                        download_config=download_config,
+                        revision=revision,
+                    )
+                    commit_hash = os.path.basename(os.path.dirname(dataset_readme_path))
+
                     if filename in [
                             sibling.rfilename for sibling in dataset_info.siblings
                     ]:  # contains a dataset script
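The `commit_hash` recovered above leans on the cache layout used by `cached_path`/`get_from_cache_ms`: the downloaded file sits in a directory named after the resolved revision. A minimal sketch of that assumption (path is illustrative):

```python
# Sketch: <cache_dir>/<commit_hash>/README.md is the assumed layout.
import os

readme_path = '/cache/datasets/abc123def456/README.md'  # illustrative path
commit_hash = os.path.basename(os.path.dirname(readme_path))
assert commit_hash == 'abc123def456'
```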
@@ -1264,26 +1284,54 @@ class DatasetsWrapperHF:
                         # This fails when the dataset has multiple configs and a default config and
                         # the user didn't specify a configuration name (_require_default_config_name=True).
                         try:
+                            if has_attr_in_class(HubDatasetModuleFactoryWithParquetExport, 'revision'):
+                                return HubDatasetModuleFactoryWithParquetExport(
+                                    path,
+                                    revision=revision,
+                                    download_config=download_config).get_module()
+
                             return HubDatasetModuleFactoryWithParquetExport(
                                 path,
-                                download_config=download_config,
-                                revision=dataset_info.sha).get_module()
+                                commit_hash=commit_hash,
+                                download_config=download_config).get_module()
                         except Exception as e:
                             logger.error(e)
                         # Otherwise we must use the dataset script if the user trusts it
+                        # Compatibility with older datasets versions whose factory still accepts `revision`
+                        if has_attr_in_class(HubDatasetModuleFactoryWithScript, 'revision'):
+                            return HubDatasetModuleFactoryWithScript(
+                                path,
+                                revision=revision,
+                                download_config=download_config,
+                                download_mode=download_mode,
+                                dynamic_modules_path=dynamic_modules_path,
+                                trust_remote_code=trust_remote_code,
+                            ).get_module()
+
                         return HubDatasetModuleFactoryWithScript(
                             path,
-                            revision=revision,
+                            commit_hash=commit_hash,
                             download_config=download_config,
                             download_mode=download_mode,
                             dynamic_modules_path=dynamic_modules_path,
                             trust_remote_code=trust_remote_code,
                         ).get_module()
                     else:
+                        # Compatibility with older datasets versions whose factory still accepts `revision`
+                        if has_attr_in_class(HubDatasetModuleFactoryWithoutScript, 'revision'):
+                            return HubDatasetModuleFactoryWithoutScript(
+                                path,
+                                revision=revision,
+                                data_dir=data_dir,
+                                data_files=data_files,
+                                download_config=download_config,
+                                download_mode=download_mode,
+                            ).get_module()
+
                         return HubDatasetModuleFactoryWithoutScript(
                             path,
-                            revision=revision,
+                            commit_hash=commit_hash,
                             data_dir=data_dir,
                             data_files=data_files,
                             download_config=download_config,
@@ -1292,6 +1340,7 @@ class DatasetsWrapperHF:
                 except Exception as e1:
                     # All the attempts failed, before raising the error we should check if the module is already cached
                     logger.error(f'>> Error loading {path}: {e1}')
+
                     try:
                         return CachedDatasetModuleFactory(
                             path,
diff --git a/modelscope/msdatasets/utils/oss_utils.py b/modelscope/msdatasets/utils/oss_utils.py
index 3686e2fc..b1457f4b 100644
--- a/modelscope/msdatasets/utils/oss_utils.py
+++ b/modelscope/msdatasets/utils/oss_utils.py
@@ -4,7 +4,6 @@ from __future__ import print_function
 import multiprocessing
 import os

-import oss2
 from datasets.utils.file_utils import hash_url_to_filename

 from modelscope.hub.api import HubApi
@@ -40,6 +39,7 @@ class OssUtilities:
         self.multipart_threshold = 50 * 1024 * 1024
         self.max_retries = 3

+        import oss2
         self.resumable_store_download = oss2.ResumableDownloadStore(
             root=self.resumable_store_root_path)
         self.resumable_store_upload = oss2.ResumableStore(
@@ -47,6 +47,8 @@ class OssUtilities:
         self.api = HubApi()

     def _do_init(self, oss_config):
+        import oss2
+
         self.key = oss_config[ACCESS_ID]
         self.secret = oss_config[ACCESS_SECRET]
         self.token = oss_config[SECURITY_TOKEN]
@@ -78,6 +80,7 @@ class OssUtilities:

     def download(self, oss_file_name: str,
                  download_config: DataDownloadConfig):
+        import oss2
         cache_dir = download_config.cache_dir
         candidate_key = os.path.join(self.oss_dir, oss_file_name)
         candidate_key_backup = os.path.join(self.oss_backup_dir, oss_file_name)
@@ -126,6 +129,7 @@ class OssUtilities:

     def upload(self, oss_object_name: str, local_file_path: str,
                indicate_individual_progress: bool,
                upload_mode: UploadMode) -> str:
+        import oss2
         retry_count = 0
         object_key = os.path.join(self.oss_dir, oss_object_name)
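The function-local `import oss2` statements above pair with the removal of `oss2` from `requirements/framework.txt` below: the dependency becomes optional, and any ImportError surfaces only when OSS storage is actually used. A minimal sketch of the pattern, using a hypothetical guard helper:

```python
def _require_oss2():
    # Hypothetical guard: import lazily and fail with a friendly hint.
    try:
        import oss2
    except ImportError as e:
        raise ImportError('oss2 is required for OSS dataset transfer; '
                          'install it with `pip install oss2`.') from e
    return oss2
```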
diff --git a/modelscope/utils/import_utils.py b/modelscope/utils/import_utils.py
index a3297684..984df7af 100644
--- a/modelscope/utils/import_utils.py
+++ b/modelscope/utils/import_utils.py
@@ -3,6 +3,7 @@
 import ast
 import functools
 import importlib
+import inspect
 import logging
 import os
 import os.path as osp
@@ -480,3 +481,23 @@ class LazyImportModule(ModuleType):
             importlib.import_module(module_name)
         else:
             logger.warning(f'{signature} not found in ast index file')
+
+
+def has_attr_in_class(cls, attribute_name) -> bool:
+    """
+    Check whether `cls.__init__` accepts a parameter with the given name.
+
+    Args:
+        cls: The target class.
+        attribute_name: The `__init__` parameter name to look for.
+
+    Returns:
+        True if `cls.__init__` takes a parameter with that name, else False.
+    """
+    init_method = cls.__init__
+    signature = inspect.signature(init_method)
+
+    parameters = signature.parameters
+    param_names = list(parameters.keys())
+
+    return attribute_name in param_names
diff --git a/requirements/datasets.txt b/requirements/datasets.txt
index d20154e1..f6eece3f 100644
--- a/requirements/datasets.txt
+++ b/requirements/datasets.txt
@@ -1,6 +1,6 @@
 addict
 attrs
-datasets>=3.0.0,<=3.0.1
+datasets>=3.0.0,<=3.2.0
 einops
 oss2
 Pillow
diff --git a/requirements/framework.txt b/requirements/framework.txt
index 9aa4c045..35a809c0 100644
--- a/requirements/framework.txt
+++ b/requirements/framework.txt
@@ -1,8 +1,7 @@
 addict
 attrs
-datasets>=3.0.0,<=3.0.1
+datasets>=3.0.0,<=3.2.0
 einops
-oss2
 Pillow
 python-dateutil>=2.1
 scipy
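To close the loop, a hedged sketch of how the new helper drives the version branching in `hf_datasets_util.py`: older `datasets` releases expose a `revision` parameter on the module-factory `__init__`, while 3.2-era releases take `commit_hash` instead.

```python
from datasets.load import HubDatasetModuleFactoryWithScript
from modelscope.utils.import_utils import has_attr_in_class

if has_attr_in_class(HubDatasetModuleFactoryWithScript, 'revision'):
    print('older datasets API: pass revision=...')
else:
    print('newer datasets API: pass commit_hash=...')
```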