mirror of https://github.com/modelscope/modelscope.git
Merge remote-tracking branch 'origin' into ms-swift3_0
@@ -1168,8 +1168,9 @@ class HubApi:
         return {MODELSCOPE_REQUEST_ID: str(uuid.uuid4().hex),
                 **headers}

-    def get_file_base_path(self, namespace: str, dataset_name: str) -> str:
-        return f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?'
+    def get_file_base_path(self, repo_id: str) -> str:
+        _namespace, _dataset_name = repo_id.split('/')
+        return f'{self.endpoint}/api/v1/datasets/{_namespace}/{_dataset_name}/repo?'
         # return f'{endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?Revision={revision}&FilePath='
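For orientation: `get_file_base_path` now takes a single `repo_id` instead of separate `namespace`/`dataset_name` arguments. A minimal usage sketch (the dataset id below is hypothetical):

    from modelscope.hub.api import HubApi

    api = HubApi()
    # hypothetical 'namespace/dataset' id, for illustration only
    base_path = api.get_file_base_path(repo_id='my_namespace/my_dataset')
    # -> f'{api.endpoint}/api/v1/datasets/my_namespace/my_dataset/repo?'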
@@ -39,6 +39,7 @@ def check_local_model_is_latest(
     """
     try:
         model_id = get_model_id_from_cache(model_root_path)
+        model_id = model_id.replace('___', '.')
        # make headers
         headers = {
             'user-agent':
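The added line maps the on-disk cache name back to the real model id: ModelScope's cache layout escapes dots in model ids as `___`. A small illustration (the model id is a stand-in):

    # a model id like 'Qwen/Qwen2.5-7B' is cached on disk as 'Qwen/Qwen2___5-7B'
    model_id = 'Qwen/Qwen2___5-7B'.replace('___', '.')
    assert model_id == 'Qwen/Qwen2.5-7B'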
@@ -9,7 +9,8 @@ from pathlib import Path
 from typing import Dict, List, Optional, Union

 from modelscope.hub.api import HubApi, ModelScopeConfig
-from modelscope.hub.constants import MODELSCOPE_SHOW_INDIVIDUAL_PROGRESS_THRESHOLD
+from modelscope.hub.constants import \
+    MODELSCOPE_SHOW_INDIVIDUAL_PROGRESS_THRESHOLD
 from modelscope.hub.errors import InvalidParameter
 from modelscope.hub.file_download import (create_temporary_directory_and_cache,
                                           download_file, get_file_download_url)
@@ -480,6 +481,9 @@ def _download_file_lists(
             raise InvalidParameter(
                 f'Invalid repo type: {repo_type}, supported types: {REPO_TYPE_SUPPORT}'
             )
+        disable_tqdm = len(
+            filtered_repo_files
+        ) > MODELSCOPE_SHOW_INDIVIDUAL_PROGRESS_THRESHOLD  # noqa
         download_file(
             url,
             repo_file,
@@ -487,8 +491,7 @@ def _download_file_lists(
             cache,
             headers,
             cookies,
-            disable_tqdm=len(filtered_repo_files) >
-            MODELSCOPE_SHOW_INDIVIDUAL_PROGRESS_THRESHOLD,
+            disable_tqdm=disable_tqdm,
         )

     if len(filtered_repo_files) > 0:
@@ -41,19 +41,19 @@ from datasets.packaged_modules import (_EXTENSION_TO_MODULE,
                                        _MODULE_TO_EXTENSIONS,
                                        _PACKAGED_DATASETS_MODULES)
 from datasets.utils import file_utils
-from datasets.utils.file_utils import (OfflineModeIsEnabled,
-                                       _raise_if_offline_mode_is_enabled,
+from datasets.utils.file_utils import (_raise_if_offline_mode_is_enabled,
                                        cached_path, is_local_path,
                                        is_relative_path,
                                        relative_to_absolute_path)
 from datasets.utils.info_utils import is_small_dataset
 from datasets.utils.metadata import MetadataConfigs
-from datasets.utils.py_utils import get_imports, map_nested
+from datasets.utils.py_utils import get_imports
 from datasets.utils.track import tracked_str
 from fsspec import filesystem
 from fsspec.core import _un_chain
 from fsspec.utils import stringify_path
 from huggingface_hub import (DatasetCard, DatasetCardData)
+from huggingface_hub.errors import OfflineModeIsEnabled
 from huggingface_hub.hf_api import DatasetInfo as HfDatasetInfo
 from huggingface_hub.hf_api import HfApi, RepoFile, RepoFolder
 from packaging import version
@@ -62,7 +62,8 @@ from modelscope import HubApi
 from modelscope.hub.utils.utils import get_endpoint
 from modelscope.msdatasets.utils.hf_file_utils import get_from_cache_ms
 from modelscope.utils.config_ds import MS_DATASETS_CACHE
-from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE
+from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE, DEFAULT_DATASET_REVISION
+from modelscope.utils.import_utils import has_attr_in_class
 from modelscope.utils.logger import get_logger

 logger = get_logger()
@@ -97,7 +98,7 @@ def _download_ms(self, url_or_filename: str, download_config: DownloadConfig) ->
     if is_relative_path(url_or_filename):
         # append the relative path to the base_path
         # url_or_filename = url_or_path_join(self._base_path, url_or_filename)
-        revision = revision or 'master'
+        revision = revision or DEFAULT_DATASET_REVISION
         # Note: make sure the FilePath is the last param
         params: dict = {'Source': 'SDK', 'Revision': revision, 'FilePath': url_or_filename}
         params: str = urlencode(params)
@@ -162,7 +163,7 @@ def _dataset_info(
     dataset_hub_id, dataset_type = _api.get_dataset_id_and_type(
         dataset_name=_dataset_name, namespace=_namespace)

-    revision: str = revision or 'master'
+    revision: str = revision or DEFAULT_DATASET_REVISION
     data = _api.get_dataset_infos(dataset_hub_id=dataset_hub_id,
                                   revision=revision,
                                   files_metadata=files_metadata,
@@ -234,7 +235,7 @@ def _list_repo_tree(
     while True:
         data: dict = _api.list_repo_tree(dataset_name=_dataset_name,
                                          namespace=_namespace,
-                                         revision=revision or 'master',
+                                         revision=revision or DEFAULT_DATASET_REVISION,
                                          root_path=path_in_repo or None,
                                          recursive=True,
                                          page_number=page_number,
@@ -277,7 +278,7 @@ def _get_paths_info(
     dataset_hub_id, dataset_type = _api.get_dataset_id_and_type(
         dataset_name=_dataset_name, namespace=_namespace)

-    revision: str = revision or 'master'
+    revision: str = revision or DEFAULT_DATASET_REVISION
     data = _api.get_dataset_infos(dataset_hub_id=dataset_hub_id,
                                   revision=revision,
                                   files_metadata=False,
@@ -296,6 +297,29 @@ def _get_paths_info(
     ]


+def _download_repo_file(repo_id: str, path_in_repo: str, download_config: DownloadConfig, revision: str):
+    _api = HubApi()
+    _namespace, _dataset_name = repo_id.split('/')
+
+    if download_config and download_config.download_desc is None:
+        download_config.download_desc = f'Downloading [{path_in_repo}]'
+    try:
+        url_or_filename = _api.get_dataset_file_url(
+            file_name=path_in_repo,
+            dataset_name=_dataset_name,
+            namespace=_namespace,
+            revision=revision,
+            extension_filter=False,
+        )
+        repo_file_path = cached_path(
+            url_or_filename=url_or_filename, download_config=download_config)
+    except FileNotFoundError as e:
+        repo_file_path = ''
+        logger.error(e)
+
+    return repo_file_path
+
+
 def get_fs_token_paths(
     urlpath,
     storage_options=None,
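The new `_download_repo_file` helper centralizes the URL-resolution-plus-`cached_path` pattern that was previously duplicated for README and script downloads, and it returns `''` on a missing file instead of raising. A usage sketch (repo id and revision are illustrative):

    from datasets import DownloadConfig

    # hypothetical repo id; revision mirrors the DEFAULT_DATASET_REVISION fallback
    readme_path = _download_repo_file(
        repo_id='my_namespace/my_dataset',
        path_in_repo='README.md',
        download_config=DownloadConfig(),
        revision='master',
    )
    if not readme_path:
        print('README.md not found in the repo')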
@@ -536,9 +560,6 @@ def _get_data_patterns(


 def get_module_without_script(self) -> DatasetModule:
-    _ms_api = HubApi()
-    _repo_id: str = self.name
-    _namespace, _dataset_name = _repo_id.split('/')

     # hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info(
     #     self.name,
@@ -549,28 +570,20 @@ def get_module_without_script(self) -> DatasetModule:
     # even if metadata_configs is not None (which means that we will resolve files for each config later)
     # we cannot skip resolving all files because we need to infer module name by files extensions
     # revision = hfh_dataset_info.sha # fix the revision in case there are new commits in the meantime
-    revision = self.revision or 'master'
+    revision = self.download_config.storage_options.get('revision', None) or DEFAULT_DATASET_REVISION
     base_path = f"hf://datasets/{self.name}@{revision}/{self.data_dir or ''}".rstrip(
         '/')

+    repo_id: str = self.name
     download_config = self.download_config.copy()
     if download_config.download_desc is None:
         download_config.download_desc = 'Downloading [README.md]'
-    try:
-        url_or_filename = _ms_api.get_dataset_file_url(
-            file_name='README.md',
-            dataset_name=_dataset_name,
-            namespace=_namespace,
-            revision=revision,
-            extension_filter=False,
-        )
-
-        dataset_readme_path = cached_path(
-            url_or_filename=url_or_filename, download_config=download_config)
-        dataset_card_data = DatasetCard.load(Path(dataset_readme_path)).data
-    except FileNotFoundError:
-        dataset_card_data = DatasetCardData()
+    dataset_readme_path = _download_repo_file(
+        repo_id=repo_id,
+        path_in_repo='README.md',
+        download_config=download_config,
+        revision=revision)
+
+    dataset_card_data = DatasetCard.load(Path(dataset_readme_path)).data if dataset_readme_path else DatasetCardData()
     subset_name: str = download_config.storage_options.get('name', None)

     metadata_configs = MetadataConfigs.from_dataset_card_data(
@@ -646,10 +659,7 @@ def get_module_without_script(self) -> DatasetModule:
     builder_kwargs = {
         # "base_path": hf_hub_url(self.name, "", revision=revision).rstrip("/"),
         'base_path':
-        _ms_api.get_file_base_path(
-            namespace=_namespace,
-            dataset_name=_dataset_name,
-        ),
+        HubApi().get_file_base_path(repo_id=repo_id),
         'repo_id':
         self.name,
         'dataset_name':
@@ -760,20 +770,22 @@ def _download_additional_modules(

 def get_module_with_script(self) -> DatasetModule:

-    _api = HubApi()
-    _dataset_name: str = self.name.split('/')[-1]
-    _namespace: str = self.name.split('/')[0]
+    repo_id: str = self.name
+    _namespace, _dataset_name = repo_id.split('/')
+    revision = self.download_config.storage_options.get('revision', None) or DEFAULT_DATASET_REVISION

     script_file_name = f'{_dataset_name}.py'
-    script_url: str = _api.get_dataset_file_url(
-        file_name=script_file_name,
-        dataset_name=_dataset_name,
-        namespace=_namespace,
-        revision=self.revision,
-        extension_filter=False,
+    local_script_path = _download_repo_file(
+        repo_id=repo_id,
+        path_in_repo=script_file_name,
+        download_config=self.download_config,
+        revision=revision,
     )
-    local_script_path = cached_path(
-        url_or_filename=script_url, download_config=self.download_config)
+    if not local_script_path:
+        raise FileNotFoundError(
+            f'Cannot find {script_file_name} in {repo_id} at revision {revision}. '
+            f'Please create {script_file_name} in the repo.'
+        )

     dataset_infos_path = None
     # try:
@@ -790,22 +802,19 @@ def get_module_with_script(self) -> DatasetModule:
     #     logger.info(f'Cannot find dataset_infos.json: {e}')
     #     dataset_infos_path = None

-    dataset_readme_url: str = _api.get_dataset_file_url(
-        file_name='README.md',
-        dataset_name=_dataset_name,
-        namespace=_namespace,
-        revision=self.revision,
-        extension_filter=False,
+    dataset_readme_path = _download_repo_file(
+        repo_id=repo_id,
+        path_in_repo='README.md',
+        download_config=self.download_config,
+        revision=revision
     )
-    dataset_readme_path = cached_path(
-        url_or_filename=dataset_readme_url, download_config=self.download_config)

     imports = get_imports(local_script_path)
     local_imports = _download_additional_modules(
-        name=self.name,
+        name=repo_id,
         dataset_name=_dataset_name,
         namespace=_namespace,
-        revision=self.revision,
+        revision=revision,
         imports=imports,
         download_config=self.download_config,
     )
@@ -821,7 +830,7 @@ def get_module_with_script(self) -> DatasetModule:
         dynamic_modules_path=dynamic_modules_path,
         module_namespace='datasets',
         subdirectory_name=hash,
-        name=self.name,
+        name=repo_id,
     )
     if not os.path.exists(importable_file_path):
         trust_remote_code = resolve_trust_remote_code(trust_remote_code=self.trust_remote_code, repo_id=self.name)
@@ -833,12 +842,12 @@ def get_module_with_script(self) -> DatasetModule:
             dynamic_modules_path=dynamic_modules_path,
             module_namespace='datasets',
             subdirectory_name=hash,
-            name=self.name,
+            name=repo_id,
             download_mode=self.download_mode,
         )
     else:
         raise ValueError(
-            f'Loading {self.name} requires you to execute the dataset script in that'
+            f'Loading {repo_id} requires you to execute the dataset script in that'
             ' repo on your local machine. Make sure you have read the code there to avoid malicious use, then'
             ' set the option `trust_remote_code=True` to remove this error.'
         )
@@ -846,14 +855,14 @@ def get_module_with_script(self) -> DatasetModule:
         dynamic_modules_path=dynamic_modules_path,
         module_namespace='datasets',
         subdirectory_name=hash,
-        name=self.name,
+        name=repo_id,
     )
     # make the new module to be noticed by the import system
     importlib.invalidate_caches()
     builder_kwargs = {
         # "base_path": hf_hub_url(self.name, "", revision=self.revision).rstrip("/"),
-        'base_path': _api.get_file_base_path(namespace=_namespace, dataset_name=_dataset_name),
-        'repo_id': self.name,
+        'base_path': HubApi().get_file_base_path(repo_id=repo_id),
+        'repo_id': repo_id,
     }
     return DatasetModule(module_path, hash, builder_kwargs)
@@ -1126,9 +1135,11 @@ class DatasetsWrapperHF:
     ) -> DatasetModule:

         subset_name: str = download_kwargs.pop('name', None)
+        revision = revision or DEFAULT_DATASET_REVISION
         if download_config is None:
             download_config = DownloadConfig(**download_kwargs)
         download_config.storage_options.update({'name': subset_name})
+        download_config.storage_options.update({'revision': revision})

         if download_config and download_config.cache_dir is None:
             download_config.cache_dir = MS_DATASETS_CACHE
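These two `storage_options` entries are how the subset name and revision reach the module factories later in this file (e.g. `get_module_without_script` and `get_module_with_script` read them back). A minimal sketch of the round trip (values are illustrative):

    from datasets import DownloadConfig

    download_config = DownloadConfig()
    download_config.storage_options.update({'name': 'default', 'revision': 'master'})

    # ...later, inside a module factory:
    subset_name = download_config.storage_options.get('name', None)   # 'default'
    revision = download_config.storage_options.get('revision', None)  # 'master'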
@@ -1197,7 +1208,7 @@ class DatasetsWrapperHF:
                 data_files=data_files,
                 download_mode=download_mode).get_module()
         # Try remotely
-        elif is_relative_path(path) and path.count('/') <= 1:
+        elif is_relative_path(path) and path.count('/') == 1:
             try:
                 _raise_if_offline_mode_is_enabled()
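Tightening `<= 1` to `== 1` means only fully qualified `namespace/name` ids are treated as hub datasets; a bare name with no slash no longer triggers a remote lookup. For illustration:

    'my_namespace/my_dataset'.count('/') == 1  # True  -> resolved against the hub
    'my_local_dataset'.count('/') == 1         # False -> handled locally instead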
@@ -1236,6 +1247,15 @@ class DatasetsWrapperHF:
                     )
                 else:
                     raise e

+            dataset_readme_path = _download_repo_file(
+                repo_id=path,
+                path_in_repo='README.md',
+                download_config=download_config,
+                revision=revision,
+            )
+            commit_hash = os.path.basename(os.path.dirname(dataset_readme_path))
+
             if filename in [
                 sibling.rfilename for sibling in dataset_info.siblings
             ]:  # contains a dataset script
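Reading the added lines, `commit_hash` is recovered from the cache layout: `cached_path` appears to store the file under a directory named after the resolved revision, so the parent directory name of the downloaded README doubles as the commit hash. That is an inference from this code, not documented behavior; a sketch:

    import os

    # hypothetical cache path as produced by cached_path
    dataset_readme_path = '/cache/datasets/downloads/abc123def/README.md'
    commit_hash = os.path.basename(os.path.dirname(dataset_readme_path))
    # -> 'abc123def'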
@@ -1264,26 +1284,54 @@ class DatasetsWrapperHF:
             # This fails when the dataset has multiple configs and a default config and
             # the user didn't specify a configuration name (_require_default_config_name=True).
             try:
+                if has_attr_in_class(HubDatasetModuleFactoryWithParquetExport, 'revision'):
+                    return HubDatasetModuleFactoryWithParquetExport(
+                        path,
+                        revision=revision,
+                        download_config=download_config).get_module()
+
                 return HubDatasetModuleFactoryWithParquetExport(
                     path,
-                    download_config=download_config,
-                    revision=dataset_info.sha).get_module()
+                    commit_hash=commit_hash,
+                    download_config=download_config).get_module()
             except Exception as e:
                 logger.error(e)

             # Otherwise we must use the dataset script if the user trusts it
+            # To be adapted to the old version of datasets
+            if has_attr_in_class(HubDatasetModuleFactoryWithScript, 'revision'):
+                return HubDatasetModuleFactoryWithScript(
+                    path,
+                    revision=revision,
+                    download_config=download_config,
+                    download_mode=download_mode,
+                    dynamic_modules_path=dynamic_modules_path,
+                    trust_remote_code=trust_remote_code,
+                ).get_module()
+
             return HubDatasetModuleFactoryWithScript(
                 path,
-                revision=revision,
+                commit_hash=commit_hash,
                 download_config=download_config,
                 download_mode=download_mode,
                 dynamic_modules_path=dynamic_modules_path,
                 trust_remote_code=trust_remote_code,
             ).get_module()
         else:
+            # To be adapted to the old version of datasets
+            if has_attr_in_class(HubDatasetModuleFactoryWithoutScript, 'revision'):
+                return HubDatasetModuleFactoryWithoutScript(
+                    path,
+                    revision=revision,
+                    data_dir=data_dir,
+                    data_files=data_files,
+                    download_config=download_config,
+                    download_mode=download_mode,
+                ).get_module()
+
             return HubDatasetModuleFactoryWithoutScript(
                 path,
-                revision=revision,
+                commit_hash=commit_hash,
                 data_dir=data_dir,
                 data_files=data_files,
                 download_config=download_config,
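The repeated `has_attr_in_class(..., 'revision')` checks keep one code path working across `datasets` versions: per the branches above, older factory classes still accept `revision=`, while newer ones take `commit_hash=`. A condensed sketch of the dispatch pattern (argument lists trimmed for brevity):

    if has_attr_in_class(HubDatasetModuleFactoryWithoutScript, 'revision'):
        # older datasets: factory __init__ still accepts `revision`
        factory = HubDatasetModuleFactoryWithoutScript(
            path, revision=revision, download_config=download_config)
    else:
        # newer datasets: `revision` was replaced by `commit_hash`
        factory = HubDatasetModuleFactoryWithoutScript(
            path, commit_hash=commit_hash, download_config=download_config)
    module = factory.get_module()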
@@ -1292,6 +1340,7 @@ class DatasetsWrapperHF:
         except Exception as e1:
             # All the attempts failed, before raising the error we should check if the module is already cached
+            logger.error(f'>> Error loading {path}: {e1}')

             try:
                 return CachedDatasetModuleFactory(
                     path,
@@ -4,7 +4,6 @@ from __future__ import print_function
 import multiprocessing
 import os

-import oss2
 from datasets.utils.file_utils import hash_url_to_filename

 from modelscope.hub.api import HubApi
@@ -40,6 +39,7 @@ class OssUtilities:
         self.multipart_threshold = 50 * 1024 * 1024
         self.max_retries = 3

+        import oss2
         self.resumable_store_download = oss2.ResumableDownloadStore(
             root=self.resumable_store_root_path)
         self.resumable_store_upload = oss2.ResumableStore(
@@ -47,6 +47,8 @@ class OssUtilities:
         self.api = HubApi()

     def _do_init(self, oss_config):
+        import oss2
+
         self.key = oss_config[ACCESS_ID]
         self.secret = oss_config[ACCESS_SECRET]
         self.token = oss_config[SECURITY_TOKEN]
@@ -78,6 +80,7 @@ class OssUtilities:

     def download(self, oss_file_name: str,
                  download_config: DataDownloadConfig):
+        import oss2
         cache_dir = download_config.cache_dir
         candidate_key = os.path.join(self.oss_dir, oss_file_name)
         candidate_key_backup = os.path.join(self.oss_backup_dir, oss_file_name)
@@ -126,6 +129,7 @@ class OssUtilities:
     def upload(self, oss_object_name: str, local_file_path: str,
                indicate_individual_progress: bool,
                upload_mode: UploadMode) -> str:
+        import oss2
        retry_count = 0
         object_key = os.path.join(self.oss_dir, oss_object_name)
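Moving `import oss2` from module scope into each method that touches OSS makes `oss2` effectively optional: importing this module no longer fails when `oss2` is not installed, and the import cost is paid only on first use (Python caches the module after the first import). The same pattern in isolation (endpoint and bucket name are placeholders):

    def open_bucket(bucket_name: str):
        # deferred import: oss2 is only required when OSS features are used
        import oss2
        auth = oss2.AnonymousAuth()
        return oss2.Bucket(auth, 'https://oss-cn-hangzhou.aliyuncs.com', bucket_name)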
@@ -3,6 +3,7 @@
 import ast
 import functools
 import importlib
+import inspect
 import logging
 import os
 import os.path as osp
@@ -480,3 +481,23 @@ class LazyImportModule(ModuleType):
             importlib.import_module(module_name)
         else:
             logger.warning(f'{signature} not found in ast index file')
+
+
+def has_attr_in_class(cls, attribute_name) -> bool:
+    """
+    Determine whether the attribute is a parameter of the class's __init__.
+
+    Args:
+        cls: target class.
+        attribute_name: the attribute name.
+
+    Returns:
+        Whether the attribute is in the class.
+    """
+    init_method = cls.__init__
+    signature = inspect.signature(init_method)
+
+    parameters = signature.parameters
+    param_names = list(parameters.keys())
+
+    return attribute_name in param_names
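Note that despite its name, `has_attr_in_class` inspects the `__init__` signature, not class attributes: it reports whether `__init__` accepts a parameter of that name. A quick illustration with a stand-in class:

    class Demo:
        def __init__(self, path, revision=None):
            self.path = path
            self.revision = revision

    has_attr_in_class(Demo, 'revision')     # True: __init__ takes `revision`
    has_attr_in_class(Demo, 'commit_hash')  # False: no such parameter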
@@ -1,6 +1,6 @@
 addict
 attrs
-datasets>=3.0.0,<=3.0.1
+datasets>=3.0.0,<=3.2.0
 einops
 oss2
 Pillow
@@ -1,8 +1,7 @@
 addict
 attrs
-datasets>=3.0.0,<=3.0.1
+datasets>=3.0.0,<=3.2.0
 einops
 oss2
 Pillow
 python-dateutil>=2.1
 scipy