support latest datasets version (#1163)
@@ -1168,8 +1168,9 @@ class HubApi:
         return {MODELSCOPE_REQUEST_ID: str(uuid.uuid4().hex),
                 **headers}
 
-    def get_file_base_path(self, namespace: str, dataset_name: str) -> str:
-        return f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?'
+    def get_file_base_path(self, repo_id: str) -> str:
+        _namespace, _dataset_name = repo_id.split('/')
+        return f'{self.endpoint}/api/v1/datasets/{_namespace}/{_dataset_name}/repo?'
         # return f'{endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?Revision={revision}&FilePath='
 
 
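Note: the helper now takes a single combined repo id and splits it internally, instead of requiring every call site to pass namespace and dataset name separately. A minimal sketch of the new call shape (the repo id `damo/nlp_example` is hypothetical):

```python
from modelscope.hub.api import HubApi

api = HubApi()
# Old form: api.get_file_base_path(namespace='damo', dataset_name='nlp_example')
# New form: pass the combined repo id; the split happens inside the helper.
base_url = api.get_file_base_path(repo_id='damo/nlp_example')
# -> f'{api.endpoint}/api/v1/datasets/damo/nlp_example/repo?'
```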
@@ -41,19 +41,19 @@ from datasets.packaged_modules import (_EXTENSION_TO_MODULE,
                                        _MODULE_TO_EXTENSIONS,
                                        _PACKAGED_DATASETS_MODULES)
 from datasets.utils import file_utils
-from datasets.utils.file_utils import (OfflineModeIsEnabled,
-                                       _raise_if_offline_mode_is_enabled,
+from datasets.utils.file_utils import (_raise_if_offline_mode_is_enabled,
                                        cached_path, is_local_path,
                                        is_relative_path,
                                        relative_to_absolute_path)
 from datasets.utils.info_utils import is_small_dataset
 from datasets.utils.metadata import MetadataConfigs
-from datasets.utils.py_utils import get_imports, map_nested
+from datasets.utils.py_utils import get_imports
 from datasets.utils.track import tracked_str
 from fsspec import filesystem
 from fsspec.core import _un_chain
 from fsspec.utils import stringify_path
 from huggingface_hub import (DatasetCard, DatasetCardData)
+from huggingface_hub.errors import OfflineModeIsEnabled
 from huggingface_hub.hf_api import DatasetInfo as HfDatasetInfo
 from huggingface_hub.hf_api import HfApi, RepoFile, RepoFolder
 from packaging import version
@@ -62,7 +62,8 @@ from modelscope import HubApi
 from modelscope.hub.utils.utils import get_endpoint
 from modelscope.msdatasets.utils.hf_file_utils import get_from_cache_ms
 from modelscope.utils.config_ds import MS_DATASETS_CACHE
-from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE
+from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE, DEFAULT_DATASET_REVISION
+from modelscope.utils.import_utils import has_attr_in_class
 from modelscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -97,7 +98,7 @@ def _download_ms(self, url_or_filename: str, download_config: DownloadConfig) ->
     if is_relative_path(url_or_filename):
         # append the relative path to the base_path
         # url_or_filename = url_or_path_join(self._base_path, url_or_filename)
-        revision = revision or 'master'
+        revision = revision or DEFAULT_DATASET_REVISION
         # Note: make sure the FilePath is the last param
         params: dict = {'Source': 'SDK', 'Revision': revision, 'FilePath': url_or_filename}
         params: str = urlencode(params)
@@ -162,7 +163,7 @@ def _dataset_info(
     dataset_hub_id, dataset_type = _api.get_dataset_id_and_type(
         dataset_name=_dataset_name, namespace=_namespace)
 
-    revision: str = revision or 'master'
+    revision: str = revision or DEFAULT_DATASET_REVISION
     data = _api.get_dataset_infos(dataset_hub_id=dataset_hub_id,
                                   revision=revision,
                                   files_metadata=files_metadata,
@@ -234,7 +235,7 @@ def _list_repo_tree(
     while True:
         data: dict = _api.list_repo_tree(dataset_name=_dataset_name,
                                          namespace=_namespace,
-                                         revision=revision or 'master',
+                                         revision=revision or DEFAULT_DATASET_REVISION,
                                          root_path=path_in_repo or None,
                                          recursive=True,
                                          page_number=page_number,
@@ -277,7 +278,7 @@ def _get_paths_info(
     dataset_hub_id, dataset_type = _api.get_dataset_id_and_type(
         dataset_name=_dataset_name, namespace=_namespace)
 
-    revision: str = revision or 'master'
+    revision: str = revision or DEFAULT_DATASET_REVISION
     data = _api.get_dataset_infos(dataset_hub_id=dataset_hub_id,
                                   revision=revision,
                                   files_metadata=False,
@@ -296,6 +297,29 @@ def _get_paths_info(
     ]
 
 
+def _download_repo_file(repo_id: str, path_in_repo: str, download_config: DownloadConfig, revision: str):
+    _api = HubApi()
+    _namespace, _dataset_name = repo_id.split('/')
+
+    if download_config and download_config.download_desc is None:
+        download_config.download_desc = f'Downloading [{path_in_repo}]'
+    try:
+        url_or_filename = _api.get_dataset_file_url(
+            file_name=path_in_repo,
+            dataset_name=_dataset_name,
+            namespace=_namespace,
+            revision=revision,
+            extension_filter=False,
+        )
+        repo_file_path = cached_path(
+            url_or_filename=url_or_filename, download_config=download_config)
+    except FileNotFoundError as e:
+        repo_file_path = ''
+        logger.error(e)
+
+    return repo_file_path
+
+
 def get_fs_token_paths(
     urlpath,
     storage_options=None,
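Note: the new `_download_repo_file` helper folds the repeated `get_dataset_file_url` + `cached_path` pair into one call and returns an empty string instead of raising when the file is absent. A minimal sketch of the caller-side contract, assuming a default `DownloadConfig` and a hypothetical repo id:

```python
from datasets import DownloadConfig

# '' signals "file not in the repo", so callers can branch on the result
# instead of wrapping every lookup in try/except FileNotFoundError.
readme_path = _download_repo_file(
    repo_id='damo/nlp_example',   # hypothetical repo id
    path_in_repo='README.md',
    download_config=DownloadConfig(),
    revision='master')
has_readme = bool(readme_path)
```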
@@ -536,9 +560,6 @@ def _get_data_patterns(
 
 
 def get_module_without_script(self) -> DatasetModule:
-    _ms_api = HubApi()
-    _repo_id: str = self.name
-    _namespace, _dataset_name = _repo_id.split('/')
 
     # hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info(
     #     self.name,
@@ -549,28 +570,20 @@ def get_module_without_script(self) -> DatasetModule:
     # even if metadata_configs is not None (which means that we will resolve files for each config later)
     # we cannot skip resolving all files because we need to infer module name by files extensions
     # revision = hfh_dataset_info.sha # fix the revision in case there are new commits in the meantime
-    revision = self.revision or 'master'
+    revision = self.download_config.storage_options.get('revision', None) or DEFAULT_DATASET_REVISION
     base_path = f"hf://datasets/{self.name}@{revision}/{self.data_dir or ''}".rstrip(
         '/')
 
+    repo_id: str = self.name
     download_config = self.download_config.copy()
-    if download_config.download_desc is None:
-        download_config.download_desc = 'Downloading [README.md]'
-    try:
-        url_or_filename = _ms_api.get_dataset_file_url(
-            file_name='README.md',
-            dataset_name=_dataset_name,
-            namespace=_namespace,
-            revision=revision,
-            extension_filter=False,
-        )
 
-        dataset_readme_path = cached_path(
-            url_or_filename=url_or_filename, download_config=download_config)
-        dataset_card_data = DatasetCard.load(Path(dataset_readme_path)).data
-    except FileNotFoundError:
-        dataset_card_data = DatasetCardData()
+    dataset_readme_path = _download_repo_file(
+        repo_id=repo_id,
+        path_in_repo='README.md',
+        download_config=download_config,
+        revision=revision)
 
+    dataset_card_data = DatasetCard.load(Path(dataset_readme_path)).data if dataset_readme_path else DatasetCardData()
     subset_name: str = download_config.storage_options.get('name', None)
 
     metadata_configs = MetadataConfigs.from_dataset_card_data(
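Note the changed revision source: the factory no longer reads `self.revision` but recovers the revision that `DatasetsWrapperHF` stashes into the shared `DownloadConfig` (see the `storage_options.update({'revision': ...})` hunk further down). A minimal sketch of that hand-off, with a hypothetical tag:

```python
from datasets import DownloadConfig

download_config = DownloadConfig()
# Producer side (DatasetsWrapperHF): stash the requested revision.
download_config.storage_options.update({'revision': 'v1.0.0'})  # hypothetical tag

# Consumer side (the module factories): read it back, defaulting to
# DEFAULT_DATASET_REVISION ('master' in ModelScope).
revision = download_config.storage_options.get('revision', None) or 'master'
```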
@@ -646,10 +659,7 @@ def get_module_without_script(self) -> DatasetModule:
     builder_kwargs = {
         # "base_path": hf_hub_url(self.name, "", revision=revision).rstrip("/"),
         'base_path':
-        _ms_api.get_file_base_path(
-            namespace=_namespace,
-            dataset_name=_dataset_name,
-        ),
+        HubApi().get_file_base_path(repo_id=repo_id),
         'repo_id':
         self.name,
         'dataset_name':
@@ -760,20 +770,22 @@ def _download_additional_modules(
 
 def get_module_with_script(self) -> DatasetModule:
 
-    _api = HubApi()
-    _dataset_name: str = self.name.split('/')[-1]
-    _namespace: str = self.name.split('/')[0]
+    repo_id: str = self.name
+    _namespace, _dataset_name = repo_id.split('/')
+    revision = self.download_config.storage_options.get('revision', None) or DEFAULT_DATASET_REVISION
 
     script_file_name = f'{_dataset_name}.py'
-    script_url: str = _api.get_dataset_file_url(
-        file_name=script_file_name,
-        dataset_name=_dataset_name,
-        namespace=_namespace,
-        revision=self.revision,
-        extension_filter=False,
+    local_script_path = _download_repo_file(
+        repo_id=repo_id,
+        path_in_repo=script_file_name,
+        download_config=self.download_config,
+        revision=revision,
     )
-    local_script_path = cached_path(
-        url_or_filename=script_url, download_config=self.download_config)
+    if not local_script_path:
+        raise FileNotFoundError(
+            f'Cannot find {script_file_name} in {repo_id} at revision {revision}. '
+            f'Please create {script_file_name} in the repo.'
+        )
 
     dataset_infos_path = None
     # try:
@@ -790,22 +802,19 @@ def get_module_with_script(self) -> DatasetModule:
     #     logger.info(f'Cannot find dataset_infos.json: {e}')
     #     dataset_infos_path = None
 
-    dataset_readme_url: str = _api.get_dataset_file_url(
-        file_name='README.md',
-        dataset_name=_dataset_name,
-        namespace=_namespace,
-        revision=self.revision,
-        extension_filter=False,
+    dataset_readme_path = _download_repo_file(
+        repo_id=repo_id,
+        path_in_repo='README.md',
+        download_config=self.download_config,
+        revision=revision
     )
-    dataset_readme_path = cached_path(
-        url_or_filename=dataset_readme_url, download_config=self.download_config)
 
     imports = get_imports(local_script_path)
     local_imports = _download_additional_modules(
-        name=self.name,
+        name=repo_id,
        dataset_name=_dataset_name,
        namespace=_namespace,
-        revision=self.revision,
+        revision=revision,
         imports=imports,
         download_config=self.download_config,
     )
@@ -821,7 +830,7 @@ def get_module_with_script(self) -> DatasetModule:
         dynamic_modules_path=dynamic_modules_path,
         module_namespace='datasets',
         subdirectory_name=hash,
-        name=self.name,
+        name=repo_id,
     )
     if not os.path.exists(importable_file_path):
         trust_remote_code = resolve_trust_remote_code(trust_remote_code=self.trust_remote_code, repo_id=self.name)
@@ -833,12 +842,12 @@ def get_module_with_script(self) -> DatasetModule:
             dynamic_modules_path=dynamic_modules_path,
             module_namespace='datasets',
             subdirectory_name=hash,
-            name=self.name,
+            name=repo_id,
             download_mode=self.download_mode,
         )
     else:
         raise ValueError(
-            f'Loading {self.name} requires you to execute the dataset script in that'
+            f'Loading {repo_id} requires you to execute the dataset script in that'
             ' repo on your local machine. Make sure you have read the code there to avoid malicious use, then'
             ' set the option `trust_remote_code=True` to remove this error.'
         )
@@ -846,14 +855,14 @@ def get_module_with_script(self) -> DatasetModule:
         dynamic_modules_path=dynamic_modules_path,
         module_namespace='datasets',
         subdirectory_name=hash,
-        name=self.name,
+        name=repo_id,
     )
     # make the new module to be noticed by the import system
     importlib.invalidate_caches()
     builder_kwargs = {
         # "base_path": hf_hub_url(self.name, "", revision=self.revision).rstrip("/"),
-        'base_path': _api.get_file_base_path(namespace=_namespace, dataset_name=_dataset_name),
-        'repo_id': self.name,
+        'base_path': HubApi().get_file_base_path(repo_id=repo_id),
+        'repo_id': repo_id,
     }
     return DatasetModule(module_path, hash, builder_kwargs)
 
@@ -1126,9 +1135,11 @@ class DatasetsWrapperHF:
     ) -> DatasetModule:
 
         subset_name: str = download_kwargs.pop('name', None)
+        revision = revision or DEFAULT_DATASET_REVISION
         if download_config is None:
             download_config = DownloadConfig(**download_kwargs)
         download_config.storage_options.update({'name': subset_name})
+        download_config.storage_options.update({'revision': revision})
 
         if download_config and download_config.cache_dir is None:
             download_config.cache_dir = MS_DATASETS_CACHE
@@ -1197,7 +1208,7 @@ class DatasetsWrapperHF:
                 data_files=data_files,
                 download_mode=download_mode).get_module()
         # Try remotely
-        elif is_relative_path(path) and path.count('/') <= 1:
+        elif is_relative_path(path) and path.count('/') == 1:
             try:
                 _raise_if_offline_mode_is_enabled()
 
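Note: tightening `<= 1` to `== 1` stops bare, slash-free names from being routed to the remote hub lookup; only `namespace/name` ids now take that path. An illustrative mirror of the guard (hypothetical values, `is_relative_path` check omitted):

```python
def goes_remote(path: str) -> bool:
    # Mirror of the tightened guard above.
    return path.count('/') == 1

assert goes_remote('damo/nlp_example')   # namespace/name -> remote lookup
assert not goes_remote('squad')          # bare names no longer match
```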
@@ -1236,6 +1247,15 @@ class DatasetsWrapperHF:
                 )
             else:
                 raise e
 
+        dataset_readme_path = _download_repo_file(
+            repo_id=path,
+            path_in_repo='README.md',
+            download_config=download_config,
+            revision=revision,
+        )
+        commit_hash = os.path.basename(os.path.dirname(dataset_readme_path))
+
         if filename in [
                 sibling.rfilename for sibling in dataset_info.siblings
         ]: # contains a dataset script
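Note: `commit_hash` is recovered from the on-disk location of the cached README, assuming the cache places each file under a directory named after the resolved commit (the path below is hypothetical):

```python
import os

dataset_readme_path = '/path/to/cache/abc123def456/README.md'  # hypothetical layout
commit_hash = os.path.basename(os.path.dirname(dataset_readme_path))
assert commit_hash == 'abc123def456'
```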
@@ -1264,26 +1284,54 @@ class DatasetsWrapperHF:
             # This fails when the dataset has multiple configs and a default config and
             # the user didn't specify a configuration name (_require_default_config_name=True).
             try:
+                if has_attr_in_class(HubDatasetModuleFactoryWithParquetExport, 'revision'):
+                    return HubDatasetModuleFactoryWithParquetExport(
+                        path,
+                        revision=revision,
+                        download_config=download_config).get_module()
+
                 return HubDatasetModuleFactoryWithParquetExport(
                     path,
-                    download_config=download_config,
-                    revision=dataset_info.sha).get_module()
+                    commit_hash=commit_hash,
+                    download_config=download_config).get_module()
             except Exception as e:
                 logger.error(e)
 
             # Otherwise we must use the dataset script if the user trusts it
+            # To be adapted to the old version of datasets
+            if has_attr_in_class(HubDatasetModuleFactoryWithScript, 'revision'):
+                return HubDatasetModuleFactoryWithScript(
+                    path,
+                    revision=revision,
+                    download_config=download_config,
+                    download_mode=download_mode,
+                    dynamic_modules_path=dynamic_modules_path,
+                    trust_remote_code=trust_remote_code,
+                ).get_module()
+
             return HubDatasetModuleFactoryWithScript(
                 path,
-                revision=revision,
+                commit_hash=commit_hash,
                 download_config=download_config,
                 download_mode=download_mode,
                 dynamic_modules_path=dynamic_modules_path,
                 trust_remote_code=trust_remote_code,
             ).get_module()
         else:
+            # To be adapted to the old version of datasets
+            if has_attr_in_class(HubDatasetModuleFactoryWithoutScript, 'revision'):
+                return HubDatasetModuleFactoryWithoutScript(
+                    path,
+                    revision=revision,
+                    data_dir=data_dir,
+                    data_files=data_files,
+                    download_config=download_config,
+                    download_mode=download_mode,
+                ).get_module()
+
             return HubDatasetModuleFactoryWithoutScript(
                 path,
-                revision=revision,
+                commit_hash=commit_hash,
                 data_dir=data_dir,
                 data_files=data_files,
                 download_config=download_config,
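Note: all three factory call sites follow the same version shim: probe whether the installed `datasets` release still accepts `revision` in the factory constructor (older releases), and fall back to the newer `commit_hash` keyword otherwise (the `datasets` 3.2.x rename, per the requirement bump below). A condensed sketch of the dispatch; `path`, `revision`, `commit_hash`, and `download_config` are assumed in scope:

```python
# Condensed form of the dispatch above (surrounding variables assumed in scope).
if has_attr_in_class(HubDatasetModuleFactoryWithScript, 'revision'):
    # Older datasets releases: factories are constructed with `revision=`.
    factory = HubDatasetModuleFactoryWithScript(
        path, revision=revision, download_config=download_config)
else:
    # Newer datasets releases renamed the parameter to `commit_hash=`.
    factory = HubDatasetModuleFactoryWithScript(
        path, commit_hash=commit_hash, download_config=download_config)
module = factory.get_module()
```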
@@ -1292,6 +1340,7 @@ class DatasetsWrapperHF:
         except Exception as e1:
             # All the attempts failed, before raising the error we should check if the module is already cached
             logger.error(f'>> Error loading {path}: {e1}')
+
             try:
                 return CachedDatasetModuleFactory(
                     path,
@@ -4,7 +4,6 @@ from __future__ import print_function
 import multiprocessing
 import os
 
-import oss2
 from datasets.utils.file_utils import hash_url_to_filename
 
 from modelscope.hub.api import HubApi
@@ -40,6 +39,7 @@ class OssUtilities:
         self.multipart_threshold = 50 * 1024 * 1024
         self.max_retries = 3
 
+        import oss2
         self.resumable_store_download = oss2.ResumableDownloadStore(
             root=self.resumable_store_root_path)
         self.resumable_store_upload = oss2.ResumableStore(
@@ -47,6 +47,8 @@ class OssUtilities:
         self.api = HubApi()
 
     def _do_init(self, oss_config):
+        import oss2
+
         self.key = oss_config[ACCESS_ID]
         self.secret = oss_config[ACCESS_SECRET]
         self.token = oss_config[SECURITY_TOKEN]
@@ -78,6 +80,7 @@ class OssUtilities:
 
     def download(self, oss_file_name: str,
                  download_config: DataDownloadConfig):
+        import oss2
         cache_dir = download_config.cache_dir
         candidate_key = os.path.join(self.oss_dir, oss_file_name)
         candidate_key_backup = os.path.join(self.oss_backup_dir, oss_file_name)
@@ -126,6 +129,7 @@ class OssUtilities:
     def upload(self, oss_object_name: str, local_file_path: str,
                indicate_individual_progress: bool,
                upload_mode: UploadMode) -> str:
+        import oss2
         retry_count = 0
         object_key = os.path.join(self.oss_dir, oss_object_name)
 
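Note: moving `import oss2` from module level into each method makes oss2 a soft dependency: importing the utilities module succeeds without it, and the ImportError only surfaces when OSS transfer code actually runs. A minimal sketch of the pattern:

```python
class LazyOssDemo:
    """Importing this module/class never touches oss2."""

    def download(self, key: str) -> None:
        import oss2  # resolved at call time; fails only if OSS is really used
        print(f'using oss2 {oss2.__version__} to fetch {key}')
```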
@@ -3,6 +3,7 @@
 import ast
 import functools
 import importlib
+import inspect
 import logging
 import os
 import os.path as osp
@@ -480,3 +481,23 @@ class LazyImportModule(ModuleType):
             importlib.import_module(module_name)
         else:
             logger.warning(f'{signature} not found in ast index file')
+
+
+def has_attr_in_class(cls, attribute_name) -> bool:
+    """
+    Determine whether the attribute is a parameter of the class constructor.
+
+    Args:
+        cls: target class.
+        attribute_name: the attribute name.
+
+    Returns:
+        Whether the attribute is accepted by `cls.__init__`.
+    """
+    init_method = cls.__init__
+    signature = inspect.signature(init_method)
+
+    parameters = signature.parameters
+    param_names = list(parameters.keys())
+
+    return attribute_name in param_names
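A toy check of the new helper. Despite the name, it inspects the parameters of `cls.__init__`, so it answers "does the constructor accept this keyword?", which is exactly what the datasets version shim above needs:

```python
class Demo:
    def __init__(self, revision=None):
        self.revision = revision

assert has_attr_in_class(Demo, 'revision')
assert not has_attr_in_class(Demo, 'commit_hash')
```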
@@ -1,6 +1,6 @@
 addict
 attrs
-datasets>=3.0.0,<=3.0.1
+datasets>=3.0.0,<=3.2.0
 einops
 oss2
 Pillow
@@ -1,8 +1,7 @@
 addict
 attrs
-datasets>=3.0.0,<=3.0.1
+datasets>=3.0.0,<=3.2.0
 einops
-oss2
 Pillow
 python-dateutil>=2.1
 scipy