support latest datasets version (#1163)

Authored by Xingjun.Wang on 2025-01-02 17:06:46 +08:00, committed by GitHub
parent d257fdeac6
commit 71a54c1a51
6 changed files with 143 additions and 69 deletions

View File

@@ -1168,8 +1168,9 @@ class HubApi:
         return {MODELSCOPE_REQUEST_ID: str(uuid.uuid4().hex),
                 **headers}
 
-    def get_file_base_path(self, namespace: str, dataset_name: str) -> str:
-        return f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?'
+    def get_file_base_path(self, repo_id: str) -> str:
+        _namespace, _dataset_name = repo_id.split('/')
+        return f'{self.endpoint}/api/v1/datasets/{_namespace}/{_dataset_name}/repo?'
         # return f'{endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?Revision={revision}&FilePath='
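Note on the new signature: get_file_base_path now takes a single 'namespace/dataset_name' repo_id instead of separate namespace and dataset_name arguments. A minimal usage sketch (the repo id below is hypothetical):

    from modelscope import HubApi

    api = HubApi()
    # One repo_id string replaces the old namespace= / dataset_name= pair.
    base_path = api.get_file_base_path(repo_id='damo/nlp_example_dataset')
    # base_path ends up as '<endpoint>/api/v1/datasets/damo/nlp_example_dataset/repo?'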

View File

@@ -41,19 +41,19 @@ from datasets.packaged_modules import (_EXTENSION_TO_MODULE,
                                        _MODULE_TO_EXTENSIONS,
                                        _PACKAGED_DATASETS_MODULES)
 from datasets.utils import file_utils
-from datasets.utils.file_utils import (OfflineModeIsEnabled,
-                                       _raise_if_offline_mode_is_enabled,
+from datasets.utils.file_utils import (_raise_if_offline_mode_is_enabled,
                                        cached_path, is_local_path,
                                        is_relative_path,
                                        relative_to_absolute_path)
 from datasets.utils.info_utils import is_small_dataset
 from datasets.utils.metadata import MetadataConfigs
-from datasets.utils.py_utils import get_imports, map_nested
+from datasets.utils.py_utils import get_imports
 from datasets.utils.track import tracked_str
 from fsspec import filesystem
 from fsspec.core import _un_chain
 from fsspec.utils import stringify_path
 from huggingface_hub import (DatasetCard, DatasetCardData)
+from huggingface_hub.errors import OfflineModeIsEnabled
 from huggingface_hub.hf_api import DatasetInfo as HfDatasetInfo
 from huggingface_hub.hf_api import HfApi, RepoFile, RepoFolder
 from packaging import version
@@ -62,7 +62,8 @@ from modelscope import HubApi
 from modelscope.hub.utils.utils import get_endpoint
 from modelscope.msdatasets.utils.hf_file_utils import get_from_cache_ms
 from modelscope.utils.config_ds import MS_DATASETS_CACHE
-from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE
+from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE, DEFAULT_DATASET_REVISION
+from modelscope.utils.import_utils import has_attr_in_class
 from modelscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -97,7 +98,7 @@ def _download_ms(self, url_or_filename: str, download_config: DownloadConfig) ->
     if is_relative_path(url_or_filename):
         # append the relative path to the base_path
         # url_or_filename = url_or_path_join(self._base_path, url_or_filename)
-        revision = revision or 'master'
+        revision = revision or DEFAULT_DATASET_REVISION
         # Note: make sure the FilePath is the last param
         params: dict = {'Source': 'SDK', 'Revision': revision, 'FilePath': url_or_filename}
         params: str = urlencode(params)
@@ -162,7 +163,7 @@ def _dataset_info(
     dataset_hub_id, dataset_type = _api.get_dataset_id_and_type(
         dataset_name=_dataset_name, namespace=_namespace)
 
-    revision: str = revision or 'master'
+    revision: str = revision or DEFAULT_DATASET_REVISION
     data = _api.get_dataset_infos(dataset_hub_id=dataset_hub_id,
                                   revision=revision,
                                   files_metadata=files_metadata,
@@ -234,7 +235,7 @@ def _list_repo_tree(
     while True:
         data: dict = _api.list_repo_tree(dataset_name=_dataset_name,
                                          namespace=_namespace,
-                                         revision=revision or 'master',
+                                         revision=revision or DEFAULT_DATASET_REVISION,
                                          root_path=path_in_repo or None,
                                          recursive=True,
                                          page_number=page_number,
@@ -277,7 +278,7 @@ def _get_paths_info(
     dataset_hub_id, dataset_type = _api.get_dataset_id_and_type(
         dataset_name=_dataset_name, namespace=_namespace)
 
-    revision: str = revision or 'master'
+    revision: str = revision or DEFAULT_DATASET_REVISION
     data = _api.get_dataset_infos(dataset_hub_id=dataset_hub_id,
                                   revision=revision,
                                   files_metadata=False,
@@ -296,6 +297,29 @@ def _get_paths_info(
     ]
 
 
+def _download_repo_file(repo_id: str, path_in_repo: str, download_config: DownloadConfig, revision: str):
+    _api = HubApi()
+    _namespace, _dataset_name = repo_id.split('/')
+
+    if download_config and download_config.download_desc is None:
+        download_config.download_desc = f'Downloading [{path_in_repo}]'
+
+    try:
+        url_or_filename = _api.get_dataset_file_url(
+            file_name=path_in_repo,
+            dataset_name=_dataset_name,
+            namespace=_namespace,
+            revision=revision,
+            extension_filter=False,
+        )
+        repo_file_path = cached_path(
+            url_or_filename=url_or_filename, download_config=download_config)
+    except FileNotFoundError as e:
+        repo_file_path = ''
+        logger.error(e)
+
+    return repo_file_path
+
+
 def get_fs_token_paths(
     urlpath,
     storage_options=None,
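The new _download_repo_file helper centralizes the README/script fetching used by the module factories below. A rough usage sketch within this module (repo id and cache dir are illustrative); note that it logs and returns an empty string instead of raising when the file is missing:

    from datasets import DownloadConfig

    download_config = DownloadConfig(cache_dir='/tmp/ms_datasets_cache')
    readme_path = _download_repo_file(
        repo_id='damo/nlp_example_dataset',  # hypothetical repo
        path_in_repo='README.md',
        download_config=download_config,
        revision='master',
    )
    if not readme_path:
        print('README.md not found in this repo/revision')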
@@ -536,9 +560,6 @@ def _get_data_patterns(
 
 def get_module_without_script(self) -> DatasetModule:
-    _ms_api = HubApi()
-    _repo_id: str = self.name
-    _namespace, _dataset_name = _repo_id.split('/')
 
     # hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info(
     #     self.name,
@@ -549,28 +570,20 @@ def get_module_without_script(self) -> DatasetModule:
     # even if metadata_configs is not None (which means that we will resolve files for each config later)
     # we cannot skip resolving all files because we need to infer module name by files extensions
     # revision = hfh_dataset_info.sha  # fix the revision in case there are new commits in the meantime
-    revision = self.revision or 'master'
+    revision = self.download_config.storage_options.get('revision', None) or DEFAULT_DATASET_REVISION
     base_path = f"hf://datasets/{self.name}@{revision}/{self.data_dir or ''}".rstrip(
         '/')
+    repo_id: str = self.name
 
     download_config = self.download_config.copy()
-    if download_config.download_desc is None:
-        download_config.download_desc = 'Downloading [README.md]'
-    try:
-        url_or_filename = _ms_api.get_dataset_file_url(
-            file_name='README.md',
-            dataset_name=_dataset_name,
-            namespace=_namespace,
-            revision=revision,
-            extension_filter=False,
-        )
-        dataset_readme_path = cached_path(
-            url_or_filename=url_or_filename, download_config=download_config)
-        dataset_card_data = DatasetCard.load(Path(dataset_readme_path)).data
-    except FileNotFoundError:
-        dataset_card_data = DatasetCardData()
+    dataset_readme_path = _download_repo_file(
+        repo_id=repo_id,
+        path_in_repo='README.md',
+        download_config=download_config,
+        revision=revision)
+    dataset_card_data = DatasetCard.load(Path(dataset_readme_path)).data if dataset_readme_path else DatasetCardData()
 
     subset_name: str = download_config.storage_options.get('name', None)
     metadata_configs = MetadataConfigs.from_dataset_card_data(
@@ -646,10 +659,7 @@ def get_module_without_script(self) -> DatasetModule:
     builder_kwargs = {
         # "base_path": hf_hub_url(self.name, "", revision=revision).rstrip("/"),
         'base_path':
-        _ms_api.get_file_base_path(
-            namespace=_namespace,
-            dataset_name=_dataset_name,
-        ),
+        HubApi().get_file_base_path(repo_id=repo_id),
         'repo_id':
         self.name,
         'dataset_name':
@@ -760,20 +770,22 @@ def _download_additional_modules(
 
 def get_module_with_script(self) -> DatasetModule:
-    _api = HubApi()
-    _dataset_name: str = self.name.split('/')[-1]
-    _namespace: str = self.name.split('/')[0]
+    repo_id: str = self.name
+    _namespace, _dataset_name = repo_id.split('/')
+    revision = self.download_config.storage_options.get('revision', None) or DEFAULT_DATASET_REVISION
 
     script_file_name = f'{_dataset_name}.py'
-    script_url: str = _api.get_dataset_file_url(
-        file_name=script_file_name,
-        dataset_name=_dataset_name,
-        namespace=_namespace,
-        revision=self.revision,
-        extension_filter=False,
+    local_script_path = _download_repo_file(
+        repo_id=repo_id,
+        path_in_repo=script_file_name,
+        download_config=self.download_config,
+        revision=revision,
     )
-    local_script_path = cached_path(
-        url_or_filename=script_url, download_config=self.download_config)
+    if not local_script_path:
+        raise FileNotFoundError(
+            f'Cannot find {script_file_name} in {repo_id} at revision {revision}. '
+            f'Please create {script_file_name} in the repo.'
+        )
 
     dataset_infos_path = None
     # try:
@@ -790,22 +802,19 @@ def get_module_with_script(self) -> DatasetModule:
     #     logger.info(f'Cannot find dataset_infos.json: {e}')
     #     dataset_infos_path = None
 
-    dataset_readme_url: str = _api.get_dataset_file_url(
-        file_name='README.md',
-        dataset_name=_dataset_name,
-        namespace=_namespace,
-        revision=self.revision,
-        extension_filter=False,
+    dataset_readme_path = _download_repo_file(
+        repo_id=repo_id,
+        path_in_repo='README.md',
+        download_config=self.download_config,
+        revision=revision
     )
-    dataset_readme_path = cached_path(
-        url_or_filename=dataset_readme_url, download_config=self.download_config)
 
     imports = get_imports(local_script_path)
     local_imports = _download_additional_modules(
-        name=self.name,
+        name=repo_id,
         dataset_name=_dataset_name,
         namespace=_namespace,
-        revision=self.revision,
+        revision=revision,
         imports=imports,
         download_config=self.download_config,
     )
@@ -821,7 +830,7 @@ def get_module_with_script(self) -> DatasetModule:
         dynamic_modules_path=dynamic_modules_path,
         module_namespace='datasets',
         subdirectory_name=hash,
-        name=self.name,
+        name=repo_id,
     )
     if not os.path.exists(importable_file_path):
         trust_remote_code = resolve_trust_remote_code(trust_remote_code=self.trust_remote_code, repo_id=self.name)
@@ -833,12 +842,12 @@ def get_module_with_script(self) -> DatasetModule:
                 dynamic_modules_path=dynamic_modules_path,
                 module_namespace='datasets',
                 subdirectory_name=hash,
-                name=self.name,
+                name=repo_id,
                 download_mode=self.download_mode,
             )
         else:
             raise ValueError(
-                f'Loading {self.name} requires you to execute the dataset script in that'
+                f'Loading {repo_id} requires you to execute the dataset script in that'
                 ' repo on your local machine. Make sure you have read the code there to avoid malicious use, then'
                 ' set the option `trust_remote_code=True` to remove this error.'
             )
@@ -846,14 +855,14 @@ def get_module_with_script(self) -> DatasetModule:
         dynamic_modules_path=dynamic_modules_path,
         module_namespace='datasets',
         subdirectory_name=hash,
-        name=self.name,
+        name=repo_id,
     )
     # make the new module to be noticed by the import system
     importlib.invalidate_caches()
     builder_kwargs = {
         # "base_path": hf_hub_url(self.name, "", revision=self.revision).rstrip("/"),
-        'base_path': _api.get_file_base_path(namespace=_namespace, dataset_name=_dataset_name),
-        'repo_id': self.name,
+        'base_path': HubApi().get_file_base_path(repo_id=repo_id),
+        'repo_id': repo_id,
     }
     return DatasetModule(module_path, hash, builder_kwargs)
@@ -1126,9 +1135,11 @@ class DatasetsWrapperHF:
     ) -> DatasetModule:
 
         subset_name: str = download_kwargs.pop('name', None)
+        revision = revision or DEFAULT_DATASET_REVISION
         if download_config is None:
             download_config = DownloadConfig(**download_kwargs)
         download_config.storage_options.update({'name': subset_name})
+        download_config.storage_options.update({'revision': revision})
 
         if download_config and download_config.cache_dir is None:
             download_config.cache_dir = MS_DATASETS_CACHE
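With this change the requested revision rides along in download_config.storage_options, next to the subset name, so the patched module factories can read it back without a dedicated keyword argument. A small sketch of the round trip:

    from datasets import DownloadConfig
    from modelscope.utils.constant import DEFAULT_DATASET_REVISION

    download_config = DownloadConfig()
    download_config.storage_options.update({'name': 'default', 'revision': 'v1.0.0'})

    # Later, inside get_module_without_script / get_module_with_script:
    revision = download_config.storage_options.get('revision', None) or DEFAULT_DATASET_REVISION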
@@ -1197,7 +1208,7 @@ class DatasetsWrapperHF:
                 data_files=data_files,
                 download_mode=download_mode).get_module()
         # Try remotely
-        elif is_relative_path(path) and path.count('/') <= 1:
+        elif is_relative_path(path) and path.count('/') == 1:
             try:
                 _raise_if_offline_mode_is_enabled()
@@ -1236,6 +1247,15 @@ class DatasetsWrapperHF:
                         )
                     else:
                         raise e
+
+                dataset_readme_path = _download_repo_file(
+                    repo_id=path,
+                    path_in_repo='README.md',
+                    download_config=download_config,
+                    revision=revision,
+                )
+                commit_hash = os.path.basename(os.path.dirname(dataset_readme_path))
+
                 if filename in [
                         sibling.rfilename for sibling in dataset_info.siblings
                 ]:  # contains a dataset script
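The newer datasets factories expect a resolved commit_hash, which is derived here from the parent directory of the cached README rather than from a separate API call. A toy illustration of that derivation (the path is made up):

    import os

    # Hypothetical location returned by _download_repo_file via cached_path:
    dataset_readme_path = '/path/to/cache/abc123commit/README.md'

    commit_hash = os.path.basename(os.path.dirname(dataset_readme_path))
    print(commit_hash)  # -> 'abc123commit'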
@@ -1264,26 +1284,54 @@ class DatasetsWrapperHF:
                     # This fails when the dataset has multiple configs and a default config and
                     # the user didn't specify a configuration name (_require_default_config_name=True).
                     try:
+                        if has_attr_in_class(HubDatasetModuleFactoryWithParquetExport, 'revision'):
+                            return HubDatasetModuleFactoryWithParquetExport(
+                                path,
+                                revision=revision,
+                                download_config=download_config).get_module()
+
                         return HubDatasetModuleFactoryWithParquetExport(
                             path,
-                            download_config=download_config,
-                            revision=dataset_info.sha).get_module()
+                            commit_hash=commit_hash,
+                            download_config=download_config).get_module()
                     except Exception as e:
                         logger.error(e)
 
                     # Otherwise we must use the dataset script if the user trusts it
+                    # To be adapted to the old version of datasets
+                    if has_attr_in_class(HubDatasetModuleFactoryWithScript, 'revision'):
+                        return HubDatasetModuleFactoryWithScript(
+                            path,
+                            revision=revision,
+                            download_config=download_config,
+                            download_mode=download_mode,
+                            dynamic_modules_path=dynamic_modules_path,
+                            trust_remote_code=trust_remote_code,
+                        ).get_module()
+
                     return HubDatasetModuleFactoryWithScript(
                         path,
-                        revision=revision,
+                        commit_hash=commit_hash,
                        download_config=download_config,
                         download_mode=download_mode,
                         dynamic_modules_path=dynamic_modules_path,
                         trust_remote_code=trust_remote_code,
                     ).get_module()
                 else:
+                    # To be adapted to the old version of datasets
+                    if has_attr_in_class(HubDatasetModuleFactoryWithoutScript, 'revision'):
+                        return HubDatasetModuleFactoryWithoutScript(
+                            path,
+                            revision=revision,
+                            data_dir=data_dir,
+                            data_files=data_files,
+                            download_config=download_config,
+                            download_mode=download_mode,
+                        ).get_module()
+
                     return HubDatasetModuleFactoryWithoutScript(
                         path,
-                        revision=revision,
+                        commit_hash=commit_hash,
                         data_dir=data_dir,
                         data_files=data_files,
                         download_config=download_config,
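The pattern in this hunk absorbs an API change between datasets releases: older Hub module factories accept revision=, newer ones accept commit_hash=, so has_attr_in_class inspects the constructor and picks the branch. A self-contained sketch of the same dispatch with stand-in classes:

    import inspect

    class OldStyleFactory:  # stands in for a factory that still takes revision=
        def __init__(self, path, revision=None):
            self.path, self.revision = path, revision

    class NewStyleFactory:  # stands in for a factory that takes commit_hash=
        def __init__(self, path, commit_hash=None):
            self.path, self.commit_hash = path, commit_hash

    def build(factory_cls, path, revision, commit_hash):
        # Pass revision= to old-style constructors, commit_hash= to new-style ones.
        params = inspect.signature(factory_cls.__init__).parameters
        if 'revision' in params:
            return factory_cls(path, revision=revision)
        return factory_cls(path, commit_hash=commit_hash)

    print(build(OldStyleFactory, 'ns/ds', 'master', 'abc123').revision)      # master
    print(build(NewStyleFactory, 'ns/ds', 'master', 'abc123').commit_hash)   # abc123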
@@ -1292,6 +1340,7 @@ class DatasetsWrapperHF:
             except Exception as e1:
                 # All the attempts failed, before raising the error we should check if the module is already cached
                 logger.error(f'>> Error loading {path}: {e1}')
+
                 try:
                     return CachedDatasetModuleFactory(
                         path,

View File

@@ -4,7 +4,6 @@ from __future__ import print_function
 import multiprocessing
 import os
 
-import oss2
 from datasets.utils.file_utils import hash_url_to_filename
 
 from modelscope.hub.api import HubApi
@@ -40,6 +39,7 @@ class OssUtilities:
         self.multipart_threshold = 50 * 1024 * 1024
         self.max_retries = 3
 
+        import oss2
         self.resumable_store_download = oss2.ResumableDownloadStore(
             root=self.resumable_store_root_path)
         self.resumable_store_upload = oss2.ResumableStore(
@@ -47,6 +47,8 @@ class OssUtilities:
         self.api = HubApi()
 
     def _do_init(self, oss_config):
+        import oss2
+
         self.key = oss_config[ACCESS_ID]
         self.secret = oss_config[ACCESS_SECRET]
         self.token = oss_config[SECURITY_TOKEN]
@@ -78,6 +80,7 @@ class OssUtilities:
     def download(self, oss_file_name: str,
                  download_config: DataDownloadConfig):
+        import oss2
         cache_dir = download_config.cache_dir
         candidate_key = os.path.join(self.oss_dir, oss_file_name)
         candidate_key_backup = os.path.join(self.oss_backup_dir, oss_file_name)
@@ -126,6 +129,7 @@ class OssUtilities:
     def upload(self, oss_object_name: str, local_file_path: str,
                indicate_individual_progress: bool,
                upload_mode: UploadMode) -> str:
+        import oss2
         retry_count = 0
         object_key = os.path.join(self.oss_dir, oss_object_name)
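oss2 is now imported lazily inside the methods that actually touch OSS (and dropped from one of the requirements files below), so importing this module no longer requires oss2 to be installed. The general pattern, as a tiny sketch with a hypothetical helper:

    def _transfer_via_oss(object_key: str, dest_path: str) -> str:
        # Deferred import: the ImportError surfaces only if OSS transfer is actually used.
        import oss2  # noqa: F401
        # ... resumable download/upload via oss2 would go here ...
        return dest_path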

View File

@@ -3,6 +3,7 @@
 import ast
 import functools
 import importlib
+import inspect
 import logging
 import os
 import os.path as osp
@@ -480,3 +481,23 @@ class LazyImportModule(ModuleType):
             importlib.import_module(module_name)
         else:
             logger.warning(f'{signature} not found in ast index file')
+
+
+def has_attr_in_class(cls, attribute_name) -> bool:
+    """
+    Determine whether the attribute name appears among the parameters of the class's __init__.
+
+    Args:
+        cls: target class.
+        attribute_name: the attribute name.
+
+    Returns:
+        Whether the attribute is in the class's __init__ signature.
+    """
+    init_method = cls.__init__
+    signature = inspect.signature(init_method)
+    parameters = signature.parameters
+    param_names = list(parameters.keys())
+
+    return attribute_name in param_names
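A quick check of the helper's behavior (the example class is hypothetical):

    from modelscope.utils.import_utils import has_attr_in_class

    class Example:
        def __init__(self, path, revision=None):
            self.path = path
            self.revision = revision

    assert has_attr_in_class(Example, 'revision') is True
    assert has_attr_in_class(Example, 'commit_hash') is False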

View File

@@ -1,6 +1,6 @@
 addict
 attrs
-datasets>=3.0.0,<=3.0.1
+datasets>=3.0.0,<=3.2.0
 einops
 oss2
 Pillow

View File

@@ -1,8 +1,7 @@
 addict
 attrs
-datasets>=3.0.0,<=3.0.1
+datasets>=3.0.0,<=3.2.0
 einops
-oss2
 Pillow
 python-dateutil>=2.1
 scipy