@@ -41,19 +41,19 @@ from datasets.packaged_modules import (_EXTENSION_TO_MODULE,
                                         _MODULE_TO_EXTENSIONS,
                                         _PACKAGED_DATASETS_MODULES)
 from datasets.utils import file_utils
-from datasets.utils.file_utils import (OfflineModeIsEnabled,
-                                       _raise_if_offline_mode_is_enabled,
+from datasets.utils.file_utils import (_raise_if_offline_mode_is_enabled,
                                        cached_path, is_local_path,
                                        is_relative_path,
                                        relative_to_absolute_path)
 from datasets.utils.info_utils import is_small_dataset
 from datasets.utils.metadata import MetadataConfigs
-from datasets.utils.py_utils import get_imports, map_nested
+from datasets.utils.py_utils import get_imports
 from datasets.utils.track import tracked_str
 from fsspec import filesystem
 from fsspec.core import _un_chain
 from fsspec.utils import stringify_path
 from huggingface_hub import (DatasetCard, DatasetCardData)
+from huggingface_hub.errors import OfflineModeIsEnabled
 from huggingface_hub.hf_api import DatasetInfo as HfDatasetInfo
 from huggingface_hub.hf_api import HfApi, RepoFile, RepoFolder
 from packaging import version
@@ -62,7 +62,8 @@ from modelscope import HubApi
 from modelscope.hub.utils.utils import get_endpoint
 from modelscope.msdatasets.utils.hf_file_utils import get_from_cache_ms
 from modelscope.utils.config_ds import MS_DATASETS_CACHE
-from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE
+from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE, DEFAULT_DATASET_REVISION
+from modelscope.utils.import_utils import has_attr_in_class
 from modelscope.utils.logger import get_logger

 logger = get_logger()
@@ -97,7 +98,7 @@ def _download_ms(self, url_or_filename: str, download_config: DownloadConfig) ->
     if is_relative_path(url_or_filename):
         # append the relative path to the base_path
         # url_or_filename = url_or_path_join(self._base_path, url_or_filename)
-        revision = revision or 'master'
+        revision = revision or DEFAULT_DATASET_REVISION
         # Note: make sure the FilePath is the last param
         params: dict = {'Source': 'SDK', 'Revision': revision, 'FilePath': url_or_filename}
         params: str = urlencode(params)
@@ -162,7 +163,7 @@ def _dataset_info(
     dataset_hub_id, dataset_type = _api.get_dataset_id_and_type(
         dataset_name=_dataset_name, namespace=_namespace)

-    revision: str = revision or 'master'
+    revision: str = revision or DEFAULT_DATASET_REVISION
     data = _api.get_dataset_infos(dataset_hub_id=dataset_hub_id,
                                   revision=revision,
                                   files_metadata=files_metadata,
@@ -234,7 +235,7 @@ def _list_repo_tree(
     while True:
         data: dict = _api.list_repo_tree(dataset_name=_dataset_name,
                                          namespace=_namespace,
-                                         revision=revision or 'master',
+                                         revision=revision or DEFAULT_DATASET_REVISION,
                                          root_path=path_in_repo or None,
                                          recursive=True,
                                          page_number=page_number,
@@ -277,7 +278,7 @@ def _get_paths_info(
     dataset_hub_id, dataset_type = _api.get_dataset_id_and_type(
         dataset_name=_dataset_name, namespace=_namespace)

-    revision: str = revision or 'master'
+    revision: str = revision or DEFAULT_DATASET_REVISION
     data = _api.get_dataset_infos(dataset_hub_id=dataset_hub_id,
                                   revision=revision,
                                   files_metadata=False,
@@ -296,6 +297,29 @@ def _get_paths_info(
     ]


+def _download_repo_file(repo_id: str, path_in_repo: str, download_config: DownloadConfig, revision: str):
+    _api = HubApi()
+    _namespace, _dataset_name = repo_id.split('/')
+
+    if download_config and download_config.download_desc is None:
+        download_config.download_desc = f'Downloading [{path_in_repo}]'
+    try:
+        url_or_filename = _api.get_dataset_file_url(
+            file_name=path_in_repo,
+            dataset_name=_dataset_name,
+            namespace=_namespace,
+            revision=revision,
+            extension_filter=False,
+        )
+        repo_file_path = cached_path(
+            url_or_filename=url_or_filename, download_config=download_config)
+    except FileNotFoundError as e:
+        repo_file_path = ''
+        logger.error(e)
+
+    return repo_file_path
+
+
 def get_fs_token_paths(
     urlpath,
     storage_options=None,
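For reviewers, a minimal sketch of how the new `_download_repo_file` helper is meant to be called; the repo id and revision below are illustrative, not from the patch:

```python
# Illustrative usage of the helper added above; repo id/revision are made up.
from datasets import DownloadConfig

download_config = DownloadConfig()
readme_path = _download_repo_file(
    repo_id='some-namespace/some-dataset',  # hypothetical repo
    path_in_repo='README.md',
    download_config=download_config,
    revision='master',
)
# On FileNotFoundError the helper logs the error and returns '' instead of
# raising, so callers must treat an empty string as "file absent".
if not readme_path:
    print('README.md not found; falling back to defaults')
```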
@@ -536,9 +560,6 @@ def _get_data_patterns(


 def get_module_without_script(self) -> DatasetModule:
-    _ms_api = HubApi()
-    _repo_id: str = self.name
-    _namespace, _dataset_name = _repo_id.split('/')

     # hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info(
     #     self.name,
@@ -549,28 +570,20 @@ def get_module_without_script(self) -> DatasetModule:
     # even if metadata_configs is not None (which means that we will resolve files for each config later)
     # we cannot skip resolving all files because we need to infer module name by files extensions
     # revision = hfh_dataset_info.sha # fix the revision in case there are new commits in the meantime
-    revision = self.revision or 'master'
+    revision = self.download_config.storage_options.get('revision', None) or DEFAULT_DATASET_REVISION
     base_path = f"hf://datasets/{self.name}@{revision}/{self.data_dir or ''}".rstrip(
         '/')

+    repo_id: str = self.name
     download_config = self.download_config.copy()
-    if download_config.download_desc is None:
-        download_config.download_desc = 'Downloading [README.md]'
-    try:
-        url_or_filename = _ms_api.get_dataset_file_url(
-            file_name='README.md',
-            dataset_name=_dataset_name,
-            namespace=_namespace,
-            revision=revision,
-            extension_filter=False,
-        )
-
-        dataset_readme_path = cached_path(
-            url_or_filename=url_or_filename, download_config=download_config)
-        dataset_card_data = DatasetCard.load(Path(dataset_readme_path)).data
-    except FileNotFoundError:
-        dataset_card_data = DatasetCardData()
+    dataset_readme_path = _download_repo_file(
+        repo_id=repo_id,
+        path_in_repo='README.md',
+        download_config=download_config,
+        revision=revision)
+
+    dataset_card_data = DatasetCard.load(Path(dataset_readme_path)).data if dataset_readme_path else DatasetCardData()
+    subset_name: str = download_config.storage_options.get('name', None)

     metadata_configs = MetadataConfigs.from_dataset_card_data(
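The hunk above collapses the old try/except around README fetching into a single conditional expression. A standalone restatement of that contract (the function name `load_card_data` is mine, for illustration only):

```python
from pathlib import Path
from huggingface_hub import DatasetCard, DatasetCardData

def load_card_data(dataset_readme_path: str) -> DatasetCardData:
    # _download_repo_file returns '' when README.md is missing, so an empty
    # DatasetCardData is used rather than failing the whole module load.
    if dataset_readme_path:
        return DatasetCard.load(Path(dataset_readme_path)).data
    return DatasetCardData()
```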
@@ -646,10 +659,7 @@ def get_module_without_script(self) -> DatasetModule:
     builder_kwargs = {
         # "base_path": hf_hub_url(self.name, "", revision=revision).rstrip("/"),
         'base_path':
-        _ms_api.get_file_base_path(
-            namespace=_namespace,
-            dataset_name=_dataset_name,
-        ),
+        HubApi().get_file_base_path(repo_id=repo_id),
         'repo_id':
         self.name,
         'dataset_name':
@@ -760,20 +770,22 @@ def _download_additional_modules(

 def get_module_with_script(self) -> DatasetModule:

-    _api = HubApi()
-    _dataset_name: str = self.name.split('/')[-1]
-    _namespace: str = self.name.split('/')[0]
+    repo_id: str = self.name
+    _namespace, _dataset_name = repo_id.split('/')
+    revision = self.download_config.storage_options.get('revision', None) or DEFAULT_DATASET_REVISION

     script_file_name = f'{_dataset_name}.py'
-    script_url: str = _api.get_dataset_file_url(
-        file_name=script_file_name,
-        dataset_name=_dataset_name,
-        namespace=_namespace,
-        revision=self.revision,
-        extension_filter=False,
+    local_script_path = _download_repo_file(
+        repo_id=repo_id,
+        path_in_repo=script_file_name,
+        download_config=self.download_config,
+        revision=revision,
     )
-    local_script_path = cached_path(
-        url_or_filename=script_url, download_config=self.download_config)
+    if not local_script_path:
+        raise FileNotFoundError(
+            f'Cannot find {script_file_name} in {repo_id} at revision {revision}. '
+            f'Please create {script_file_name} in the repo.'
+        )

     dataset_infos_path = None
     # try:
@@ -790,22 +802,19 @@ def get_module_with_script(self) -> DatasetModule:
     # logger.info(f'Cannot find dataset_infos.json: {e}')
     # dataset_infos_path = None

-    dataset_readme_url: str = _api.get_dataset_file_url(
-        file_name='README.md',
-        dataset_name=_dataset_name,
-        namespace=_namespace,
-        revision=self.revision,
-        extension_filter=False,
+    dataset_readme_path = _download_repo_file(
+        repo_id=repo_id,
+        path_in_repo='README.md',
+        download_config=self.download_config,
+        revision=revision
     )
-    dataset_readme_path = cached_path(
-        url_or_filename=dataset_readme_url, download_config=self.download_config)

     imports = get_imports(local_script_path)
     local_imports = _download_additional_modules(
-        name=self.name,
+        name=repo_id,
         dataset_name=_dataset_name,
         namespace=_namespace,
-        revision=self.revision,
+        revision=revision,
         imports=imports,
         download_config=self.download_config,
     )
@@ -821,7 +830,7 @@ def get_module_with_script(self) -> DatasetModule:
         dynamic_modules_path=dynamic_modules_path,
         module_namespace='datasets',
         subdirectory_name=hash,
-        name=self.name,
+        name=repo_id,
     )
     if not os.path.exists(importable_file_path):
         trust_remote_code = resolve_trust_remote_code(trust_remote_code=self.trust_remote_code, repo_id=self.name)
@@ -833,12 +842,12 @@ def get_module_with_script(self) -> DatasetModule:
                 dynamic_modules_path=dynamic_modules_path,
                 module_namespace='datasets',
                 subdirectory_name=hash,
-                name=self.name,
+                name=repo_id,
                 download_mode=self.download_mode,
             )
         else:
             raise ValueError(
-                f'Loading {self.name} requires you to execute the dataset script in that'
+                f'Loading {repo_id} requires you to execute the dataset script in that'
                 ' repo on your local machine. Make sure you have read the code there to avoid malicious use, then'
                 ' set the option `trust_remote_code=True` to remove this error.'
             )
@@ -846,14 +855,14 @@ def get_module_with_script(self) -> DatasetModule:
         dynamic_modules_path=dynamic_modules_path,
         module_namespace='datasets',
         subdirectory_name=hash,
-        name=self.name,
+        name=repo_id,
     )
     # make the new module to be noticed by the import system
     importlib.invalidate_caches()
     builder_kwargs = {
         # "base_path": hf_hub_url(self.name, "", revision=self.revision).rstrip("/"),
-        'base_path': _api.get_file_base_path(namespace=_namespace, dataset_name=_dataset_name),
-        'repo_id': self.name,
+        'base_path': HubApi().get_file_base_path(repo_id=repo_id),
+        'repo_id': repo_id,
     }
     return DatasetModule(module_path, hash, builder_kwargs)
@@ -1126,9 +1135,11 @@ class DatasetsWrapperHF:
     ) -> DatasetModule:

         subset_name: str = download_kwargs.pop('name', None)
+        revision = revision or DEFAULT_DATASET_REVISION
         if download_config is None:
             download_config = DownloadConfig(**download_kwargs)
         download_config.storage_options.update({'name': subset_name})
+        download_config.storage_options.update({'revision': revision})

         if download_config and download_config.cache_dir is None:
             download_config.cache_dir = MS_DATASETS_CACHE
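The two added lines stash the subset name and revision on `DownloadConfig.storage_options`, so the patched factory methods, which only receive the config, can recover them later. A minimal sketch of that round trip (values are illustrative):

```python
from datasets import DownloadConfig

# Entry point: stash caller-supplied values on the config.
download_config = DownloadConfig()
download_config.storage_options.update({'name': 'default'})      # subset name, illustrative
download_config.storage_options.update({'revision': 'master'})   # DEFAULT_DATASET_REVISION

# Later, inside get_module_without_script / get_module_with_script:
subset_name = download_config.storage_options.get('name', None)
revision = download_config.storage_options.get('revision', None) or 'master'
assert (subset_name, revision) == ('default', 'master')
```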
@@ -1197,7 +1208,7 @@ class DatasetsWrapperHF:
                 data_files=data_files,
                 download_mode=download_mode).get_module()
         # Try remotely
-        elif is_relative_path(path) and path.count('/') <= 1:
+        elif is_relative_path(path) and path.count('/') == 1:
             try:
                 _raise_if_offline_mode_is_enabled()

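Tightening the guard from `<= 1` to `== 1` means bare dataset names no longer fall into the remote branch; only fully qualified `namespace/name` ids do. A simplified restatement (it omits the scheme and absolute-path checks that `is_relative_path` performs):

```python
def is_remote_candidate(path: str) -> bool:
    # Simplified: the real condition also requires is_relative_path(path).
    return path.count('/') == 1

assert is_remote_candidate('damo/some-dataset')   # namespace/name: tried remotely
assert not is_remote_candidate('some-dataset')    # bare name: no longer matches
assert not is_remote_candidate('a/b/c')           # nested path: never matched
```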
@@ -1236,6 +1247,15 @@ class DatasetsWrapperHF:
                         )
                     else:
                         raise e
+
+                dataset_readme_path = _download_repo_file(
+                    repo_id=path,
+                    path_in_repo='README.md',
+                    download_config=download_config,
+                    revision=revision,
+                )
+                commit_hash = os.path.basename(os.path.dirname(dataset_readme_path))
+
                 if filename in [
                         sibling.rfilename for sibling in dataset_info.siblings
                 ]: # contains a dataset script
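The added `commit_hash` line assumes that `cached_path` materializes hub files under a per-revision directory, so the parent directory name of the cached README is the resolved commit. A sketch of that assumption (the path below is illustrative, not the actual cache layout):

```python
import os

# Assumed layout: .../<repo_cache>/<commit_hash>/README.md
dataset_readme_path = '/cache/datasets/ns___ds/abc123def/README.md'
commit_hash = os.path.basename(os.path.dirname(dataset_readme_path))
assert commit_hash == 'abc123def'
```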
@@ -1264,26 +1284,54 @@ class DatasetsWrapperHF:
                     # This fails when the dataset has multiple configs and a default config and
                     # the user didn't specify a configuration name (_require_default_config_name=True).
                     try:
+                        if has_attr_in_class(HubDatasetModuleFactoryWithParquetExport, 'revision'):
+                            return HubDatasetModuleFactoryWithParquetExport(
+                                path,
+                                revision=revision,
+                                download_config=download_config).get_module()
+
                         return HubDatasetModuleFactoryWithParquetExport(
                             path,
-                            download_config=download_config,
-                            revision=dataset_info.sha).get_module()
+                            commit_hash=commit_hash,
+                            download_config=download_config).get_module()
                     except Exception as e:
                         logger.error(e)

                 # Otherwise we must use the dataset script if the user trusts it
+                # To be adapted to the old version of datasets
+                if has_attr_in_class(HubDatasetModuleFactoryWithScript, 'revision'):
+                    return HubDatasetModuleFactoryWithScript(
+                        path,
+                        revision=revision,
+                        download_config=download_config,
+                        download_mode=download_mode,
+                        dynamic_modules_path=dynamic_modules_path,
+                        trust_remote_code=trust_remote_code,
+                    ).get_module()
+
                 return HubDatasetModuleFactoryWithScript(
                     path,
-                    revision=revision,
+                    commit_hash=commit_hash,
                     download_config=download_config,
                     download_mode=download_mode,
                     dynamic_modules_path=dynamic_modules_path,
                     trust_remote_code=trust_remote_code,
                 ).get_module()
             else:
+                # To be adapted to the old version of datasets
+                if has_attr_in_class(HubDatasetModuleFactoryWithoutScript, 'revision'):
+                    return HubDatasetModuleFactoryWithoutScript(
+                        path,
+                        revision=revision,
+                        data_dir=data_dir,
+                        data_files=data_files,
+                        download_config=download_config,
+                        download_mode=download_mode,
+                    ).get_module()
+
                 return HubDatasetModuleFactoryWithoutScript(
                     path,
-                    revision=revision,
+                    commit_hash=commit_hash,
                     data_dir=data_dir,
                     data_files=data_files,
                     download_config=download_config,
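All three factory branches use the same compatibility pattern: probe whether the installed `datasets` still accepts the old `revision` keyword via `has_attr_in_class`, otherwise fall back to the newer `commit_hash` keyword. A self-contained sketch of the pattern (the `has_attr_in_class` body here is my approximation of modelscope's helper, and the factory classes are stand-ins):

```python
import inspect

def has_attr_in_class(cls, name: str) -> bool:
    # Approximation of the helper: does the constructor accept this keyword?
    return name in inspect.signature(cls.__init__).parameters

class OldFactory:  # stands in for pre-change datasets factories
    def __init__(self, path, revision=None):
        self.path, self.revision = path, revision

class NewFactory:  # stands in for factories that take commit_hash
    def __init__(self, path, commit_hash=None):
        self.path, self.commit_hash = path, commit_hash

for factory_cls in (OldFactory, NewFactory):
    if has_attr_in_class(factory_cls, 'revision'):
        factory = factory_cls('ns/ds', revision='master')
    else:
        factory = factory_cls('ns/ds', commit_hash='abc123def')
```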
@@ -1292,6 +1340,7 @@ class DatasetsWrapperHF:
         except Exception as e1:
             # All the attempts failed, before raising the error we should check if the module is already cached
+            logger.error(f'>> Error loading {path}: {e1}')

             try:
                 return CachedDatasetModuleFactory(
                     path,