[Fix | Refactor] Fix preview loading and refactor hf datasets utils (#1654)

(cherry picked from commit d4301d192c)
2026-05-18 05:05:00 +02:00 · 2026-03-27 15:28:57 +08:00
parent 8b6d450055
commit c8151df475
6 changed files with 1159 additions and 894 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -20,7 +20,8 @@ repos:
            (?x)^(
                examples/|
                modelscope/utils/ast_index_file.py|
-                modelscope/fileio/format/jsonplus.py
+                modelscope/fileio/format/jsonplus.py|
                modelscope/msdatasets/utils/_module_factories\.py
            )$
  - repo: https://github.com/pre-commit/mirrors-yapf.git
    rev: v0.30.0
@@ -31,7 +32,8 @@ repos:
                thirdparty/|
                examples/|
                modelscope/utils/ast_index_file.py|
-                modelscope/fileio/format/jsonplus.py
+                modelscope/fileio/format/jsonplus.py|
                modelscope/msdatasets/utils/_module_factories\.py
            )$
  - repo: https://github.com/pre-commit/pre-commit-hooks.git
    rev: v3.1.0
--- a/modelscope/msdatasets/utils/_compat.py
+++ b/modelscope/msdatasets/utils/_compat.py
@@ -0,0 +1,288 @@
 # isort: skip_file
 # yapf: disable
 # Copyright (c) Alibaba, Inc. and its affiliates.
 """Compatibility shims for datasets>=4.0 script-based dataset loading.
 Script-based dataset loading was removed in datasets 4.0. This module
 provides minimal re-implementations of the necessary helpers so that
 ModelScope can still load datasets that ship a custom builder .py script.
 When running with datasets<4.0 the real implementations are simply
 re-exported from datasets.load / datasets.utils.py_utils.
 """
 import importlib
 import os
 import sys
 from pathlib import Path
 from typing import List, Optional, Tuple
 from datasets import DownloadMode, config
 # ---------------------------------------------------------------------------
 # Try importing script-loading APIs from datasets<4.0
 # ---------------------------------------------------------------------------
 try:
    from datasets.load import (
        HubDatasetModuleFactoryWithScript,
        LocalDatasetModuleFactoryWithScript,
        resolve_trust_remote_code,
        _get_importable_file_path,
        _create_importable_file,
        _load_importable_file,
        init_dynamic_modules,
        files_to_hash,
    )
    from datasets.utils.py_utils import get_imports
    _HAS_SCRIPT_LOADING = True
 except ImportError:
    _HAS_SCRIPT_LOADING = False
    HubDatasetModuleFactoryWithScript = None  # type: ignore[assignment,misc]
    LocalDatasetModuleFactoryWithScript = None  # type: ignore[assignment,misc]
 # ---------------------------------------------------------------------------
 # Compat implementations (only defined when datasets>=4.0)
 # ---------------------------------------------------------------------------
 if not _HAS_SCRIPT_LOADING:
    import filecmp
    import hashlib  # noqa: F811 – only imported in this branch
    import json as _json
    import re
    import shutil
    from urllib.parse import urlparse
    from datasets.packaged_modules import _hash_python_lines
    from datasets.utils.file_utils import url_or_path_join
    from datasets.utils.hub import hf_dataset_url  # noqa: F401
    from filelock import FileLock
    def _compat_get_imports(
            file_path: str) -> List[Tuple[str, str, str, Optional[str]]]:
        """Parse a dataset script for import statements (ported from datasets<4.0)."""
        with open(file_path, encoding='utf-8') as f:
            lines = f.readlines()
        imports: List[Tuple[str, str, str, Optional[str]]] = []
        is_in_docstring = False
        for line in lines:
            docstr_start_match = re.findall(r'[\s\S]*?"""[\s\S]*?', line)
            if len(docstr_start_match) == 1:
                is_in_docstring = not is_in_docstring
            if is_in_docstring:
                continue
            match = re.match(
                r'^import\s+(\.?)([^\s\.]+)[^#\r\n]*(?:#\s+From:\s+)?([^\r\n]*)',
                line,
                flags=re.MULTILINE)
            if match is None:
                match = re.match(
                    r'^from\s+(\.?)([^\s\.]+)(?:[^\s]*)\s+import\s+[^#\r\n]*(?:#\s+From:\s+)?([^\r\n]*)',
                    line,
                    flags=re.MULTILINE)
            if match is None:
                continue
            if match.group(1):
                if any(imp[1] == match.group(2) for imp in imports):
                    continue
                if match.group(3):
                    url_path = match.group(3)
                    url_path, sub_directory = _compat_convert_github_url(
                        url_path)
                    imports.append(
                        ('external', match.group(2), url_path, sub_directory))
                elif match.group(2):
                    imports.append(
                        ('internal', match.group(2), match.group(2), None))
            else:
                if match.group(3):
                    imports.append(
                        ('library', match.group(2), match.group(3), None))
                else:
                    imports.append(
                        ('library', match.group(2), match.group(2), None))
        return imports
    def _compat_convert_github_url(url_path: str) -> Tuple[str, Optional[str]]:
        parsed = urlparse(url_path)
        sub_directory = None
        if parsed.scheme in ('http', 'https',
                             's3') and parsed.netloc == 'github.com':
            if 'blob' in url_path:
                if not url_path.endswith('.py'):
                    raise ValueError(
                        f'External import from github at {url_path} should point to a .py file'
                    )
                url_path = url_path.replace('blob', 'raw')
            else:
                github_path = parsed.path[1:]
                repo_info, branch = (
                    github_path.split('/tree/') if '/tree/' in github_path else
                    (github_path, 'master'))
                repo_owner, repo_name = repo_info.split('/')
                url_path = f'https://github.com/{repo_owner}/{repo_name}/archive/{branch}.zip'
                sub_directory = f'{repo_name}-{branch}'
        return url_path, sub_directory
    # -- dynamic module management ----------------------------------------
    def _compat_init_dynamic_modules(
        name: str = config.MODULE_NAME_FOR_DYNAMIC_MODULES,
        hf_modules_cache=None,
    ) -> str:
        hf_modules_cache = str(hf_modules_cache or config.HF_MODULES_CACHE)
        if hf_modules_cache not in sys.path:
            sys.path.append(hf_modules_cache)
        os.makedirs(hf_modules_cache, exist_ok=True)
        init_path = os.path.join(hf_modules_cache, '__init__.py')
        if not os.path.exists(init_path):
            with open(init_path, 'w'):
                pass
            importlib.invalidate_caches()
        dynamic_modules_path = os.path.join(hf_modules_cache, name)
        os.makedirs(dynamic_modules_path, exist_ok=True)
        init_path2 = os.path.join(dynamic_modules_path, '__init__.py')
        if not os.path.exists(init_path2):
            with open(init_path2, 'w'):
                pass
        return dynamic_modules_path
    def _compat_files_to_hash(file_paths) -> str:
        to_use_files: list = []
        for fp in file_paths:
            if os.path.isdir(fp):
                to_use_files.extend(list(Path(fp).rglob('*.[pP][yY]')))
            else:
                to_use_files.append(fp)
        lines: list = []
        for fp in to_use_files:
            with open(fp, encoding='utf-8') as f:
                lines.extend(f.readlines())
        return _hash_python_lines(lines)
    # -- importable file management ---------------------------------------
    def _compat_get_importable_file_path(
        dynamic_modules_path: str,
        module_namespace: str,
        subdirectory_name: str,
        name: str,
    ) -> str:
        importable_dir = os.path.join(dynamic_modules_path, module_namespace,
                                      name.replace('/', '--'))
        return os.path.join(importable_dir, subdirectory_name,
                            name.split('/')[-1] + '.py')
    def _compat_copy_script_and_resources(
        name: str,
        importable_directory_path: str,
        subdirectory_name: str,
        original_local_path: str,
        local_imports: List[Tuple[str, str]],
        additional_files: List[Tuple[str, str]],
        download_mode,
    ) -> str:
        importable_subdirectory = os.path.join(importable_directory_path,
                                               subdirectory_name)
        importable_file = os.path.join(importable_subdirectory, name + '.py')
        lock_path = importable_directory_path + '.lock'
        with FileLock(lock_path):
            if download_mode == DownloadMode.FORCE_REDOWNLOAD and os.path.exists(
                    importable_directory_path):
                shutil.rmtree(importable_directory_path)
            os.makedirs(importable_directory_path, exist_ok=True)
            init_fp = os.path.join(importable_directory_path, '__init__.py')
            if not os.path.exists(init_fp):
                with open(init_fp, 'w'):
                    pass
            os.makedirs(importable_subdirectory, exist_ok=True)
            init_fp2 = os.path.join(importable_subdirectory, '__init__.py')
            if not os.path.exists(init_fp2):
                with open(init_fp2, 'w'):
                    pass
            if not os.path.exists(importable_file):
                shutil.copyfile(original_local_path, importable_file)
                meta_path = os.path.splitext(importable_file)[0] + '.json'
                if not os.path.exists(meta_path):
                    meta = {
                        'original file path': original_local_path,
                        'local file path': importable_file
                    }
                    with open(meta_path, 'w', encoding='utf-8') as mf:
                        _json.dump(meta, mf)
            for imp_name, imp_path in local_imports:
                if os.path.isfile(imp_path):
                    dest = os.path.join(importable_subdirectory,
                                        imp_name + '.py')
                    if not os.path.exists(dest):
                        shutil.copyfile(imp_path, dest)
                elif os.path.isdir(imp_path):
                    dest = os.path.join(importable_subdirectory, imp_name)
                    if not os.path.exists(dest):
                        shutil.copytree(imp_path, dest)
                else:
                    raise ImportError(f'Error with local import at {imp_path}')
            for file_name, original_path in additional_files:
                dest_path = os.path.join(importable_subdirectory, file_name)
                if not os.path.exists(dest_path) or not filecmp.cmp(
                        original_path, dest_path):
                    shutil.copyfile(original_path, dest_path)
        return importable_file
    def _compat_create_importable_file(
        local_path: str,
        local_imports: List[Tuple[str, str]],
        additional_files: List[Tuple[str, str]],
        dynamic_modules_path: str,
        module_namespace: str,
        subdirectory_name: str,
        name: str,
        download_mode,
    ) -> None:
        importable_dir = os.path.join(dynamic_modules_path, module_namespace,
                                      name.replace('/', '--'))
        Path(importable_dir).mkdir(parents=True, exist_ok=True)
        (Path(importable_dir).parent / '__init__.py').touch(exist_ok=True)
        _compat_copy_script_and_resources(
            name=name.split('/')[-1],
            importable_directory_path=importable_dir,
            subdirectory_name=subdirectory_name,
            original_local_path=local_path,
            local_imports=local_imports,
            additional_files=additional_files,
            download_mode=download_mode,
        )
    def _compat_load_importable_file(
        dynamic_modules_path: str,
        module_namespace: str,
        subdirectory_name: str,
        name: str,
    ) -> Tuple[str, str]:
        module_path = '.'.join([
            os.path.basename(dynamic_modules_path),
            module_namespace,
            name.replace('/', '--'),
            subdirectory_name,
            name.split('/')[-1],
        ])
        return module_path, subdirectory_name
    # -- trust handling ---------------------------------------------------
    def _compat_resolve_trust_remote_code(trust_remote_code, repo_id: str):
        if trust_remote_code is None:
            raise ValueError(
                f'The repository for {repo_id} contains custom code which must be '
                f'executed to correctly load the dataset. You can inspect the repository '
                f'content at the Hub.\nPlease pass the argument `trust_remote_code=True` '
                f'to allow custom code to be run.')
        return trust_remote_code
    # -- Assign compat functions to canonical names -----------------------
    get_imports = _compat_get_imports  # noqa: F811
    init_dynamic_modules = _compat_init_dynamic_modules  # noqa: F811
    files_to_hash = _compat_files_to_hash  # noqa: F811
    resolve_trust_remote_code = _compat_resolve_trust_remote_code  # noqa: F811
    _get_importable_file_path = _compat_get_importable_file_path  # noqa: F811
    _create_importable_file = _compat_create_importable_file  # noqa: F811
    _load_importable_file = _compat_load_importable_file  # noqa: F811
--- a/modelscope/msdatasets/utils/_module_factories.py
+++ b/modelscope/msdatasets/utils/_module_factories.py
@@ -0,0 +1,656 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 """Dataset module factory functions and data file resolution for ModelScope.
 This module provides ModelScope-specific implementations of dataset module
 loading (both script-based and script-free) and data file pattern resolution.
 These functions are monkey-patched onto the ``datasets`` library internals
 by :func:`~hf_datasets_util.load_dataset_with_ctx`.
 """
 import importlib
 import inspect
 import os
 from functools import partial
 from pathlib import Path
 from typing import Dict, List, Optional, Sequence, Tuple, Union
 from datasets import (BuilderConfig, DownloadConfig, DownloadMode, Features,
                      Version, config, data_files)
 from datasets.data_files import (
    FILES_TO_IGNORE, DataFilesDict, EmptyDatasetError,
    _get_data_files_patterns, _is_inside_unrequested_special_dir,
    _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir,
    sanitize_patterns)
 from datasets.download.streaming_download_manager import (
    _prepare_path_and_storage_options, xbasename, xjoin)
 from datasets.exceptions import DataFilesNotFoundError
 from datasets.info import DatasetInfosDict
 from datasets.load import (BuilderConfigsParameters, DatasetModule,
                           create_builder_configs_from_metadata_configs,
                           get_dataset_builder_class, import_main_class,
                           infer_module_for_data_files)
 from datasets.naming import camelcase_to_snakecase
 from datasets.packaged_modules import (_MODULE_TO_EXTENSIONS,
                                       _PACKAGED_DATASETS_MODULES)
 from datasets.utils.file_utils import (cached_path, is_local_path,
                                       relative_to_absolute_path)
 from datasets.utils.metadata import MetadataConfigs
 from datasets.utils.track import tracked_str
 from fsspec import filesystem
 from fsspec.core import _un_chain
 from fsspec.utils import stringify_path
 from huggingface_hub import DatasetCard, DatasetCardData
 from packaging import version
 from modelscope import HubApi
 from modelscope.msdatasets.utils._compat import (
    _HAS_SCRIPT_LOADING, _create_importable_file, _get_importable_file_path,
    _load_importable_file, files_to_hash, get_imports, init_dynamic_modules,
    resolve_trust_remote_code)
 from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
                                       REPO_TYPE_DATASET)
 from modelscope.utils.file_utils import is_relative_path
 from modelscope.utils.import_utils import has_attr_in_class
 from modelscope.utils.logger import get_logger
 # ALL_ALLOWED_EXTENSIONS moved to datasets.packaged_modules in datasets 4.0
 try:
    from datasets.packaged_modules import _ALL_ALLOWED_EXTENSIONS as ALL_ALLOWED_EXTENSIONS
 except ImportError:
    from datasets.load import ALL_ALLOWED_EXTENSIONS
 logger = get_logger()
 # ---------------------------------------------------------------------------
 # Shared HubApi instance (avoids creating a new requests.Session per call)
 # ---------------------------------------------------------------------------
 _hub_api: Optional[HubApi] = None
 def _get_hub_api() -> HubApi:
    global _hub_api
    if _hub_api is None:
        _hub_api = HubApi(timeout=3 * 60, max_retries=3)
    return _hub_api
 # ===================================================================
 # Data file resolution
 # ===================================================================
 def get_fs_token_paths(
    urlpath,
    storage_options=None,
    protocol=None,
 ):
    if isinstance(urlpath, (list, tuple, set)):
        if not urlpath:
            raise ValueError('empty urlpath sequence')
        urlpath0 = stringify_path(list(urlpath)[0])
    else:
        urlpath0 = stringify_path(urlpath)
    storage_options = storage_options or {}
    if protocol:
        storage_options['protocol'] = protocol
    chain = _un_chain(urlpath0, storage_options or {})
    inkwargs = {}
    for i, ch in enumerate(reversed(chain)):
        urls, nested_protocol, kw = ch
        if i == len(chain) - 1:
            inkwargs = dict(**kw, **inkwargs)
            continue
        inkwargs['target_options'] = dict(**kw, **inkwargs)
        inkwargs['target_protocol'] = nested_protocol
        inkwargs['fo'] = urls
    paths, protocol, _ = chain[0]
    fs = filesystem(protocol, **inkwargs)
    return fs
 def _resolve_pattern(
    pattern: str,
    base_path: str,
    allowed_extensions: Optional[List[str]] = None,
    download_config: Optional[DownloadConfig] = None,
 ) -> List[str]:
    """Resolve data file paths/URLs from a user-supplied pattern.
    Supports ``*``, ``**``, and fsspec-based remote patterns (e.g. ``hf://``).
    Hidden files/directories and ``__pycache__`` are excluded by default.
    """
    if is_relative_path(pattern):
        pattern = xjoin(base_path, pattern)
    elif is_local_path(pattern):
        base_path = os.path.splitdrive(pattern)[0] + os.sep
    else:
        base_path = ''
    pattern, storage_options = _prepare_path_and_storage_options(
        pattern, download_config=download_config)
    fs = get_fs_token_paths(pattern, storage_options=storage_options)
    fs_base_path = base_path.split('::')[0].split('://')[-1] or fs.root_marker
    fs_pattern = pattern.split('::')[0].split('://')[-1]
    files_to_ignore = set(FILES_TO_IGNORE) - {xbasename(pattern)}
    protocol = fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0]
    protocol_prefix = protocol + '://' if protocol != 'file' else ''
    glob_kwargs = {}
    if protocol == 'hf' and config.HF_HUB_VERSION >= version.parse('0.20.0'):
        glob_kwargs['expand_info'] = False
    try:
        tmp_file_paths = fs.glob(pattern, detail=True, **glob_kwargs)
    except FileNotFoundError:
        raise DataFilesNotFoundError(f"Unable to find '{pattern}'")
    matched_paths = [
        filepath if filepath.startswith(protocol_prefix) else protocol_prefix
        + filepath for filepath, info in tmp_file_paths.items()
        if info['type'] == 'file' and (
            xbasename(filepath) not in files_to_ignore)
        and not _is_inside_unrequested_special_dir(
            os.path.relpath(filepath, fs_base_path),
            os.path.relpath(fs_pattern, fs_base_path)) and  # noqa: W504
        not _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(  # noqa: W504
            os.path.relpath(filepath, fs_base_path),
            os.path.relpath(fs_pattern, fs_base_path))
    ]
    if allowed_extensions is not None:
        out = [
            filepath for filepath in matched_paths
            if any('.' + suffix in allowed_extensions
                   for suffix in xbasename(filepath).split('.')[1:])
        ]
        if len(out) < len(matched_paths):
            invalid_matched_files = list(set(matched_paths) - set(out))
            logger.info(
                f"Some files matched the pattern '{pattern}' but don't have valid data file extensions: "
                f'{invalid_matched_files}')
    else:
        out = matched_paths
    if not out:
        error_msg = f"Unable to find '{pattern}'"
        if allowed_extensions is not None:
            error_msg += f' with any supported extension {list(allowed_extensions)}'
        raise FileNotFoundError(error_msg)
    return out
 def _get_data_patterns(
        base_path: str,
        download_config: Optional[DownloadConfig] = None
 ) -> Dict[str, List[str]]:
    """Get data file patterns for a dataset directory.
    Tries ``SPLIT_PATTERN_SHARDED`` first, then falls back to
    ``ALL_DEFAULT_PATTERNS``.
    """
    resolver = partial(
        _resolve_pattern, base_path=base_path, download_config=download_config)
    try:
        return _get_data_files_patterns(resolver)
    except FileNotFoundError:
        raise EmptyDatasetError(
            f"The directory at {base_path} doesn't contain any data files"
        ) from None
 # ===================================================================
 # Repository file download helper
 # ===================================================================
 def _download_repo_file(
    repo_id: str,
    path_in_repo: str,
    download_config: DownloadConfig,
    revision: str,
 ) -> str:
    """Download a single file from a ModelScope dataset repository."""
    api = _get_hub_api()
    _namespace, _dataset_name = repo_id.split('/')
    endpoint = api.get_endpoint_for_read(
        repo_id=repo_id, repo_type=REPO_TYPE_DATASET)
    if download_config and download_config.download_desc is None:
        download_config.download_desc = f'Downloading [{path_in_repo}]'
    try:
        url_or_filename = api.get_dataset_file_url(
            file_name=path_in_repo,
            dataset_name=_dataset_name,
            namespace=_namespace,
            revision=revision,
            extension_filter=False,
            endpoint=endpoint,
        )
        repo_file_path = cached_path(
            url_or_filename=url_or_filename, download_config=download_config)
    except FileNotFoundError as e:
        repo_file_path = ''
        logger.error(e)
    return repo_file_path
 # ===================================================================
 # Additional modules download (for script-based datasets)
 # ===================================================================
 def _download_additional_modules(
    name: str,
    dataset_name: str,
    namespace: str,
    revision: str,
    imports: Tuple[str, str, str, str],
    download_config: Optional[DownloadConfig],
    trust_remote_code: Optional[bool] = False,
 ) -> List[Tuple[str, str]]:
    """Download additional modules referenced by a dataset builder script.
    Parses the import list produced by ``get_imports`` and downloads any
    internal (relative) or external modules. Library imports are validated
    but not downloaded.
    """
    local_imports: List[Tuple[str, str]] = []
    library_imports: List[Tuple[str, str]] = []
    has_remote_code = any(
        import_type in ('internal', 'external')
        for import_type, _, _, _ in imports)
    if has_remote_code and not trust_remote_code:
        raise ValueError(
            f'Loading {name} requires executing code from the repository. '
            'This is disabled by default for security reasons. '
            'If you trust the authors of this dataset, you can enable it with '
            '`trust_remote_code=True`.')
    api = _get_hub_api()
    download_config = download_config.copy()
    if download_config.download_desc is None:
        download_config.download_desc = 'Downloading extra modules'
    for import_type, import_name, import_path, sub_directory in imports:
        if import_type == 'library':
            library_imports.append((import_name, import_path))
            continue
        if import_name == name:
            raise ValueError(
                f'Error in the {name} script, importing relative {import_name} module '
                f'but {import_name} is the name of the script. '
                f"Please change relative import {import_name} to another name and add a '# From: URL_OR_PATH' "
                f'comment pointing to the original relative import file path.')
        if import_type == 'internal':
            file_name = import_path + '.py'
            url_or_filename = api.get_dataset_file_url(
                file_name=file_name,
                dataset_name=dataset_name,
                namespace=namespace,
                revision=revision,
            )
        elif import_type == 'external':
            url_or_filename = import_path
        else:
            raise ValueError('Wrong import_type')
        local_import_path = cached_path(
            url_or_filename, download_config=download_config)
        if sub_directory is not None:
            local_import_path = os.path.join(local_import_path, sub_directory)
        local_imports.append((import_name, local_import_path))
    # Validate library imports
    needs_to_be_installed = {}
    for library_import_name, library_import_path in library_imports:
        try:
            importlib.import_module(library_import_name)
        except ImportError:
            if library_import_name not in needs_to_be_installed or library_import_path != library_import_name:
                needs_to_be_installed[
                    library_import_name] = library_import_path
    if needs_to_be_installed:
        _dependencies_str = 'dependencies' if len(
            needs_to_be_installed) > 1 else 'dependency'
        _them_str = 'them' if len(needs_to_be_installed) > 1 else 'it'
        if 'sklearn' in needs_to_be_installed:
            needs_to_be_installed['sklearn'] = 'scikit-learn'
        if 'Bio' in needs_to_be_installed:
            needs_to_be_installed['Bio'] = 'biopython'
        raise ImportError(
            f'To be able to use {name}, you need to install the following {_dependencies_str}: '
            f"{', '.join(needs_to_be_installed)}.\nPlease install {_them_str} using 'pip install "
            f"{' '.join(needs_to_be_installed.values())}' for instance.")
    return local_imports
 # ===================================================================
 # Module factory: script-based (Hub)
 # ===================================================================
 def _load_script_module(
    repo_id: str,
    revision: str,
    download_config: DownloadConfig,
    download_mode=None,
    dynamic_modules_path: Optional[str] = None,
    trust_remote_code: Optional[bool] = None,
 ) -> DatasetModule:
    """Shared implementation for loading a dataset module from a Hub .py script.
    Used by both ``get_module_with_script`` (monkey-patch for datasets<4.0) and
    ``_compat_hub_script_module`` (compat shim for datasets>=4.0).
    """
    _namespace, _dataset_name = repo_id.split('/')
    script_file_name = f'{_dataset_name}.py'
    local_script_path = _download_repo_file(
        repo_id=repo_id,
        path_in_repo=script_file_name,
        download_config=download_config,
        revision=revision,
    )
    if not local_script_path:
        raise FileNotFoundError(
            f'Cannot find {script_file_name} in {repo_id} at revision {revision}.'
        )
    dataset_readme_path = _download_repo_file(
        repo_id=repo_id,
        path_in_repo='README.md',
        download_config=download_config,
        revision=revision,
    )
    imports = get_imports(local_script_path)
    local_imports = _download_additional_modules(
        name=repo_id,
        dataset_name=_dataset_name,
        namespace=_namespace,
        revision=revision,
        imports=imports,
        download_config=download_config,
        trust_remote_code=trust_remote_code,
    )
    additional_files = []
    if dataset_readme_path:
        additional_files.append(
            (config.REPOCARD_FILENAME, dataset_readme_path))
    dynamic_modules_path = dynamic_modules_path or init_dynamic_modules()
    hash_val = files_to_hash([local_script_path]
                             + [loc[1] for loc in local_imports])
    importable_file_path = _get_importable_file_path(
        dynamic_modules_path=dynamic_modules_path,
        module_namespace='datasets',
        subdirectory_name=hash_val,
        name=repo_id,
    )
    if not os.path.exists(importable_file_path):
        trust = resolve_trust_remote_code(
            trust_remote_code=trust_remote_code, repo_id=repo_id)
        if trust:
            logger.warning(
                f'Use trust_remote_code=True. Will invoke codes from {repo_id}. '
                'Please make sure that you can trust the external codes.')
            _create_importable_file(
                local_path=local_script_path,
                local_imports=local_imports,
                additional_files=additional_files,
                dynamic_modules_path=dynamic_modules_path,
                module_namespace='datasets',
                subdirectory_name=hash_val,
                name=repo_id,
                download_mode=download_mode,
            )
        else:
            raise ValueError(
                f'Loading {repo_id} requires executing the dataset script in that'
                ' repo on your local machine. Make sure you have read the code there to avoid malicious use, then'
                ' set the option `trust_remote_code=True` to remove this error.'
            )
    module_path, hash_val = _load_importable_file(
        dynamic_modules_path=dynamic_modules_path,
        module_namespace='datasets',
        subdirectory_name=hash_val,
        name=repo_id,
    )
    importlib.invalidate_caches()
    api = _get_hub_api()
    builder_kwargs = {
        'base_path': api.get_file_base_path(repo_id=repo_id),
        'repo_id': repo_id,
    }
    return DatasetModule(module_path, hash_val, builder_kwargs)
 def get_module_with_script(self) -> DatasetModule:
    """Monkey-patch target for ``HubDatasetModuleFactoryWithScript.get_module`` (datasets<4.0)."""
    repo_id: str = self.name
    revision = self.download_config.storage_options.get(
        'revision', None) or DEFAULT_DATASET_REVISION
    return _load_script_module(
        repo_id=repo_id,
        revision=revision,
        download_config=self.download_config,
        download_mode=self.download_mode,
        dynamic_modules_path=self.dynamic_modules_path
        if self.dynamic_modules_path else None,
        trust_remote_code=self.trust_remote_code,
    )
 def _compat_hub_script_module(
    path,
    revision=None,
    download_config=None,
    download_mode=None,
    dynamic_modules_path=None,
    trust_remote_code=None,
 ) -> DatasetModule:
    """Load a dataset module from a Hub repo .py script (compat for datasets>=4.0)."""
    return _load_script_module(
        repo_id=path,
        revision=revision or DEFAULT_DATASET_REVISION,
        download_config=download_config or DownloadConfig(),
        download_mode=download_mode,
        dynamic_modules_path=dynamic_modules_path,
        trust_remote_code=trust_remote_code,
    )
 # ===================================================================
 # Module factory: script-based (local)
 # ===================================================================
 def _compat_local_script_module(
    path,
    download_mode=None,
    dynamic_modules_path=None,
    trust_remote_code=None,
 ) -> DatasetModule:
    """Load a dataset module from a local .py script (compat for datasets>=4.0)."""
    local_path = path
    name = Path(path).stem
    local_imports: List[Tuple[str, str]] = []
    imports = get_imports(local_path)
    for import_type, import_name, import_path, sub_directory in imports:
        if import_type == 'library':
            continue
        if import_type == 'internal':
            rel_path = os.path.join(
                os.path.dirname(local_path), import_path + '.py')
            if os.path.isfile(rel_path):
                local_imports.append((import_name, rel_path))
            elif os.path.isdir(
                    os.path.join(os.path.dirname(local_path), import_path)):
                local_imports.append(
                    (import_name,
                     os.path.join(os.path.dirname(local_path), import_path)))
        elif import_type == 'external':
            dl_config = DownloadConfig()
            dl_config.download_desc = 'Downloading extra modules'
            local_import_path = cached_path(
                import_path, download_config=dl_config)
            if sub_directory is not None:
                local_import_path = os.path.join(local_import_path,
                                                 sub_directory)
            local_imports.append((import_name, local_import_path))
    dynamic_modules_path = dynamic_modules_path or init_dynamic_modules()
    hash_val = files_to_hash([local_path] + [loc[1] for loc in local_imports])
    importable_file_path = _get_importable_file_path(
        dynamic_modules_path=dynamic_modules_path,
        module_namespace='datasets',
        subdirectory_name=hash_val,
        name=name,
    )
    if not os.path.exists(importable_file_path):
        trust = resolve_trust_remote_code(trust_remote_code, name)
        if trust:
            _create_importable_file(
                local_path=local_path,
                local_imports=local_imports,
                additional_files=[],
                dynamic_modules_path=dynamic_modules_path,
                module_namespace='datasets',
                subdirectory_name=hash_val,
                name=name,
                download_mode=download_mode,
            )
        else:
            raise ValueError(
                f'Loading {name} requires executing the dataset script. '
                'Set `trust_remote_code=True` to allow this.')
    module_path, hash_val = _load_importable_file(
        dynamic_modules_path=dynamic_modules_path,
        module_namespace='datasets',
        subdirectory_name=hash_val,
        name=name,
    )
    importlib.invalidate_caches()
    builder_kwargs = {
        'base_path': str(Path(path).resolve().parent),
    }
    return DatasetModule(module_path, hash_val, builder_kwargs)
 # ===================================================================
 # Module factory: without script (Hub)
 # ===================================================================
 def get_module_without_script(self) -> DatasetModule:
    """Monkey-patch target for ``HubDatasetModuleFactoryWithoutScript.get_module``."""
    revision = self.download_config.storage_options.get(
        'revision', None) or DEFAULT_DATASET_REVISION
    base_path = f"hf://datasets/{self.name}@{revision}/{self.data_dir or ''}".rstrip(
        '/')
    repo_id: str = self.name
    download_config = self.download_config.copy()
    dataset_readme_path = _download_repo_file(
        repo_id=repo_id,
        path_in_repo='README.md',
        download_config=download_config,
        revision=revision,
    )
    dataset_card_data = DatasetCard.load(
        Path(dataset_readme_path
             )).data if dataset_readme_path else DatasetCardData()
    subset_name: str = download_config.storage_options.get('name', None)
    metadata_configs = MetadataConfigs.from_dataset_card_data(
        dataset_card_data)
    dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
    if self.data_files is not None:
        patterns = sanitize_patterns(self.data_files)
    elif metadata_configs and 'data_files' in next(
            iter(metadata_configs.values())):
        if subset_name is not None:
            subset_data_files = metadata_configs[subset_name]['data_files']
        else:
            subset_data_files = next(iter(
                metadata_configs.values()))['data_files']
        patterns = sanitize_patterns(subset_data_files)
    else:
        patterns = _get_data_patterns(
            base_path, download_config=self.download_config)
    data_files_dict = DataFilesDict.from_patterns(
        patterns,
        base_path=base_path,
        allowed_extensions=ALL_ALLOWED_EXTENSIONS,
        download_config=self.download_config,
    )
    module_name, default_builder_kwargs = infer_module_for_data_files(
        data_files=data_files_dict,
        path=self.name,
        download_config=self.download_config,
    )
    if hasattr(data_files_dict, 'filter'):
        data_files_dict = data_files_dict.filter(
            extensions=_MODULE_TO_EXTENSIONS[module_name])
    else:
        data_files_dict = data_files_dict.filter_extensions(
            _MODULE_TO_EXTENSIONS[module_name])
    module_path, _ = _PACKAGED_DATASETS_MODULES[module_name]
    if metadata_configs:
        supports_metadata = module_name in {'imagefolder', 'audiofolder'}
        create_builder_signature = inspect.signature(
            create_builder_configs_from_metadata_configs)
        in_args = {
            'module_path': module_path,
            'metadata_configs': metadata_configs,
            'base_path': base_path,
            'default_builder_kwargs': default_builder_kwargs,
            'download_config': self.download_config,
        }
        if 'supports_metadata' in create_builder_signature.parameters:
            in_args['supports_metadata'] = supports_metadata
        builder_configs, default_config_name = create_builder_configs_from_metadata_configs(
            **in_args)
    else:
        builder_configs: List[BuilderConfig] = [
            import_main_class(module_path).BUILDER_CONFIG_CLASS(
                data_files=data_files_dict,
                **default_builder_kwargs,
            )
        ]
        default_config_name = None
    api = _get_hub_api()
    endpoint = api.get_endpoint_for_read(
        repo_id=repo_id, repo_type=REPO_TYPE_DATASET)
    builder_kwargs = {
        'base_path':
        api.get_file_base_path(repo_id=repo_id, endpoint=endpoint),
        'repo_id': self.name,
        'dataset_name': camelcase_to_snakecase(Path(self.name).name),
        'data_files': data_files_dict,
    }
    download_config = self.download_config.copy()
    if download_config.download_desc is None:
        download_config.download_desc = 'Downloading metadata'
    if default_config_name is None and len(dataset_infos) == 1:
        default_config_name = next(iter(dataset_infos))
    return DatasetModule(
        module_path,
        revision,
        builder_kwargs,
        dataset_infos=dataset_infos,
        builder_configs_parameters=BuilderConfigsParameters(
            metadata_configs=metadata_configs,
            builder_configs=builder_configs,
            default_config_name=default_config_name,
        ),
    )
--- a/modelscope/msdatasets/utils/hf_datasets_util.py
+++ b/modelscope/msdatasets/utils/hf_datasets_util.py
--- a/requirements/datasets.txt
+++ b/requirements/datasets.txt
@@ -1,6 +1,6 @@
 addict
 attrs
-datasets>=4.0.0,<=4.6.1
+datasets>=4.0.0,<=4.8.4
 einops
 oss2
 Pillow
--- a/requirements/framework.txt
+++ b/requirements/framework.txt
@@ -1,6 +1,6 @@
 addict
 attrs
-datasets>=4.0.0,<=4.6.1
+datasets>=4.0.0,<=4.8.4
 einops
 Pillow
 python-dateutil>=2.1