mirror of
https://github.com/modelscope/modelscope.git
synced 2026-05-18 05:05:00 +02:00
[Fix | Refactor] Fix preview loading and refactor hf datasets utils (#1654)
This commit is contained in:
@@ -20,7 +20,8 @@ repos:
|
||||
(?x)^(
|
||||
examples/|
|
||||
modelscope/utils/ast_index_file.py|
|
||||
modelscope/fileio/format/jsonplus.py
|
||||
modelscope/fileio/format/jsonplus.py|
|
||||
modelscope/msdatasets/utils/_module_factories\.py
|
||||
)$
|
||||
- repo: https://github.com/pre-commit/mirrors-yapf.git
|
||||
rev: v0.30.0
|
||||
@@ -31,7 +32,8 @@ repos:
|
||||
thirdparty/|
|
||||
examples/|
|
||||
modelscope/utils/ast_index_file.py|
|
||||
modelscope/fileio/format/jsonplus.py
|
||||
modelscope/fileio/format/jsonplus.py|
|
||||
modelscope/msdatasets/utils/_module_factories\.py
|
||||
)$
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks.git
|
||||
rev: v3.1.0
|
||||
|
||||
288
modelscope/msdatasets/utils/_compat.py
Normal file
288
modelscope/msdatasets/utils/_compat.py
Normal file
@@ -0,0 +1,288 @@
|
||||
# isort: skip_file
|
||||
# yapf: disable
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
"""Compatibility shims for datasets>=4.0 script-based dataset loading.
|
||||
|
||||
Script-based dataset loading was removed in datasets 4.0. This module
|
||||
provides minimal re-implementations of the necessary helpers so that
|
||||
ModelScope can still load datasets that ship a custom builder .py script.
|
||||
|
||||
When running with datasets<4.0 the real implementations are simply
|
||||
re-exported from datasets.load / datasets.utils.py_utils.
|
||||
"""
|
||||
import importlib
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from datasets import DownloadMode, config
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Try importing script-loading APIs from datasets<4.0
|
||||
# ---------------------------------------------------------------------------
|
||||
try:
|
||||
from datasets.load import (
|
||||
HubDatasetModuleFactoryWithScript,
|
||||
LocalDatasetModuleFactoryWithScript,
|
||||
resolve_trust_remote_code,
|
||||
_get_importable_file_path,
|
||||
_create_importable_file,
|
||||
_load_importable_file,
|
||||
init_dynamic_modules,
|
||||
files_to_hash,
|
||||
)
|
||||
from datasets.utils.py_utils import get_imports
|
||||
|
||||
_HAS_SCRIPT_LOADING = True
|
||||
except ImportError:
|
||||
_HAS_SCRIPT_LOADING = False
|
||||
HubDatasetModuleFactoryWithScript = None # type: ignore[assignment,misc]
|
||||
LocalDatasetModuleFactoryWithScript = None # type: ignore[assignment,misc]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Compat implementations (only defined when datasets>=4.0)
|
||||
# ---------------------------------------------------------------------------
|
||||
if not _HAS_SCRIPT_LOADING:
|
||||
import filecmp
|
||||
import hashlib # noqa: F811 – only imported in this branch
|
||||
import json as _json
|
||||
import re
|
||||
import shutil
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from datasets.packaged_modules import _hash_python_lines
|
||||
from datasets.utils.file_utils import url_or_path_join
|
||||
from datasets.utils.hub import hf_dataset_url # noqa: F401
|
||||
from filelock import FileLock
|
||||
|
||||
def _compat_get_imports(
|
||||
file_path: str) -> List[Tuple[str, str, str, Optional[str]]]:
|
||||
"""Parse a dataset script for import statements (ported from datasets<4.0)."""
|
||||
with open(file_path, encoding='utf-8') as f:
|
||||
lines = f.readlines()
|
||||
|
||||
imports: List[Tuple[str, str, str, Optional[str]]] = []
|
||||
is_in_docstring = False
|
||||
for line in lines:
|
||||
docstr_start_match = re.findall(r'[\s\S]*?"""[\s\S]*?', line)
|
||||
if len(docstr_start_match) == 1:
|
||||
is_in_docstring = not is_in_docstring
|
||||
if is_in_docstring:
|
||||
continue
|
||||
match = re.match(
|
||||
r'^import\s+(\.?)([^\s\.]+)[^#\r\n]*(?:#\s+From:\s+)?([^\r\n]*)',
|
||||
line,
|
||||
flags=re.MULTILINE)
|
||||
if match is None:
|
||||
match = re.match(
|
||||
r'^from\s+(\.?)([^\s\.]+)(?:[^\s]*)\s+import\s+[^#\r\n]*(?:#\s+From:\s+)?([^\r\n]*)',
|
||||
line,
|
||||
flags=re.MULTILINE)
|
||||
if match is None:
|
||||
continue
|
||||
if match.group(1):
|
||||
if any(imp[1] == match.group(2) for imp in imports):
|
||||
continue
|
||||
if match.group(3):
|
||||
url_path = match.group(3)
|
||||
url_path, sub_directory = _compat_convert_github_url(
|
||||
url_path)
|
||||
imports.append(
|
||||
('external', match.group(2), url_path, sub_directory))
|
||||
elif match.group(2):
|
||||
imports.append(
|
||||
('internal', match.group(2), match.group(2), None))
|
||||
else:
|
||||
if match.group(3):
|
||||
imports.append(
|
||||
('library', match.group(2), match.group(3), None))
|
||||
else:
|
||||
imports.append(
|
||||
('library', match.group(2), match.group(2), None))
|
||||
return imports
|
||||
|
||||
def _compat_convert_github_url(url_path: str) -> Tuple[str, Optional[str]]:
|
||||
parsed = urlparse(url_path)
|
||||
sub_directory = None
|
||||
if parsed.scheme in ('http', 'https',
|
||||
's3') and parsed.netloc == 'github.com':
|
||||
if 'blob' in url_path:
|
||||
if not url_path.endswith('.py'):
|
||||
raise ValueError(
|
||||
f'External import from github at {url_path} should point to a .py file'
|
||||
)
|
||||
url_path = url_path.replace('blob', 'raw')
|
||||
else:
|
||||
github_path = parsed.path[1:]
|
||||
repo_info, branch = (
|
||||
github_path.split('/tree/') if '/tree/' in github_path else
|
||||
(github_path, 'master'))
|
||||
repo_owner, repo_name = repo_info.split('/')
|
||||
url_path = f'https://github.com/{repo_owner}/{repo_name}/archive/{branch}.zip'
|
||||
sub_directory = f'{repo_name}-{branch}'
|
||||
return url_path, sub_directory
|
||||
|
||||
# -- dynamic module management ----------------------------------------
|
||||
|
||||
def _compat_init_dynamic_modules(
|
||||
name: str = config.MODULE_NAME_FOR_DYNAMIC_MODULES,
|
||||
hf_modules_cache=None,
|
||||
) -> str:
|
||||
hf_modules_cache = str(hf_modules_cache or config.HF_MODULES_CACHE)
|
||||
if hf_modules_cache not in sys.path:
|
||||
sys.path.append(hf_modules_cache)
|
||||
os.makedirs(hf_modules_cache, exist_ok=True)
|
||||
init_path = os.path.join(hf_modules_cache, '__init__.py')
|
||||
if not os.path.exists(init_path):
|
||||
with open(init_path, 'w'):
|
||||
pass
|
||||
importlib.invalidate_caches()
|
||||
dynamic_modules_path = os.path.join(hf_modules_cache, name)
|
||||
os.makedirs(dynamic_modules_path, exist_ok=True)
|
||||
init_path2 = os.path.join(dynamic_modules_path, '__init__.py')
|
||||
if not os.path.exists(init_path2):
|
||||
with open(init_path2, 'w'):
|
||||
pass
|
||||
return dynamic_modules_path
|
||||
|
||||
def _compat_files_to_hash(file_paths) -> str:
|
||||
to_use_files: list = []
|
||||
for fp in file_paths:
|
||||
if os.path.isdir(fp):
|
||||
to_use_files.extend(list(Path(fp).rglob('*.[pP][yY]')))
|
||||
else:
|
||||
to_use_files.append(fp)
|
||||
lines: list = []
|
||||
for fp in to_use_files:
|
||||
with open(fp, encoding='utf-8') as f:
|
||||
lines.extend(f.readlines())
|
||||
return _hash_python_lines(lines)
|
||||
|
||||
# -- importable file management ---------------------------------------
|
||||
|
||||
def _compat_get_importable_file_path(
|
||||
dynamic_modules_path: str,
|
||||
module_namespace: str,
|
||||
subdirectory_name: str,
|
||||
name: str,
|
||||
) -> str:
|
||||
importable_dir = os.path.join(dynamic_modules_path, module_namespace,
|
||||
name.replace('/', '--'))
|
||||
return os.path.join(importable_dir, subdirectory_name,
|
||||
name.split('/')[-1] + '.py')
|
||||
|
||||
def _compat_copy_script_and_resources(
|
||||
name: str,
|
||||
importable_directory_path: str,
|
||||
subdirectory_name: str,
|
||||
original_local_path: str,
|
||||
local_imports: List[Tuple[str, str]],
|
||||
additional_files: List[Tuple[str, str]],
|
||||
download_mode,
|
||||
) -> str:
|
||||
importable_subdirectory = os.path.join(importable_directory_path,
|
||||
subdirectory_name)
|
||||
importable_file = os.path.join(importable_subdirectory, name + '.py')
|
||||
lock_path = importable_directory_path + '.lock'
|
||||
with FileLock(lock_path):
|
||||
if download_mode == DownloadMode.FORCE_REDOWNLOAD and os.path.exists(
|
||||
importable_directory_path):
|
||||
shutil.rmtree(importable_directory_path)
|
||||
os.makedirs(importable_directory_path, exist_ok=True)
|
||||
init_fp = os.path.join(importable_directory_path, '__init__.py')
|
||||
if not os.path.exists(init_fp):
|
||||
with open(init_fp, 'w'):
|
||||
pass
|
||||
os.makedirs(importable_subdirectory, exist_ok=True)
|
||||
init_fp2 = os.path.join(importable_subdirectory, '__init__.py')
|
||||
if not os.path.exists(init_fp2):
|
||||
with open(init_fp2, 'w'):
|
||||
pass
|
||||
if not os.path.exists(importable_file):
|
||||
shutil.copyfile(original_local_path, importable_file)
|
||||
meta_path = os.path.splitext(importable_file)[0] + '.json'
|
||||
if not os.path.exists(meta_path):
|
||||
meta = {
|
||||
'original file path': original_local_path,
|
||||
'local file path': importable_file
|
||||
}
|
||||
with open(meta_path, 'w', encoding='utf-8') as mf:
|
||||
_json.dump(meta, mf)
|
||||
for imp_name, imp_path in local_imports:
|
||||
if os.path.isfile(imp_path):
|
||||
dest = os.path.join(importable_subdirectory,
|
||||
imp_name + '.py')
|
||||
if not os.path.exists(dest):
|
||||
shutil.copyfile(imp_path, dest)
|
||||
elif os.path.isdir(imp_path):
|
||||
dest = os.path.join(importable_subdirectory, imp_name)
|
||||
if not os.path.exists(dest):
|
||||
shutil.copytree(imp_path, dest)
|
||||
else:
|
||||
raise ImportError(f'Error with local import at {imp_path}')
|
||||
for file_name, original_path in additional_files:
|
||||
dest_path = os.path.join(importable_subdirectory, file_name)
|
||||
if not os.path.exists(dest_path) or not filecmp.cmp(
|
||||
original_path, dest_path):
|
||||
shutil.copyfile(original_path, dest_path)
|
||||
return importable_file
|
||||
|
||||
def _compat_create_importable_file(
|
||||
local_path: str,
|
||||
local_imports: List[Tuple[str, str]],
|
||||
additional_files: List[Tuple[str, str]],
|
||||
dynamic_modules_path: str,
|
||||
module_namespace: str,
|
||||
subdirectory_name: str,
|
||||
name: str,
|
||||
download_mode,
|
||||
) -> None:
|
||||
importable_dir = os.path.join(dynamic_modules_path, module_namespace,
|
||||
name.replace('/', '--'))
|
||||
Path(importable_dir).mkdir(parents=True, exist_ok=True)
|
||||
(Path(importable_dir).parent / '__init__.py').touch(exist_ok=True)
|
||||
_compat_copy_script_and_resources(
|
||||
name=name.split('/')[-1],
|
||||
importable_directory_path=importable_dir,
|
||||
subdirectory_name=subdirectory_name,
|
||||
original_local_path=local_path,
|
||||
local_imports=local_imports,
|
||||
additional_files=additional_files,
|
||||
download_mode=download_mode,
|
||||
)
|
||||
|
||||
def _compat_load_importable_file(
|
||||
dynamic_modules_path: str,
|
||||
module_namespace: str,
|
||||
subdirectory_name: str,
|
||||
name: str,
|
||||
) -> Tuple[str, str]:
|
||||
module_path = '.'.join([
|
||||
os.path.basename(dynamic_modules_path),
|
||||
module_namespace,
|
||||
name.replace('/', '--'),
|
||||
subdirectory_name,
|
||||
name.split('/')[-1],
|
||||
])
|
||||
return module_path, subdirectory_name
|
||||
|
||||
# -- trust handling ---------------------------------------------------
|
||||
|
||||
def _compat_resolve_trust_remote_code(trust_remote_code, repo_id: str):
|
||||
if trust_remote_code is None:
|
||||
raise ValueError(
|
||||
f'The repository for {repo_id} contains custom code which must be '
|
||||
f'executed to correctly load the dataset. You can inspect the repository '
|
||||
f'content at the Hub.\nPlease pass the argument `trust_remote_code=True` '
|
||||
f'to allow custom code to be run.')
|
||||
return trust_remote_code
|
||||
|
||||
# -- Assign compat functions to canonical names -----------------------
|
||||
get_imports = _compat_get_imports # noqa: F811
|
||||
init_dynamic_modules = _compat_init_dynamic_modules # noqa: F811
|
||||
files_to_hash = _compat_files_to_hash # noqa: F811
|
||||
resolve_trust_remote_code = _compat_resolve_trust_remote_code # noqa: F811
|
||||
_get_importable_file_path = _compat_get_importable_file_path # noqa: F811
|
||||
_create_importable_file = _compat_create_importable_file # noqa: F811
|
||||
_load_importable_file = _compat_load_importable_file # noqa: F811
|
||||
656
modelscope/msdatasets/utils/_module_factories.py
Normal file
656
modelscope/msdatasets/utils/_module_factories.py
Normal file
@@ -0,0 +1,656 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
"""Dataset module factory functions and data file resolution for ModelScope.
|
||||
|
||||
This module provides ModelScope-specific implementations of dataset module
|
||||
loading (both script-based and script-free) and data file pattern resolution.
|
||||
These functions are monkey-patched onto the ``datasets`` library internals
|
||||
by :func:`~hf_datasets_util.load_dataset_with_ctx`.
|
||||
"""
|
||||
import importlib
|
||||
import inspect
|
||||
import os
|
||||
from functools import partial
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Sequence, Tuple, Union
|
||||
|
||||
from datasets import (BuilderConfig, DownloadConfig, DownloadMode, Features,
|
||||
Version, config, data_files)
|
||||
from datasets.data_files import (
|
||||
FILES_TO_IGNORE, DataFilesDict, EmptyDatasetError,
|
||||
_get_data_files_patterns, _is_inside_unrequested_special_dir,
|
||||
_is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir,
|
||||
sanitize_patterns)
|
||||
from datasets.download.streaming_download_manager import (
|
||||
_prepare_path_and_storage_options, xbasename, xjoin)
|
||||
from datasets.exceptions import DataFilesNotFoundError
|
||||
from datasets.info import DatasetInfosDict
|
||||
from datasets.load import (BuilderConfigsParameters, DatasetModule,
|
||||
create_builder_configs_from_metadata_configs,
|
||||
get_dataset_builder_class, import_main_class,
|
||||
infer_module_for_data_files)
|
||||
from datasets.naming import camelcase_to_snakecase
|
||||
from datasets.packaged_modules import (_MODULE_TO_EXTENSIONS,
|
||||
_PACKAGED_DATASETS_MODULES)
|
||||
from datasets.utils.file_utils import (cached_path, is_local_path,
|
||||
relative_to_absolute_path)
|
||||
from datasets.utils.metadata import MetadataConfigs
|
||||
from datasets.utils.track import tracked_str
|
||||
from fsspec import filesystem
|
||||
from fsspec.core import _un_chain
|
||||
from fsspec.utils import stringify_path
|
||||
from huggingface_hub import DatasetCard, DatasetCardData
|
||||
from packaging import version
|
||||
|
||||
from modelscope import HubApi
|
||||
from modelscope.msdatasets.utils._compat import (
|
||||
_HAS_SCRIPT_LOADING, _create_importable_file, _get_importable_file_path,
|
||||
_load_importable_file, files_to_hash, get_imports, init_dynamic_modules,
|
||||
resolve_trust_remote_code)
|
||||
from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
|
||||
REPO_TYPE_DATASET)
|
||||
from modelscope.utils.file_utils import is_relative_path
|
||||
from modelscope.utils.import_utils import has_attr_in_class
|
||||
from modelscope.utils.logger import get_logger
|
||||
|
||||
# ALL_ALLOWED_EXTENSIONS moved to datasets.packaged_modules in datasets 4.0
|
||||
try:
|
||||
from datasets.packaged_modules import _ALL_ALLOWED_EXTENSIONS as ALL_ALLOWED_EXTENSIONS
|
||||
except ImportError:
|
||||
from datasets.load import ALL_ALLOWED_EXTENSIONS
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Shared HubApi instance (avoids creating a new requests.Session per call)
|
||||
# ---------------------------------------------------------------------------
|
||||
_hub_api: Optional[HubApi] = None
|
||||
|
||||
|
||||
def _get_hub_api() -> HubApi:
|
||||
global _hub_api
|
||||
if _hub_api is None:
|
||||
_hub_api = HubApi(timeout=3 * 60, max_retries=3)
|
||||
return _hub_api
|
||||
|
||||
|
||||
# ===================================================================
|
||||
# Data file resolution
|
||||
# ===================================================================
|
||||
|
||||
|
||||
def get_fs_token_paths(
|
||||
urlpath,
|
||||
storage_options=None,
|
||||
protocol=None,
|
||||
):
|
||||
if isinstance(urlpath, (list, tuple, set)):
|
||||
if not urlpath:
|
||||
raise ValueError('empty urlpath sequence')
|
||||
urlpath0 = stringify_path(list(urlpath)[0])
|
||||
else:
|
||||
urlpath0 = stringify_path(urlpath)
|
||||
storage_options = storage_options or {}
|
||||
if protocol:
|
||||
storage_options['protocol'] = protocol
|
||||
chain = _un_chain(urlpath0, storage_options or {})
|
||||
inkwargs = {}
|
||||
for i, ch in enumerate(reversed(chain)):
|
||||
urls, nested_protocol, kw = ch
|
||||
if i == len(chain) - 1:
|
||||
inkwargs = dict(**kw, **inkwargs)
|
||||
continue
|
||||
inkwargs['target_options'] = dict(**kw, **inkwargs)
|
||||
inkwargs['target_protocol'] = nested_protocol
|
||||
inkwargs['fo'] = urls
|
||||
paths, protocol, _ = chain[0]
|
||||
fs = filesystem(protocol, **inkwargs)
|
||||
return fs
|
||||
|
||||
|
||||
def _resolve_pattern(
|
||||
pattern: str,
|
||||
base_path: str,
|
||||
allowed_extensions: Optional[List[str]] = None,
|
||||
download_config: Optional[DownloadConfig] = None,
|
||||
) -> List[str]:
|
||||
"""Resolve data file paths/URLs from a user-supplied pattern.
|
||||
|
||||
Supports ``*``, ``**``, and fsspec-based remote patterns (e.g. ``hf://``).
|
||||
Hidden files/directories and ``__pycache__`` are excluded by default.
|
||||
"""
|
||||
if is_relative_path(pattern):
|
||||
pattern = xjoin(base_path, pattern)
|
||||
elif is_local_path(pattern):
|
||||
base_path = os.path.splitdrive(pattern)[0] + os.sep
|
||||
else:
|
||||
base_path = ''
|
||||
pattern, storage_options = _prepare_path_and_storage_options(
|
||||
pattern, download_config=download_config)
|
||||
fs = get_fs_token_paths(pattern, storage_options=storage_options)
|
||||
fs_base_path = base_path.split('::')[0].split('://')[-1] or fs.root_marker
|
||||
fs_pattern = pattern.split('::')[0].split('://')[-1]
|
||||
files_to_ignore = set(FILES_TO_IGNORE) - {xbasename(pattern)}
|
||||
protocol = fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0]
|
||||
protocol_prefix = protocol + '://' if protocol != 'file' else ''
|
||||
glob_kwargs = {}
|
||||
if protocol == 'hf' and config.HF_HUB_VERSION >= version.parse('0.20.0'):
|
||||
glob_kwargs['expand_info'] = False
|
||||
|
||||
try:
|
||||
tmp_file_paths = fs.glob(pattern, detail=True, **glob_kwargs)
|
||||
except FileNotFoundError:
|
||||
raise DataFilesNotFoundError(f"Unable to find '{pattern}'")
|
||||
|
||||
matched_paths = [
|
||||
filepath if filepath.startswith(protocol_prefix) else protocol_prefix
|
||||
+ filepath for filepath, info in tmp_file_paths.items()
|
||||
if info['type'] == 'file' and (
|
||||
xbasename(filepath) not in files_to_ignore)
|
||||
and not _is_inside_unrequested_special_dir(
|
||||
os.path.relpath(filepath, fs_base_path),
|
||||
os.path.relpath(fs_pattern, fs_base_path)) and # noqa: W504
|
||||
not _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir( # noqa: W504
|
||||
os.path.relpath(filepath, fs_base_path),
|
||||
os.path.relpath(fs_pattern, fs_base_path))
|
||||
]
|
||||
if allowed_extensions is not None:
|
||||
out = [
|
||||
filepath for filepath in matched_paths
|
||||
if any('.' + suffix in allowed_extensions
|
||||
for suffix in xbasename(filepath).split('.')[1:])
|
||||
]
|
||||
if len(out) < len(matched_paths):
|
||||
invalid_matched_files = list(set(matched_paths) - set(out))
|
||||
logger.info(
|
||||
f"Some files matched the pattern '{pattern}' but don't have valid data file extensions: "
|
||||
f'{invalid_matched_files}')
|
||||
else:
|
||||
out = matched_paths
|
||||
if not out:
|
||||
error_msg = f"Unable to find '{pattern}'"
|
||||
if allowed_extensions is not None:
|
||||
error_msg += f' with any supported extension {list(allowed_extensions)}'
|
||||
raise FileNotFoundError(error_msg)
|
||||
return out
|
||||
|
||||
|
||||
def _get_data_patterns(
|
||||
base_path: str,
|
||||
download_config: Optional[DownloadConfig] = None
|
||||
) -> Dict[str, List[str]]:
|
||||
"""Get data file patterns for a dataset directory.
|
||||
|
||||
Tries ``SPLIT_PATTERN_SHARDED`` first, then falls back to
|
||||
``ALL_DEFAULT_PATTERNS``.
|
||||
"""
|
||||
resolver = partial(
|
||||
_resolve_pattern, base_path=base_path, download_config=download_config)
|
||||
try:
|
||||
return _get_data_files_patterns(resolver)
|
||||
except FileNotFoundError:
|
||||
raise EmptyDatasetError(
|
||||
f"The directory at {base_path} doesn't contain any data files"
|
||||
) from None
|
||||
|
||||
|
||||
# ===================================================================
|
||||
# Repository file download helper
|
||||
# ===================================================================
|
||||
|
||||
|
||||
def _download_repo_file(
|
||||
repo_id: str,
|
||||
path_in_repo: str,
|
||||
download_config: DownloadConfig,
|
||||
revision: str,
|
||||
) -> str:
|
||||
"""Download a single file from a ModelScope dataset repository."""
|
||||
api = _get_hub_api()
|
||||
_namespace, _dataset_name = repo_id.split('/')
|
||||
endpoint = api.get_endpoint_for_read(
|
||||
repo_id=repo_id, repo_type=REPO_TYPE_DATASET)
|
||||
if download_config and download_config.download_desc is None:
|
||||
download_config.download_desc = f'Downloading [{path_in_repo}]'
|
||||
try:
|
||||
url_or_filename = api.get_dataset_file_url(
|
||||
file_name=path_in_repo,
|
||||
dataset_name=_dataset_name,
|
||||
namespace=_namespace,
|
||||
revision=revision,
|
||||
extension_filter=False,
|
||||
endpoint=endpoint,
|
||||
)
|
||||
repo_file_path = cached_path(
|
||||
url_or_filename=url_or_filename, download_config=download_config)
|
||||
except FileNotFoundError as e:
|
||||
repo_file_path = ''
|
||||
logger.error(e)
|
||||
return repo_file_path
|
||||
|
||||
|
||||
# ===================================================================
|
||||
# Additional modules download (for script-based datasets)
|
||||
# ===================================================================
|
||||
|
||||
|
||||
def _download_additional_modules(
|
||||
name: str,
|
||||
dataset_name: str,
|
||||
namespace: str,
|
||||
revision: str,
|
||||
imports: Tuple[str, str, str, str],
|
||||
download_config: Optional[DownloadConfig],
|
||||
trust_remote_code: Optional[bool] = False,
|
||||
) -> List[Tuple[str, str]]:
|
||||
"""Download additional modules referenced by a dataset builder script.
|
||||
|
||||
Parses the import list produced by ``get_imports`` and downloads any
|
||||
internal (relative) or external modules. Library imports are validated
|
||||
but not downloaded.
|
||||
"""
|
||||
local_imports: List[Tuple[str, str]] = []
|
||||
library_imports: List[Tuple[str, str]] = []
|
||||
|
||||
has_remote_code = any(
|
||||
import_type in ('internal', 'external')
|
||||
for import_type, _, _, _ in imports)
|
||||
if has_remote_code and not trust_remote_code:
|
||||
raise ValueError(
|
||||
f'Loading {name} requires executing code from the repository. '
|
||||
'This is disabled by default for security reasons. '
|
||||
'If you trust the authors of this dataset, you can enable it with '
|
||||
'`trust_remote_code=True`.')
|
||||
|
||||
api = _get_hub_api()
|
||||
download_config = download_config.copy()
|
||||
if download_config.download_desc is None:
|
||||
download_config.download_desc = 'Downloading extra modules'
|
||||
|
||||
for import_type, import_name, import_path, sub_directory in imports:
|
||||
if import_type == 'library':
|
||||
library_imports.append((import_name, import_path))
|
||||
continue
|
||||
if import_name == name:
|
||||
raise ValueError(
|
||||
f'Error in the {name} script, importing relative {import_name} module '
|
||||
f'but {import_name} is the name of the script. '
|
||||
f"Please change relative import {import_name} to another name and add a '# From: URL_OR_PATH' "
|
||||
f'comment pointing to the original relative import file path.')
|
||||
if import_type == 'internal':
|
||||
file_name = import_path + '.py'
|
||||
url_or_filename = api.get_dataset_file_url(
|
||||
file_name=file_name,
|
||||
dataset_name=dataset_name,
|
||||
namespace=namespace,
|
||||
revision=revision,
|
||||
)
|
||||
elif import_type == 'external':
|
||||
url_or_filename = import_path
|
||||
else:
|
||||
raise ValueError('Wrong import_type')
|
||||
|
||||
local_import_path = cached_path(
|
||||
url_or_filename, download_config=download_config)
|
||||
if sub_directory is not None:
|
||||
local_import_path = os.path.join(local_import_path, sub_directory)
|
||||
local_imports.append((import_name, local_import_path))
|
||||
|
||||
# Validate library imports
|
||||
needs_to_be_installed = {}
|
||||
for library_import_name, library_import_path in library_imports:
|
||||
try:
|
||||
importlib.import_module(library_import_name)
|
||||
except ImportError:
|
||||
if library_import_name not in needs_to_be_installed or library_import_path != library_import_name:
|
||||
needs_to_be_installed[
|
||||
library_import_name] = library_import_path
|
||||
if needs_to_be_installed:
|
||||
_dependencies_str = 'dependencies' if len(
|
||||
needs_to_be_installed) > 1 else 'dependency'
|
||||
_them_str = 'them' if len(needs_to_be_installed) > 1 else 'it'
|
||||
if 'sklearn' in needs_to_be_installed:
|
||||
needs_to_be_installed['sklearn'] = 'scikit-learn'
|
||||
if 'Bio' in needs_to_be_installed:
|
||||
needs_to_be_installed['Bio'] = 'biopython'
|
||||
raise ImportError(
|
||||
f'To be able to use {name}, you need to install the following {_dependencies_str}: '
|
||||
f"{', '.join(needs_to_be_installed)}.\nPlease install {_them_str} using 'pip install "
|
||||
f"{' '.join(needs_to_be_installed.values())}' for instance.")
|
||||
return local_imports
|
||||
|
||||
|
||||
# ===================================================================
|
||||
# Module factory: script-based (Hub)
|
||||
# ===================================================================
|
||||
|
||||
|
||||
def _load_script_module(
|
||||
repo_id: str,
|
||||
revision: str,
|
||||
download_config: DownloadConfig,
|
||||
download_mode=None,
|
||||
dynamic_modules_path: Optional[str] = None,
|
||||
trust_remote_code: Optional[bool] = None,
|
||||
) -> DatasetModule:
|
||||
"""Shared implementation for loading a dataset module from a Hub .py script.
|
||||
|
||||
Used by both ``get_module_with_script`` (monkey-patch for datasets<4.0) and
|
||||
``_compat_hub_script_module`` (compat shim for datasets>=4.0).
|
||||
"""
|
||||
_namespace, _dataset_name = repo_id.split('/')
|
||||
script_file_name = f'{_dataset_name}.py'
|
||||
|
||||
local_script_path = _download_repo_file(
|
||||
repo_id=repo_id,
|
||||
path_in_repo=script_file_name,
|
||||
download_config=download_config,
|
||||
revision=revision,
|
||||
)
|
||||
if not local_script_path:
|
||||
raise FileNotFoundError(
|
||||
f'Cannot find {script_file_name} in {repo_id} at revision {revision}.'
|
||||
)
|
||||
|
||||
dataset_readme_path = _download_repo_file(
|
||||
repo_id=repo_id,
|
||||
path_in_repo='README.md',
|
||||
download_config=download_config,
|
||||
revision=revision,
|
||||
)
|
||||
|
||||
imports = get_imports(local_script_path)
|
||||
local_imports = _download_additional_modules(
|
||||
name=repo_id,
|
||||
dataset_name=_dataset_name,
|
||||
namespace=_namespace,
|
||||
revision=revision,
|
||||
imports=imports,
|
||||
download_config=download_config,
|
||||
trust_remote_code=trust_remote_code,
|
||||
)
|
||||
|
||||
additional_files = []
|
||||
if dataset_readme_path:
|
||||
additional_files.append(
|
||||
(config.REPOCARD_FILENAME, dataset_readme_path))
|
||||
|
||||
dynamic_modules_path = dynamic_modules_path or init_dynamic_modules()
|
||||
hash_val = files_to_hash([local_script_path]
|
||||
+ [loc[1] for loc in local_imports])
|
||||
importable_file_path = _get_importable_file_path(
|
||||
dynamic_modules_path=dynamic_modules_path,
|
||||
module_namespace='datasets',
|
||||
subdirectory_name=hash_val,
|
||||
name=repo_id,
|
||||
)
|
||||
if not os.path.exists(importable_file_path):
|
||||
trust = resolve_trust_remote_code(
|
||||
trust_remote_code=trust_remote_code, repo_id=repo_id)
|
||||
if trust:
|
||||
logger.warning(
|
||||
f'Use trust_remote_code=True. Will invoke codes from {repo_id}. '
|
||||
'Please make sure that you can trust the external codes.')
|
||||
_create_importable_file(
|
||||
local_path=local_script_path,
|
||||
local_imports=local_imports,
|
||||
additional_files=additional_files,
|
||||
dynamic_modules_path=dynamic_modules_path,
|
||||
module_namespace='datasets',
|
||||
subdirectory_name=hash_val,
|
||||
name=repo_id,
|
||||
download_mode=download_mode,
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
f'Loading {repo_id} requires executing the dataset script in that'
|
||||
' repo on your local machine. Make sure you have read the code there to avoid malicious use, then'
|
||||
' set the option `trust_remote_code=True` to remove this error.'
|
||||
)
|
||||
module_path, hash_val = _load_importable_file(
|
||||
dynamic_modules_path=dynamic_modules_path,
|
||||
module_namespace='datasets',
|
||||
subdirectory_name=hash_val,
|
||||
name=repo_id,
|
||||
)
|
||||
importlib.invalidate_caches()
|
||||
|
||||
api = _get_hub_api()
|
||||
builder_kwargs = {
|
||||
'base_path': api.get_file_base_path(repo_id=repo_id),
|
||||
'repo_id': repo_id,
|
||||
}
|
||||
return DatasetModule(module_path, hash_val, builder_kwargs)
|
||||
|
||||
|
||||
def get_module_with_script(self) -> DatasetModule:
|
||||
"""Monkey-patch target for ``HubDatasetModuleFactoryWithScript.get_module`` (datasets<4.0)."""
|
||||
repo_id: str = self.name
|
||||
revision = self.download_config.storage_options.get(
|
||||
'revision', None) or DEFAULT_DATASET_REVISION
|
||||
return _load_script_module(
|
||||
repo_id=repo_id,
|
||||
revision=revision,
|
||||
download_config=self.download_config,
|
||||
download_mode=self.download_mode,
|
||||
dynamic_modules_path=self.dynamic_modules_path
|
||||
if self.dynamic_modules_path else None,
|
||||
trust_remote_code=self.trust_remote_code,
|
||||
)
|
||||
|
||||
|
||||
def _compat_hub_script_module(
|
||||
path,
|
||||
revision=None,
|
||||
download_config=None,
|
||||
download_mode=None,
|
||||
dynamic_modules_path=None,
|
||||
trust_remote_code=None,
|
||||
) -> DatasetModule:
|
||||
"""Load a dataset module from a Hub repo .py script (compat for datasets>=4.0)."""
|
||||
return _load_script_module(
|
||||
repo_id=path,
|
||||
revision=revision or DEFAULT_DATASET_REVISION,
|
||||
download_config=download_config or DownloadConfig(),
|
||||
download_mode=download_mode,
|
||||
dynamic_modules_path=dynamic_modules_path,
|
||||
trust_remote_code=trust_remote_code,
|
||||
)
|
||||
|
||||
|
||||
# ===================================================================
|
||||
# Module factory: script-based (local)
|
||||
# ===================================================================
|
||||
|
||||
|
||||
def _compat_local_script_module(
|
||||
path,
|
||||
download_mode=None,
|
||||
dynamic_modules_path=None,
|
||||
trust_remote_code=None,
|
||||
) -> DatasetModule:
|
||||
"""Load a dataset module from a local .py script (compat for datasets>=4.0)."""
|
||||
local_path = path
|
||||
name = Path(path).stem
|
||||
|
||||
local_imports: List[Tuple[str, str]] = []
|
||||
imports = get_imports(local_path)
|
||||
for import_type, import_name, import_path, sub_directory in imports:
|
||||
if import_type == 'library':
|
||||
continue
|
||||
if import_type == 'internal':
|
||||
rel_path = os.path.join(
|
||||
os.path.dirname(local_path), import_path + '.py')
|
||||
if os.path.isfile(rel_path):
|
||||
local_imports.append((import_name, rel_path))
|
||||
elif os.path.isdir(
|
||||
os.path.join(os.path.dirname(local_path), import_path)):
|
||||
local_imports.append(
|
||||
(import_name,
|
||||
os.path.join(os.path.dirname(local_path), import_path)))
|
||||
elif import_type == 'external':
|
||||
dl_config = DownloadConfig()
|
||||
dl_config.download_desc = 'Downloading extra modules'
|
||||
local_import_path = cached_path(
|
||||
import_path, download_config=dl_config)
|
||||
if sub_directory is not None:
|
||||
local_import_path = os.path.join(local_import_path,
|
||||
sub_directory)
|
||||
local_imports.append((import_name, local_import_path))
|
||||
|
||||
dynamic_modules_path = dynamic_modules_path or init_dynamic_modules()
|
||||
hash_val = files_to_hash([local_path] + [loc[1] for loc in local_imports])
|
||||
importable_file_path = _get_importable_file_path(
|
||||
dynamic_modules_path=dynamic_modules_path,
|
||||
module_namespace='datasets',
|
||||
subdirectory_name=hash_val,
|
||||
name=name,
|
||||
)
|
||||
if not os.path.exists(importable_file_path):
|
||||
trust = resolve_trust_remote_code(trust_remote_code, name)
|
||||
if trust:
|
||||
_create_importable_file(
|
||||
local_path=local_path,
|
||||
local_imports=local_imports,
|
||||
additional_files=[],
|
||||
dynamic_modules_path=dynamic_modules_path,
|
||||
module_namespace='datasets',
|
||||
subdirectory_name=hash_val,
|
||||
name=name,
|
||||
download_mode=download_mode,
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
f'Loading {name} requires executing the dataset script. '
|
||||
'Set `trust_remote_code=True` to allow this.')
|
||||
module_path, hash_val = _load_importable_file(
|
||||
dynamic_modules_path=dynamic_modules_path,
|
||||
module_namespace='datasets',
|
||||
subdirectory_name=hash_val,
|
||||
name=name,
|
||||
)
|
||||
importlib.invalidate_caches()
|
||||
builder_kwargs = {
|
||||
'base_path': str(Path(path).resolve().parent),
|
||||
}
|
||||
return DatasetModule(module_path, hash_val, builder_kwargs)
|
||||
|
||||
|
||||
# ===================================================================
|
||||
# Module factory: without script (Hub)
|
||||
# ===================================================================
|
||||
|
||||
|
||||
def get_module_without_script(self) -> DatasetModule:
|
||||
"""Monkey-patch target for ``HubDatasetModuleFactoryWithoutScript.get_module``."""
|
||||
revision = self.download_config.storage_options.get(
|
||||
'revision', None) or DEFAULT_DATASET_REVISION
|
||||
base_path = f"hf://datasets/{self.name}@{revision}/{self.data_dir or ''}".rstrip(
|
||||
'/')
|
||||
|
||||
repo_id: str = self.name
|
||||
download_config = self.download_config.copy()
|
||||
|
||||
dataset_readme_path = _download_repo_file(
|
||||
repo_id=repo_id,
|
||||
path_in_repo='README.md',
|
||||
download_config=download_config,
|
||||
revision=revision,
|
||||
)
|
||||
|
||||
dataset_card_data = DatasetCard.load(
|
||||
Path(dataset_readme_path
|
||||
)).data if dataset_readme_path else DatasetCardData()
|
||||
subset_name: str = download_config.storage_options.get('name', None)
|
||||
|
||||
metadata_configs = MetadataConfigs.from_dataset_card_data(
|
||||
dataset_card_data)
|
||||
dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
|
||||
|
||||
if self.data_files is not None:
|
||||
patterns = sanitize_patterns(self.data_files)
|
||||
elif metadata_configs and 'data_files' in next(
|
||||
iter(metadata_configs.values())):
|
||||
if subset_name is not None:
|
||||
subset_data_files = metadata_configs[subset_name]['data_files']
|
||||
else:
|
||||
subset_data_files = next(iter(
|
||||
metadata_configs.values()))['data_files']
|
||||
patterns = sanitize_patterns(subset_data_files)
|
||||
else:
|
||||
patterns = _get_data_patterns(
|
||||
base_path, download_config=self.download_config)
|
||||
|
||||
data_files_dict = DataFilesDict.from_patterns(
|
||||
patterns,
|
||||
base_path=base_path,
|
||||
allowed_extensions=ALL_ALLOWED_EXTENSIONS,
|
||||
download_config=self.download_config,
|
||||
)
|
||||
module_name, default_builder_kwargs = infer_module_for_data_files(
|
||||
data_files=data_files_dict,
|
||||
path=self.name,
|
||||
download_config=self.download_config,
|
||||
)
|
||||
|
||||
if hasattr(data_files_dict, 'filter'):
|
||||
data_files_dict = data_files_dict.filter(
|
||||
extensions=_MODULE_TO_EXTENSIONS[module_name])
|
||||
else:
|
||||
data_files_dict = data_files_dict.filter_extensions(
|
||||
_MODULE_TO_EXTENSIONS[module_name])
|
||||
|
||||
module_path, _ = _PACKAGED_DATASETS_MODULES[module_name]
|
||||
|
||||
if metadata_configs:
|
||||
supports_metadata = module_name in {'imagefolder', 'audiofolder'}
|
||||
create_builder_signature = inspect.signature(
|
||||
create_builder_configs_from_metadata_configs)
|
||||
in_args = {
|
||||
'module_path': module_path,
|
||||
'metadata_configs': metadata_configs,
|
||||
'base_path': base_path,
|
||||
'default_builder_kwargs': default_builder_kwargs,
|
||||
'download_config': self.download_config,
|
||||
}
|
||||
if 'supports_metadata' in create_builder_signature.parameters:
|
||||
in_args['supports_metadata'] = supports_metadata
|
||||
builder_configs, default_config_name = create_builder_configs_from_metadata_configs(
|
||||
**in_args)
|
||||
else:
|
||||
builder_configs: List[BuilderConfig] = [
|
||||
import_main_class(module_path).BUILDER_CONFIG_CLASS(
|
||||
data_files=data_files_dict,
|
||||
**default_builder_kwargs,
|
||||
)
|
||||
]
|
||||
default_config_name = None
|
||||
|
||||
api = _get_hub_api()
|
||||
endpoint = api.get_endpoint_for_read(
|
||||
repo_id=repo_id, repo_type=REPO_TYPE_DATASET)
|
||||
|
||||
builder_kwargs = {
|
||||
'base_path':
|
||||
api.get_file_base_path(repo_id=repo_id, endpoint=endpoint),
|
||||
'repo_id': self.name,
|
||||
'dataset_name': camelcase_to_snakecase(Path(self.name).name),
|
||||
'data_files': data_files_dict,
|
||||
}
|
||||
download_config = self.download_config.copy()
|
||||
if download_config.download_desc is None:
|
||||
download_config.download_desc = 'Downloading metadata'
|
||||
|
||||
if default_config_name is None and len(dataset_infos) == 1:
|
||||
default_config_name = next(iter(dataset_infos))
|
||||
|
||||
return DatasetModule(
|
||||
module_path,
|
||||
revision,
|
||||
builder_kwargs,
|
||||
dataset_infos=dataset_infos,
|
||||
builder_configs_parameters=BuilderConfigsParameters(
|
||||
metadata_configs=metadata_configs,
|
||||
builder_configs=builder_configs,
|
||||
default_config_name=default_config_name,
|
||||
),
|
||||
)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,6 +1,6 @@
|
||||
addict
|
||||
attrs
|
||||
datasets>=4.0.0,<=4.6.1
|
||||
datasets>=4.0.0,<=4.8.4
|
||||
einops
|
||||
oss2
|
||||
Pillow
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
addict
|
||||
attrs
|
||||
datasets>=4.0.0,<=4.6.1
|
||||
datasets>=4.0.0,<=4.8.4
|
||||
einops
|
||||
Pillow
|
||||
python-dateutil>=2.1
|
||||
|
||||
Reference in New Issue
Block a user