mirror of
https://github.com/modelscope/modelscope.git
synced 2026-05-18 05:05:00 +02:00
[Fix] Fix split detection (#1714)
This commit is contained in:
@@ -473,6 +473,70 @@ def _hf_fs_open(self, path, mode='rb', **kwargs):
|
||||
return _hf_fs_open_original(self, path, mode=mode, **kwargs)
|
||||
|
||||
|
||||
class _DryRunDownloadManager:
|
||||
"""Minimal download-manager stub for split discovery without I/O.
|
||||
|
||||
Returns placeholder paths for all download/extract calls, allowing
|
||||
_split_generators() to execute and return SplitGenerator objects
|
||||
(which carry split names) without triggering actual network or disk I/O.
|
||||
"""
|
||||
|
||||
_PLACEHOLDER = os.devnull
|
||||
|
||||
def download(self, url_or_urls):
|
||||
return self._map(url_or_urls)
|
||||
|
||||
def download_and_extract(self, url_or_urls):
|
||||
return self._map(url_or_urls)
|
||||
|
||||
def extract(self, path_or_paths):
|
||||
return self._map(path_or_paths)
|
||||
|
||||
def download_custom(self, url_or_urls, custom_download):
|
||||
return self._map(url_or_urls)
|
||||
|
||||
def iter_archive(self, path):
|
||||
return iter([])
|
||||
|
||||
def iter_files(self, paths):
|
||||
return iter([])
|
||||
|
||||
@property
|
||||
def manual_dir(self):
|
||||
return None
|
||||
|
||||
@property
|
||||
def is_streaming(self):
|
||||
return False
|
||||
|
||||
def _map(self, input_):
|
||||
if isinstance(input_, dict):
|
||||
return {k: self._PLACEHOLDER for k in input_}
|
||||
if isinstance(input_, (list, tuple, set)):
|
||||
return type(input_)(self._PLACEHOLDER for _ in input_)
|
||||
return self._PLACEHOLDER
|
||||
|
||||
|
||||
def _discover_splits_from_builder(builder_instance):
|
||||
"""Discover available split names by dry-running _split_generators().
|
||||
|
||||
For script-based datasets that lack README split metadata, this calls
|
||||
the builder's _split_generators() with a stub download manager to
|
||||
extract split names without performing any actual downloads.
|
||||
|
||||
Returns:
|
||||
A set of split name strings, or an empty set if discovery fails.
|
||||
"""
|
||||
if not hasattr(builder_instance, '_split_generators'):
|
||||
return set()
|
||||
try:
|
||||
generators = builder_instance._split_generators(_DryRunDownloadManager())
|
||||
return {str(sg.name) for sg in generators}
|
||||
except Exception as e:
|
||||
logger.debug(f'Failed to discover splits from builder: {e}')
|
||||
return set()
|
||||
|
||||
|
||||
def _validate_split_exists(builder_instance, split):
|
||||
"""Fail-fast check: raise ValueError before downloading if the
|
||||
requested split does not exist in the dataset metadata.
|
||||
@@ -493,20 +557,29 @@ def _validate_split_exists(builder_instance, split):
|
||||
if not split_names:
|
||||
return
|
||||
|
||||
# Prefer info.splits (original metadata); fall back to data_files keys
|
||||
# Source 1: info.splits (original metadata from README)
|
||||
available = set()
|
||||
info = getattr(builder_instance, 'info', None)
|
||||
if info is not None and info.splits:
|
||||
available = set(info.splits.keys())
|
||||
|
||||
# Source 2: config.data_files keys
|
||||
if not available:
|
||||
config = getattr(builder_instance, 'config', None)
|
||||
data_files = getattr(config, 'data_files', None)
|
||||
if isinstance(data_files, dict):
|
||||
available = set(data_files.keys())
|
||||
|
||||
# Source 3: dry-run _split_generators() for script-based datasets
|
||||
if not available:
|
||||
return # Cannot determine available splits; let downstream handle
|
||||
available = _discover_splits_from_builder(builder_instance)
|
||||
|
||||
if not available:
|
||||
logger.debug(
|
||||
'Cannot determine available splits from dataset metadata; '
|
||||
'split validation skipped. Invalid splits will be caught downstream.'
|
||||
)
|
||||
return
|
||||
|
||||
missing = split_names - available
|
||||
if missing:
|
||||
@@ -642,30 +715,58 @@ class DatasetsWrapperHF:
|
||||
|
||||
if dataset_info_only:
|
||||
ret_dict = {}
|
||||
|
||||
# Case 1: Local .py script file
|
||||
if isinstance(path, str) and path.endswith('.py') and os.path.exists(path):
|
||||
from datasets import get_dataset_config_names
|
||||
subset_list = get_dataset_config_names(path)
|
||||
ret_dict = {_subset: [] for _subset in subset_list}
|
||||
return ret_dict
|
||||
|
||||
if builder_instance is None or not hasattr(builder_instance,
|
||||
'builder_configs'):
|
||||
logger.error(f'No builder_configs found for {path} dataset.')
|
||||
if builder_instance is None:
|
||||
logger.error(f'No builder instance created for {path} dataset.')
|
||||
return ret_dict
|
||||
|
||||
_tmp_builder_configs = builder_instance.builder_configs
|
||||
for tmp_config_name, tmp_builder_config in _tmp_builder_configs.items():
|
||||
tmp_config_name = str(tmp_config_name)
|
||||
if hasattr(tmp_builder_config, 'data_files') and tmp_builder_config.data_files is not None:
|
||||
ret_dict[tmp_config_name] = [str(item) for item in list(tmp_builder_config.data_files.keys())]
|
||||
# Case 2: Try builder_configs with data_files (packaged datasets)
|
||||
_tmp_builder_configs = getattr(builder_instance, 'builder_configs', None)
|
||||
if _tmp_builder_configs:
|
||||
if hasattr(_tmp_builder_configs, 'items'):
|
||||
configs_iter = _tmp_builder_configs.items()
|
||||
else:
|
||||
ret_dict[tmp_config_name] = []
|
||||
configs_iter = [(getattr(c, 'name', 'default'), c) for c in _tmp_builder_configs]
|
||||
for tmp_config_name, tmp_builder_config in configs_iter:
|
||||
tmp_config_name = str(tmp_config_name)
|
||||
if hasattr(tmp_builder_config, 'data_files') and tmp_builder_config.data_files is not None:
|
||||
ret_dict[tmp_config_name] = [str(item) for item in list(tmp_builder_config.data_files.keys())]
|
||||
|
||||
# Case 3: Fallback for script datasets — use info.splits or dry-run discovery
|
||||
if not ret_dict or all(not v for v in ret_dict.values()):
|
||||
config_name = getattr(builder_instance, 'config_name', 'default') or 'default'
|
||||
splits = []
|
||||
|
||||
# Try info.splits (from README metadata)
|
||||
info = getattr(builder_instance, 'info', None)
|
||||
if info is not None and info.splits:
|
||||
splits = sorted(info.splits.keys())
|
||||
|
||||
# Fallback: dry-run _split_generators()
|
||||
if not splits:
|
||||
discovered = _discover_splits_from_builder(builder_instance)
|
||||
if discovered:
|
||||
splits = sorted(discovered)
|
||||
|
||||
if splits:
|
||||
ret_dict = {config_name: splits}
|
||||
elif not ret_dict:
|
||||
ret_dict = {config_name: []}
|
||||
|
||||
return ret_dict
|
||||
|
||||
_validate_split_exists(builder_instance, split)
|
||||
|
||||
if streaming:
|
||||
return builder_instance.as_streaming_dataset(split=split)
|
||||
|
||||
_validate_split_exists(builder_instance, split)
|
||||
_align_builder_splits_with_data_files(builder_instance, split)
|
||||
|
||||
builder_instance.download_and_prepare(
|
||||
|
||||
Reference in New Issue
Block a user