[Fix] Fix split detection (#1714)

This commit is contained in:
Xingjun.Wang
2026-05-13 17:46:20 +08:00
committed by GitHub
parent a56e2ce08f
commit d968a2cf87

View File

@@ -473,6 +473,70 @@ def _hf_fs_open(self, path, mode='rb', **kwargs):
return _hf_fs_open_original(self, path, mode=mode, **kwargs)
class _DryRunDownloadManager:
"""Minimal download-manager stub for split discovery without I/O.
Returns placeholder paths for all download/extract calls, allowing
_split_generators() to execute and return SplitGenerator objects
(which carry split names) without triggering actual network or disk I/O.
"""
_PLACEHOLDER = os.devnull
def download(self, url_or_urls):
return self._map(url_or_urls)
def download_and_extract(self, url_or_urls):
return self._map(url_or_urls)
def extract(self, path_or_paths):
return self._map(path_or_paths)
def download_custom(self, url_or_urls, custom_download):
return self._map(url_or_urls)
def iter_archive(self, path):
return iter([])
def iter_files(self, paths):
return iter([])
@property
def manual_dir(self):
return None
@property
def is_streaming(self):
return False
def _map(self, input_):
if isinstance(input_, dict):
return {k: self._PLACEHOLDER for k in input_}
if isinstance(input_, (list, tuple, set)):
return type(input_)(self._PLACEHOLDER for _ in input_)
return self._PLACEHOLDER
def _discover_splits_from_builder(builder_instance):
"""Discover available split names by dry-running _split_generators().
For script-based datasets that lack README split metadata, this calls
the builder's _split_generators() with a stub download manager to
extract split names without performing any actual downloads.
Returns:
A set of split name strings, or an empty set if discovery fails.
"""
if not hasattr(builder_instance, '_split_generators'):
return set()
try:
generators = builder_instance._split_generators(_DryRunDownloadManager())
return {str(sg.name) for sg in generators}
except Exception as e:
logger.debug(f'Failed to discover splits from builder: {e}')
return set()
def _validate_split_exists(builder_instance, split):
"""Fail-fast check: raise ValueError before downloading if the
requested split does not exist in the dataset metadata.
@@ -493,20 +557,29 @@ def _validate_split_exists(builder_instance, split):
if not split_names:
return
# Prefer info.splits (original metadata); fall back to data_files keys
# Source 1: info.splits (original metadata from README)
available = set()
info = getattr(builder_instance, 'info', None)
if info is not None and info.splits:
available = set(info.splits.keys())
# Source 2: config.data_files keys
if not available:
config = getattr(builder_instance, 'config', None)
data_files = getattr(config, 'data_files', None)
if isinstance(data_files, dict):
available = set(data_files.keys())
# Source 3: dry-run _split_generators() for script-based datasets
if not available:
return # Cannot determine available splits; let downstream handle
available = _discover_splits_from_builder(builder_instance)
if not available:
logger.debug(
'Cannot determine available splits from dataset metadata; '
'split validation skipped. Invalid splits will be caught downstream.'
)
return
missing = split_names - available
if missing:
@@ -642,30 +715,58 @@ class DatasetsWrapperHF:
if dataset_info_only:
ret_dict = {}
# Case 1: Local .py script file
if isinstance(path, str) and path.endswith('.py') and os.path.exists(path):
from datasets import get_dataset_config_names
subset_list = get_dataset_config_names(path)
ret_dict = {_subset: [] for _subset in subset_list}
return ret_dict
if builder_instance is None or not hasattr(builder_instance,
'builder_configs'):
logger.error(f'No builder_configs found for {path} dataset.')
if builder_instance is None:
logger.error(f'No builder instance created for {path} dataset.')
return ret_dict
_tmp_builder_configs = builder_instance.builder_configs
for tmp_config_name, tmp_builder_config in _tmp_builder_configs.items():
tmp_config_name = str(tmp_config_name)
if hasattr(tmp_builder_config, 'data_files') and tmp_builder_config.data_files is not None:
ret_dict[tmp_config_name] = [str(item) for item in list(tmp_builder_config.data_files.keys())]
# Case 2: Try builder_configs with data_files (packaged datasets)
_tmp_builder_configs = getattr(builder_instance, 'builder_configs', None)
if _tmp_builder_configs:
if hasattr(_tmp_builder_configs, 'items'):
configs_iter = _tmp_builder_configs.items()
else:
ret_dict[tmp_config_name] = []
configs_iter = [(getattr(c, 'name', 'default'), c) for c in _tmp_builder_configs]
for tmp_config_name, tmp_builder_config in configs_iter:
tmp_config_name = str(tmp_config_name)
if hasattr(tmp_builder_config, 'data_files') and tmp_builder_config.data_files is not None:
ret_dict[tmp_config_name] = [str(item) for item in list(tmp_builder_config.data_files.keys())]
# Case 3: Fallback for script datasets — use info.splits or dry-run discovery
if not ret_dict or all(not v for v in ret_dict.values()):
config_name = getattr(builder_instance, 'config_name', 'default') or 'default'
splits = []
# Try info.splits (from README metadata)
info = getattr(builder_instance, 'info', None)
if info is not None and info.splits:
splits = sorted(info.splits.keys())
# Fallback: dry-run _split_generators()
if not splits:
discovered = _discover_splits_from_builder(builder_instance)
if discovered:
splits = sorted(discovered)
if splits:
ret_dict = {config_name: splits}
elif not ret_dict:
ret_dict = {config_name: []}
return ret_dict
_validate_split_exists(builder_instance, split)
if streaming:
return builder_instance.as_streaming_dataset(split=split)
_validate_split_exists(builder_instance, split)
_align_builder_splits_with_data_files(builder_instance, split)
builder_instance.download_and_prepare(