From d968a2cf8726ec3208a161625b058fbbb7712623 Mon Sep 17 00:00:00 2001 From: "Xingjun.Wang" Date: Wed, 13 May 2026 17:46:20 +0800 Subject: [PATCH] [Fix] Fix split detection (#1714) --- .../msdatasets/utils/hf_datasets_util.py | 125 ++++++++++++++++-- 1 file changed, 113 insertions(+), 12 deletions(-) diff --git a/modelscope/msdatasets/utils/hf_datasets_util.py b/modelscope/msdatasets/utils/hf_datasets_util.py index 40167606..8c89ecc9 100644 --- a/modelscope/msdatasets/utils/hf_datasets_util.py +++ b/modelscope/msdatasets/utils/hf_datasets_util.py @@ -473,6 +473,70 @@ def _hf_fs_open(self, path, mode='rb', **kwargs): return _hf_fs_open_original(self, path, mode=mode, **kwargs) +class _DryRunDownloadManager: + """Minimal download-manager stub for split discovery without I/O. + + Returns placeholder paths for all download/extract calls, allowing + _split_generators() to execute and return SplitGenerator objects + (which carry split names) without triggering actual network or disk I/O. + """ + + _PLACEHOLDER = os.devnull + + def download(self, url_or_urls): + return self._map(url_or_urls) + + def download_and_extract(self, url_or_urls): + return self._map(url_or_urls) + + def extract(self, path_or_paths): + return self._map(path_or_paths) + + def download_custom(self, url_or_urls, custom_download): + return self._map(url_or_urls) + + def iter_archive(self, path): + return iter([]) + + def iter_files(self, paths): + return iter([]) + + @property + def manual_dir(self): + return None + + @property + def is_streaming(self): + return False + + def _map(self, input_): + if isinstance(input_, dict): + return {k: self._PLACEHOLDER for k in input_} + if isinstance(input_, (list, tuple, set)): + return type(input_)(self._PLACEHOLDER for _ in input_) + return self._PLACEHOLDER + + +def _discover_splits_from_builder(builder_instance): + """Discover available split names by dry-running _split_generators(). + + For script-based datasets that lack README split metadata, this calls + the builder's _split_generators() with a stub download manager to + extract split names without performing any actual downloads. + + Returns: + A set of split name strings, or an empty set if discovery fails. + """ + if not hasattr(builder_instance, '_split_generators'): + return set() + try: + generators = builder_instance._split_generators(_DryRunDownloadManager()) + return {str(sg.name) for sg in generators} + except Exception as e: + logger.debug(f'Failed to discover splits from builder: {e}') + return set() + + def _validate_split_exists(builder_instance, split): """Fail-fast check: raise ValueError before downloading if the requested split does not exist in the dataset metadata. @@ -493,20 +557,29 @@ def _validate_split_exists(builder_instance, split): if not split_names: return - # Prefer info.splits (original metadata); fall back to data_files keys + # Source 1: info.splits (original metadata from README) available = set() info = getattr(builder_instance, 'info', None) if info is not None and info.splits: available = set(info.splits.keys()) + # Source 2: config.data_files keys if not available: config = getattr(builder_instance, 'config', None) data_files = getattr(config, 'data_files', None) if isinstance(data_files, dict): available = set(data_files.keys()) + # Source 3: dry-run _split_generators() for script-based datasets if not available: - return # Cannot determine available splits; let downstream handle + available = _discover_splits_from_builder(builder_instance) + + if not available: + logger.debug( + 'Cannot determine available splits from dataset metadata; ' + 'split validation skipped. Invalid splits will be caught downstream.' + ) + return missing = split_names - available if missing: @@ -642,30 +715,58 @@ class DatasetsWrapperHF: if dataset_info_only: ret_dict = {} + + # Case 1: Local .py script file if isinstance(path, str) and path.endswith('.py') and os.path.exists(path): from datasets import get_dataset_config_names subset_list = get_dataset_config_names(path) ret_dict = {_subset: [] for _subset in subset_list} return ret_dict - if builder_instance is None or not hasattr(builder_instance, - 'builder_configs'): - logger.error(f'No builder_configs found for {path} dataset.') + if builder_instance is None: + logger.error(f'No builder instance created for {path} dataset.') return ret_dict - _tmp_builder_configs = builder_instance.builder_configs - for tmp_config_name, tmp_builder_config in _tmp_builder_configs.items(): - tmp_config_name = str(tmp_config_name) - if hasattr(tmp_builder_config, 'data_files') and tmp_builder_config.data_files is not None: - ret_dict[tmp_config_name] = [str(item) for item in list(tmp_builder_config.data_files.keys())] + # Case 2: Try builder_configs with data_files (packaged datasets) + _tmp_builder_configs = getattr(builder_instance, 'builder_configs', None) + if _tmp_builder_configs: + if hasattr(_tmp_builder_configs, 'items'): + configs_iter = _tmp_builder_configs.items() else: - ret_dict[tmp_config_name] = [] + configs_iter = [(getattr(c, 'name', 'default'), c) for c in _tmp_builder_configs] + for tmp_config_name, tmp_builder_config in configs_iter: + tmp_config_name = str(tmp_config_name) + if hasattr(tmp_builder_config, 'data_files') and tmp_builder_config.data_files is not None: + ret_dict[tmp_config_name] = [str(item) for item in list(tmp_builder_config.data_files.keys())] + + # Case 3: Fallback for script datasets — use info.splits or dry-run discovery + if not ret_dict or all(not v for v in ret_dict.values()): + config_name = getattr(builder_instance, 'config_name', 'default') or 'default' + splits = [] + + # Try info.splits (from README metadata) + info = getattr(builder_instance, 'info', None) + if info is not None and info.splits: + splits = sorted(info.splits.keys()) + + # Fallback: dry-run _split_generators() + if not splits: + discovered = _discover_splits_from_builder(builder_instance) + if discovered: + splits = sorted(discovered) + + if splits: + ret_dict = {config_name: splits} + elif not ret_dict: + ret_dict = {config_name: []} + return ret_dict + _validate_split_exists(builder_instance, split) + if streaming: return builder_instance.as_streaming_dataset(split=split) - _validate_split_exists(builder_instance, split) _align_builder_splits_with_data_files(builder_instance, split) builder_instance.download_and_prepare(