mirror of https://github.com/modelscope/modelscope.git (synced 2025-12-16 16:27:45 +01:00)

fix features for datasets<=3.6.0
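
Background for the change: datasets 4.x serializes the `Sequence` feature type as `List` in dataset metadata, so feature specs written by newer versions carry '_type': 'List' entries that datasets<=3.6.0 cannot deserialize. A minimal sketch of the failure mode (hypothetical spec; the error text is the one raised by the stock generate_from_dict, quoted in the patch below):

    # Running under datasets<=3.6.0, without this patch:
    from datasets.features.features import generate_from_dict

    spec = {'tokens': {'_type': 'List', 'feature': {'_type': 'Value', 'dtype': 'string'}}}
    generate_from_dict(spec)
    # ValueError: Feature type 'List' not found. Available feature types: [...]
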
@@ -6,9 +6,10 @@ import contextlib
 import inspect
 import os
 import warnings
 from dataclasses import dataclass, field, fields
 from functools import partial
 from pathlib import Path
-from typing import Dict, Iterable, List, Mapping, Optional, Sequence, Union, Tuple, Literal
+from typing import Dict, Iterable, List, Mapping, Optional, Sequence, Union, Tuple, Literal, Any, ClassVar
 
 from urllib.parse import urlencode
+
@@ -16,7 +17,9 @@ import requests
 from datasets import (BuilderConfig, Dataset, DatasetBuilder, DatasetDict,
                       DownloadConfig, DownloadManager, DownloadMode, Features,
                       IterableDataset, IterableDatasetDict, Split,
-                      VerificationMode, Version, config, data_files)
+                      VerificationMode, Version, config, data_files, LargeList, Sequence as SequenceHf)
+from datasets.features import features
+from datasets.features.features import _FEATURE_TYPES
 from datasets.data_files import (
     FILES_TO_IGNORE, DataFilesDict, EmptyDatasetError,
     _get_data_files_patterns, _is_inside_unrequested_special_dir,
@@ -49,6 +52,7 @@ from datasets.utils.info_utils import is_small_dataset
 from datasets.utils.metadata import MetadataConfigs
 from datasets.utils.py_utils import get_imports
+from datasets.utils.track import tracked_str
 
 from fsspec import filesystem
 from fsspec.core import _un_chain
 from fsspec.utils import stringify_path
@@ -62,7 +66,7 @@ from modelscope import HubApi
 from modelscope.hub.utils.utils import get_endpoint
 from modelscope.msdatasets.utils.hf_file_utils import get_from_cache_ms
 from modelscope.utils.config_ds import MS_DATASETS_CACHE
-from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE, DEFAULT_DATASET_REVISION, REPO_TYPE_DATASET
+from modelscope.utils.constant import DEFAULT_DATASET_REVISION, REPO_TYPE_DATASET
 from modelscope.utils.import_utils import has_attr_in_class
 from modelscope.utils.logger import get_logger
 
@@ -89,6 +93,76 @@ ExpandDatasetProperty_T = Literal[
 ]
 
 
+# Patch datasets features
+@dataclass(repr=False)
+class ListMs(SequenceHf):
"""Feature type for large list data composed of child feature data type.
|
||||
|
||||
It is backed by `pyarrow.ListType`, which uses 32-bit offsets or a fixed length.
|
||||
|
||||
Args:
|
||||
feature ([`FeatureType`]):
|
||||
Child feature data type of each item within the large list.
|
||||
length (optional `int`, default to -1):
|
||||
Length of the list if it is fixed.
|
||||
Defaults to -1 which means an arbitrary length.
|
||||
"""
|
||||
|
||||
+    feature: Any
+    length: int = -1
+    id: Optional[str] = field(default=None, repr=False)
+    # Automatically constructed
+    pa_type: ClassVar[Any] = None
+    _type: str = field(default='List', init=False, repr=False)
+
+    def __repr__(self):
+        if self.length != -1:
+            return f'{type(self).__name__}({self.feature}, length={self.length})'
+        else:
+            return f'{type(self).__name__}({self.feature})'
+
+
+_FEATURE_TYPES['List'] = ListMs
+
+
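
With the alias registered above, a '_type': 'List' spec resolves through _FEATURE_TYPES to ListMs, and since ListMs subclasses Sequence, the existing 3.x feature machinery handles it unchanged; _type defaulting to 'List' keeps re-serialization compatible with the newer format. A minimal check (hypothetical usage, with the names defined in this patch in scope):

    spec = {'_type': 'List', 'feature': {'_type': 'Value', 'dtype': 'int64'}}
    feat = generate_from_dict_ms(spec)
    print(feat)                          # ListMs(Value(dtype='int64', id=None))
    print(isinstance(feat, SequenceHf))  # True
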
+def generate_from_dict_ms(obj: Any):
+    """Regenerate the nested feature object from a deserialized dict.
+    We use the '_type' fields to get the dataclass name to load.
+
+    generate_from_dict is the recursive helper for Features.from_dict, and allows for a convenient constructor syntax
+    to define features from deserialized JSON dictionaries. This function is used in particular when deserializing
+    a :class:`DatasetInfo` that was dumped to a JSON object. This acts as an analogue to
+    :meth:`Features.from_arrow_schema` and handles the recursive field-by-field instantiation, but doesn't require any
+    mapping to/from pyarrow, except for the fact that it takes advantage of the mapping of pyarrow primitive dtypes
+    that :class:`Value` automatically performs.
+    """
+    # Nested structures: we allow dict, list/tuples, sequences
+    if isinstance(obj, list):
+        return [generate_from_dict_ms(value) for value in obj]
+    # Otherwise we have a dict or a dataclass
+    if '_type' not in obj or isinstance(obj['_type'], dict):
+        return {key: generate_from_dict_ms(value) for key, value in obj.items()}
+    obj = dict(obj)
+    _type = obj.pop('_type')
+    class_type = _FEATURE_TYPES.get(_type, None) or globals().get(_type, None)
+
+    if class_type is None:
+        raise ValueError(f"Feature type '{_type}' not found. Available feature types: {list(_FEATURE_TYPES.keys())}")
+
+    if class_type == LargeList:
+        feature = obj.pop('feature')
+        return LargeList(generate_from_dict_ms(feature), **obj)
+    if class_type == ListMs:
+        feature = obj.pop('feature')
+        return ListMs(generate_from_dict_ms(feature), **obj)
+    if class_type == SequenceHf:  # backward compatibility, this translates to a List or a dict
+        feature = obj.pop('feature')
+        return SequenceHf(feature=generate_from_dict_ms(feature), **obj)
+
+    field_names = {f.name for f in fields(class_type)}
+    return class_type(**{k: v for k, v in obj.items() if k in field_names})
+
+
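
The deserializer recurses through plain dicts and lists and resolves each '_type' tag against _FEATURE_TYPES, with explicit branches for the container types whose 'feature' child must itself be deserialized. A hypothetical spec mixing old and new type names:

    spec = {
        'scores': {'_type': 'LargeList', 'feature': {'_type': 'Value', 'dtype': 'float32'}},
        'old_style': {'_type': 'Sequence', 'feature': {'_type': 'Value', 'dtype': 'string'}, 'length': -1},
        'new_style': {'_type': 'List', 'feature': {'_type': 'Value', 'dtype': 'string'}},
    }
    feats = generate_from_dict_ms(spec)
    # {'scores': LargeList(...), 'old_style': Sequence(...), 'new_style': ListMs(...)}
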
 def _download_ms(self, url_or_filename: str, download_config: DownloadConfig) -> str:
     url_or_filename = str(url_or_filename)
     # for temp val
@@ -1377,6 +1451,7 @@ def load_dataset_with_ctx(*args, **kwargs):
     resolve_pattern_origin = data_files.resolve_pattern
     get_module_without_script_origin = HubDatasetModuleFactoryWithoutScript.get_module
     get_module_with_script_origin = HubDatasetModuleFactoryWithScript.get_module
+    generate_from_dict_origin = features.generate_from_dict
 
     # Monkey patching with modelscope functions
     config.HF_ENDPOINT = get_endpoint()
@@ -1392,6 +1467,7 @@ def load_dataset_with_ctx(*args, **kwargs):
     data_files.resolve_pattern = _resolve_pattern
     HubDatasetModuleFactoryWithoutScript.get_module = get_module_without_script
     HubDatasetModuleFactoryWithScript.get_module = get_module_with_script
+    features.generate_from_dict = generate_from_dict_ms
 
     streaming = kwargs.get('streaming', False)
 
@@ -1402,6 +1478,7 @@ def load_dataset_with_ctx(*args, **kwargs):
         # Restore the original functions
         config.HF_ENDPOINT = hf_endpoint_origin
         file_utils.get_from_cache = get_from_cache_origin
+        features.generate_from_dict = generate_from_dict_origin
         # Keep the context during the streaming iteration
         if not streaming:
            config.HF_ENDPOINT = hf_endpoint_origin
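
The hunks above follow the usual save/patch/restore pattern inside load_dataset_with_ctx: capture the stock datasets functions, install the modelscope replacements, and put the originals back once loading is done (per the in-source comment, the endpoint patch is only reverted when not streaming, so a streaming iterator keeps its context). The same idea in isolation, as a minimal sketch rather than the actual implementation:

    import contextlib

    from datasets.features import features

    @contextlib.contextmanager
    def ms_features_patch():
        origin = features.generate_from_dict                  # save
        features.generate_from_dict = generate_from_dict_ms   # patch
        try:
            yield
        finally:
            features.generate_from_dict = origin              # always restore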