From f4217e61832503c579f88776cb90444add0ab5e2 Mon Sep 17 00:00:00 2001 From: "Xingjun.Wang" Date: Fri, 4 Jul 2025 10:59:07 +0800 Subject: [PATCH] Upgrade datasets (#1393) * upgrade datasets to 3.6.0 * fix mcp lint --- modelscope/hub/api.py | 33 ++-- modelscope/hub/mcp/__init__.py | 2 +- modelscope/hub/mcp/api.py | 142 +++++++++--------- .../msdatasets/utils/hf_datasets_util.py | 53 +++---- requirements/datasets.txt | 2 +- requirements/framework.txt | 2 +- 6 files changed, 117 insertions(+), 117 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 6fe0897e..ff423838 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -2,13 +2,13 @@ # yapf: disable import datetime +import fnmatch import functools import io import os import pickle import platform import re -import fnmatch import shutil import tempfile import uuid @@ -86,6 +86,7 @@ from modelscope.utils.thread_utils import thread_executor logger = get_logger() + class HubApi: """Model hub api interface. """ @@ -313,7 +314,6 @@ class HubApi: else: raise_for_http_status(r) - def get_endpoint_for_read(self, repo_id: str, *, @@ -846,7 +846,7 @@ class HubApi: model_id: str, revision: Optional[str] = DEFAULT_MODEL_REVISION, root: Optional[str] = None, - recursive: Optional[str] = False, + recursive: Optional[bool] = False, use_cookies: Union[bool, CookieJar] = False, headers: Optional[dict] = {}, endpoint: Optional[str] = None) -> List[dict]: @@ -856,7 +856,7 @@ class HubApi: model_id (str): The model id revision (Optional[str], optional): The branch or tag name. root (Optional[str], optional): The root path. Defaults to None. - recursive (Optional[str], optional): Is recursive list files. Defaults to False. + recursive (Optional[bool], optional): Is recursive list files. Defaults to False. use_cookies (Union[bool, CookieJar], optional): If is cookieJar, we will use this cookie, if True, will load cookie from local. Defaults to False. 
headers: request headers @@ -2144,12 +2144,22 @@ class HubApi: # List all files in the repo if repo_type == REPO_TYPE_MODEL: - files = self.get_model_files(repo_id, revision=revision or DEFAULT_MODEL_REVISION, recursive=True, endpoint=endpoint) + files = self.get_model_files( + repo_id, + revision=revision or DEFAULT_MODEL_REVISION, + recursive=True, + endpoint=endpoint + ) file_list = [f['Path'] for f in files] else: namespace, dataset_name = repo_id.split('/') dataset_hub_id, _ = self.get_dataset_id_and_type(dataset_name, namespace, endpoint=endpoint) - dataset_info = self.get_dataset_infos(dataset_hub_id, revision or DEFAULT_DATASET_REVISION, recursive='True', endpoint=endpoint) + dataset_info = self.get_dataset_infos( + dataset_hub_id, + revision or DEFAULT_DATASET_REVISION, + recursive='True', + endpoint=endpoint + ) files = dataset_info.get('Data', {}).get('Files', []) file_list = [f['Path'] for f in files] @@ -2166,16 +2176,16 @@ class HubApi: try: if repo_type == REPO_TYPE_MODEL: owner, repo_name = repo_id.split('/') - url = f"{endpoint}/api/v1/models/{owner}/{repo_name}/file" + url = f'{endpoint}/api/v1/models/{owner}/{repo_name}/file' params = { - "Revision": revision or DEFAULT_MODEL_REVISION, - "FilePath": path + 'Revision': revision or DEFAULT_MODEL_REVISION, + 'FilePath': path } else: owner, dataset_name = repo_id.split('/') - url = f"{endpoint}/api/v1/datasets/{owner}/{dataset_name}/repo" + url = f'{endpoint}/api/v1/datasets/{owner}/{dataset_name}/repo' params = { - "FilePath": path + 'FilePath': path } r = self.session.delete(url, params=params, cookies=cookies, headers=headers) raise_for_http_status(r) @@ -2193,7 +2203,6 @@ class HubApi: } - class ModelScopeConfig: path_credential = expanduser(DEFAULT_CREDENTIALS_PATH) COOKIES_FILE_NAME = 'cookies' diff --git a/modelscope/hub/mcp/__init__.py b/modelscope/hub/mcp/__init__.py index d88e0816..1f1e7ed5 100644 --- a/modelscope/hub/mcp/__init__.py +++ b/modelscope/hub/mcp/__init__.py @@ -2,4 +2,4 @@ from .api import McpApi -__all__ = ['McpApi'] +__all__ = ['McpApi'] diff --git a/modelscope/hub/mcp/api.py b/modelscope/hub/mcp/api.py index 6555009e..1308b061 100644 --- a/modelscope/hub/mcp/api.py +++ b/modelscope/hub/mcp/api.py @@ -1,22 +1,24 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import Optional import requests -from typing import Any, Dict, Optional from modelscope.hub.errors import raise_for_http_status -from .types import McpFilter, validate_mcp_filter, validate_filter_params +from modelscope.utils.logger import get_logger + +logger = get_logger() # MCP API path suffix -MCP_API_PATH = "/openapi/v1" +MCP_API_PATH = '/openapi/v1' class McpApi: """MCP (Model Context Protocol) API interface class""" - + def __init__(self, base_api): """ Initialize MCP API - + Args: base_api: HubApi instance for accessing basic API functionality """ @@ -27,18 +29,16 @@ class McpApi: self.builder_headers = base_api.builder_headers self.headers = base_api.headers - def get_mcp_servers( - self, - token: str, - filter: dict = None, - page_number: int = 1, - page_size: int = 20, - search: str = "", - endpoint: Optional[str] = None - ) -> dict: + def get_mcp_servers(self, + token: str, + filter: dict = None, + page_number: int = 1, + page_size: int = 20, + search: str = '', + endpoint: Optional[str] = None) -> dict: """ Get MCP server list - + Args: token: Authentication token filter: Filter condition dictionary containing the following sub-branches: @@ -50,7 +50,7 @@ class McpApi: page_size: Page size, defaults to 20 search: Search keyword, defaults to empty string endpoint: API endpoint, defaults to MCP-specific endpoint (inherited from HubApi + /openapi/v1) - + Returns: dict: Dictionary containing MCP server list - mcp_server_list: Detailed MCP server list @@ -59,51 +59,50 @@ class McpApi: """ if not endpoint: endpoint = self.endpoint - url = f"{endpoint}/mcp/servers" + url = f'{endpoint}/mcp/servers' headers = self.builder_headers(self.headers) - headers["Authorization"] = f"Bearer {token}" + headers['Authorization'] = f'Bearer {token}' body = { - "filter": filter or {}, - "page_number": page_number, - "page_size": page_size, - "search": search + 'filter': filter or {}, + 'page_number': page_number, + 'page_size': page_size, + 'search': search } r = self.session.put(url, headers=headers, json=body) raise_for_http_status(r) - + try: resp = r.json() except requests.exceptions.JSONDecodeError: - print("JSON parsing failed") - print("Response content:", r.text) + logger.error( + f'Failed to parse JSON response from MCP server list API: {r.text}' + ) raise - data = resp.get("data", {}) - mcp_server_list = data.get("mcp_server_list", []) - server_brief_list = [ - {"name": item.get("name", ""), "description": item.get("description", "")} - for item in mcp_server_list - ] + data = resp.get('data', {}) + mcp_server_list = data.get('mcp_server_list', []) + server_brief_list = [{ + 'name': item.get('name', ''), + 'description': item.get('description', '') + } for item in mcp_server_list] return { - "mcp_server_list": mcp_server_list, - "total_count": data.get("total_count", 0), - "server_brief_list": server_brief_list + 'mcp_server_list': mcp_server_list, + 'total_count': data.get('total_count', 0), + 'server_brief_list': server_brief_list } - def get_mcp_server_operational( - self, - token: str, - endpoint: Optional[str] = None - ) -> dict: + def get_mcp_server_operational(self, + token: str, + endpoint: Optional[str] = None) -> dict: """ Get user-hosted MCP server list - + Args: token: Authentication token endpoint: API endpoint, defaults to MCP-specific endpoint (inherited from HubApi + /openapi/v1) - + Returns: dict: Dictionary containing MCP server list - mcp_server_list: Detailed MCP server list @@ -112,58 +111,58 @@ class McpApi: """ if not endpoint: endpoint = self.endpoint - 
url = f"{endpoint}/mcp/servers/operational" + url = f'{endpoint}/mcp/servers/operational' headers = self.builder_headers(self.headers) - headers["Authorization"] = f"Bearer {token}" + headers['Authorization'] = f'Bearer {token}' r = self.session.get(url, headers=headers) raise_for_http_status(r) - print(r.status_code) try: resp = r.json() except requests.exceptions.JSONDecodeError: - print("JSON parsing failed") - print("Response content:", r.text) + logger.error( + f'Failed to parse JSON response from MCP server operational API: {r.text}' + ) raise - data = resp.get("data", {}) - mcp_server_list = data.get("mcp_server_list", []) - server_brief_list = [ - {"name": item.get("name", ""), "description": item.get("description", "")} - for item in mcp_server_list - ] + data = resp.get('data', {}) + mcp_server_list = data.get('mcp_server_list', []) + server_brief_list = [{ + 'name': item.get('name', ''), + 'description': item.get('description', '') + } for item in mcp_server_list] return { - "mcp_server_list": mcp_server_list, - "total_count": data.get("total_count", 0), - "server_brief_list": server_brief_list + 'mcp_server_list': mcp_server_list, + 'total_count': data.get('total_count', 0), + 'server_brief_list': server_brief_list } - - def get_mcp_server_special( - self, - server_id: str, - token: str, - get_operational_url: bool = False, - endpoint: Optional[str] = None - ) -> dict: + + def get_mcp_server_special(self, + server_id: str, + token: str, + get_operational_url: bool = False, + endpoint: Optional[str] = None) -> dict: """ Get specific MCP server details - + Args: server_id: Server ID token: Authentication token get_operational_url: Whether to get operational URL, defaults to False endpoint: API endpoint, defaults to MCP-specific endpoint (inherited from HubApi + /openapi/v1) - + Returns: dict: Dictionary containing MCP server details """ if not endpoint: endpoint = self.endpoint - url = f"{endpoint}/mcp/servers/{server_id}" + url = f'{endpoint}/mcp/servers/{server_id}' headers = self.builder_headers(self.headers) - headers["Authorization"] = f"Bearer {token}" - params = {"get_operational_url": str(get_operational_url).lower()} if get_operational_url else {} + headers['Authorization'] = f'Bearer {token}' + params = { + 'get_operational_url': str(get_operational_url).lower() + } if get_operational_url else {} r = self.session.get(url, headers=headers, params=params) raise_for_http_status(r) @@ -171,7 +170,8 @@ class McpApi: try: resp = r.json() except requests.exceptions.JSONDecodeError: - print("JSON parsing failed") - print("Response content:", r.text) + logger.error( + f'Failed to parse JSON response from MCP server special API: {r.text}' + ) raise - return resp.get("data", {}) + return resp.get('data', {}) diff --git a/modelscope/msdatasets/utils/hf_datasets_util.py b/modelscope/msdatasets/utils/hf_datasets_util.py index 33a60db7..a76bfbce 100644 --- a/modelscope/msdatasets/utils/hf_datasets_util.py +++ b/modelscope/msdatasets/utils/hf_datasets_util.py @@ -3,6 +3,7 @@ # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors. 
import importlib import contextlib +import inspect import os import warnings from functools import partial @@ -17,9 +18,9 @@ from datasets import (BuilderConfig, Dataset, DatasetBuilder, DatasetDict, IterableDataset, IterableDatasetDict, Split, VerificationMode, Version, config, data_files) from datasets.data_files import ( - FILES_TO_IGNORE, DataFilesDict, DataFilesList, EmptyDatasetError, + FILES_TO_IGNORE, DataFilesDict, EmptyDatasetError, _get_data_files_patterns, _is_inside_unrequested_special_dir, - _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir, get_metadata_patterns, sanitize_patterns) + _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir, sanitize_patterns) from datasets.download.streaming_download_manager import ( _prepare_path_and_storage_options, xbasename, xjoin) from datasets.exceptions import DataFilesNotFoundError, DatasetNotFoundError @@ -37,7 +38,6 @@ from datasets.load import ( init_dynamic_modules) from datasets.naming import camelcase_to_snakecase from datasets.packaged_modules import (_EXTENSION_TO_MODULE, - _MODULE_SUPPORTS_METADATA, _MODULE_TO_EXTENSIONS, _PACKAGED_DATASETS_MODULES) from datasets.utils import file_utils @@ -625,38 +625,29 @@ def get_module_without_script(self) -> DatasetModule: path=self.name, download_config=self.download_config, ) - data_files = data_files.filter_extensions( - _MODULE_TO_EXTENSIONS[module_name]) - # Collect metadata files if the module supports them - supports_metadata = module_name in _MODULE_SUPPORTS_METADATA - if self.data_files is None and supports_metadata: - try: - metadata_patterns = get_metadata_patterns( - base_path, download_config=self.download_config) - except FileNotFoundError: - metadata_patterns = None - if metadata_patterns is not None: - metadata_data_files_list = DataFilesList.from_patterns( - metadata_patterns, - download_config=self.download_config, - base_path=base_path) - if metadata_data_files_list: - data_files = DataFilesDict({ - split: data_files_list + metadata_data_files_list - for split, data_files_list in data_files.items() - }) + + if hasattr(data_files, 'filter'): + data_files = data_files.filter(extensions=_MODULE_TO_EXTENSIONS[module_name]) + else: + data_files = data_files.filter_extensions(_MODULE_TO_EXTENSIONS[module_name]) module_path, _ = _PACKAGED_DATASETS_MODULES[module_name] if metadata_configs: - builder_configs, default_config_name = create_builder_configs_from_metadata_configs( - module_path, - metadata_configs, - base_path=base_path, - supports_metadata=supports_metadata, - default_builder_kwargs=default_builder_kwargs, - download_config=self.download_config, - ) + + supports_metadata = module_name in {'imagefolder', 'audiofolder'} + create_builder_signature = inspect.signature(create_builder_configs_from_metadata_configs) + in_args = { + 'module_path': module_path, + 'metadata_configs': metadata_configs, + 'base_path': base_path, + 'default_builder_kwargs': default_builder_kwargs, + 'download_config': self.download_config, + } + if 'supports_metadata' in create_builder_signature.parameters: + in_args['supports_metadata'] = supports_metadata + + builder_configs, default_config_name = create_builder_configs_from_metadata_configs(**in_args) else: builder_configs: List[BuilderConfig] = [ import_main_class(module_path).BUILDER_CONFIG_CLASS( diff --git a/requirements/datasets.txt b/requirements/datasets.txt index 3c6bac2c..d225d0de 100644 --- a/requirements/datasets.txt +++ b/requirements/datasets.txt @@ -1,6 +1,6 @@ addict attrs -datasets>=3.0.0,<=3.2.0 
+datasets>=3.0.0,<=3.6.0 einops oss2 Pillow diff --git a/requirements/framework.txt b/requirements/framework.txt index d9330342..e15b1b03 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -1,6 +1,6 @@ addict attrs -datasets>=3.0.0,<=3.2.0 +datasets>=3.0.0,<=3.6.0 einops Pillow python-dateutil>=2.1
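
A few usage sketches follow, one per area the patch touches. First, the corrected get_model_files signature (recursive is now typed Optional[bool] instead of Optional[str]). A minimal sketch: 'namespace/model_name' is a placeholder repo id, and only the 'Path' key is assumed because the patch itself indexes the returned entries with f['Path'].

from modelscope.hub.api import HubApi

api = HubApi()
# 'namespace/model_name' is a placeholder; substitute a real repo id.
files = api.get_model_files(
    'namespace/model_name',
    recursive=True,  # now typed Optional[bool] rather than Optional[str]
)
# Each entry is a dict; the patch reads the relative path from the 'Path' key.
file_list = [f['Path'] for f in files]
print(file_list)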
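
The new McpApi wraps the MCP endpoints on top of an existing HubApi instance, which supplies the session, endpoint, and headers. A hedged sketch based only on the signatures visible in the diff: the token value is a placeholder, and the filter keys are illustrative, since the docstring's concrete filter sub-fields are cut off by the hunk context.

from modelscope.hub.api import HubApi
from modelscope.hub.mcp import McpApi

api = HubApi()
mcp_api = McpApi(api)  # reuses the HubApi session, endpoint, and headers

result = mcp_api.get_mcp_servers(
    token='YOUR_MODELSCOPE_TOKEN',  # placeholder access token
    filter={'category': 'search'},  # illustrative key; real sub-fields are not shown in the diff
    page_number=1,
    page_size=20,
    search='',
)
# The keys below come straight from the return dict built in the patch.
for brief in result['server_brief_list']:
    print(brief['name'], '-', brief['description'])
print('total:', result['total_count'])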
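
Finally, the hf_datasets_util.py hunk keeps a single code path working across datasets releases by introspecting create_builder_configs_from_metadata_configs and only passing supports_metadata when the installed version still accepts it (the same reason _MODULE_SUPPORTS_METADATA is replaced by a hard-coded {'imagefolder', 'audiofolder'} set). A generic sketch of that pattern, using a hypothetical helper name:

import inspect

def call_with_supported_kwargs(func, **kwargs):
    """Hypothetical helper mirroring the patch's inspect.signature check:
    build the full argument set, then drop anything the callee does not
    accept before invoking it."""
    params = inspect.signature(func).parameters
    accepts_var_kw = any(
        p.kind is inspect.Parameter.VAR_KEYWORD for p in params.values())
    if not accepts_var_kw:
        kwargs = {name: value for name, value in kwargs.items() if name in params}
    return func(**kwargs)

# Example: 'supports_metadata' is forwarded when the callee still declares it
# and silently dropped otherwise, which is what the patch's signature check
# guards against on newer datasets releases.
def old_api(base_path, supports_metadata=False):
    return ('old', base_path, supports_metadata)

def new_api(base_path):
    return ('new', base_path)

print(call_with_supported_kwargs(old_api, base_path='.', supports_metadata=True))
print(call_with_supported_kwargs(new_api, base_path='.', supports_metadata=True))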