Upgrade datasets (#1393)

* upgrade datasets to 3.6.0

* fix mcp lint
Xingjun.Wang
2025-07-04 10:59:07 +08:00
committed by GitHub
parent 88406c17ad
commit f4217e6183
6 changed files with 117 additions and 117 deletions

View File

@@ -2,13 +2,13 @@
 # yapf: disable
 import datetime
+import fnmatch
 import functools
 import io
 import os
 import pickle
 import platform
 import re
-import fnmatch
 import shutil
 import tempfile
 import uuid
@@ -86,6 +86,7 @@ from modelscope.utils.thread_utils import thread_executor
 logger = get_logger()
+
 class HubApi:
     """Model hub api interface.
     """
@@ -313,7 +314,6 @@ class HubApi:
         else:
             raise_for_http_status(r)
-
     def get_endpoint_for_read(self,
                               repo_id: str,
                               *,
@@ -846,7 +846,7 @@ class HubApi:
                         model_id: str,
                         revision: Optional[str] = DEFAULT_MODEL_REVISION,
                         root: Optional[str] = None,
-                        recursive: Optional[str] = False,
+                        recursive: Optional[bool] = False,
                         use_cookies: Union[bool, CookieJar] = False,
                         headers: Optional[dict] = {},
                         endpoint: Optional[str] = None) -> List[dict]:
@@ -856,7 +856,7 @@ class HubApi:
            model_id (str): The model id
            revision (Optional[str], optional): The branch or tag name.
            root (Optional[str], optional): The root path. Defaults to None.
-           recursive (Optional[str], optional): Is recursive list files. Defaults to False.
+           recursive (Optional[bool], optional): Is recursive list files. Defaults to False.
            use_cookies (Union[bool, CookieJar], optional): If is cookieJar, we will use this cookie, if True,
                will load cookie from local. Defaults to False.
            headers: request headers
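
The recursive flag above was annotated as Optional[str] but has always been used as a boolean; the fix aligns the annotation and docstring with actual usage. A minimal usage sketch of the corrected signature (the repo id is a placeholder, not from this PR):

    from modelscope.hub.api import HubApi

    api = HubApi()
    # recursive=True walks the full repo tree; each entry carries a 'Path' key,
    # as the list comprehensions later in this diff rely on.
    files = api.get_model_files('my-namespace/my-model', recursive=True)
    print([f['Path'] for f in files])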
@@ -2144,12 +2144,22 @@ class HubApi:
         # List all files in the repo
         if repo_type == REPO_TYPE_MODEL:
-            files = self.get_model_files(repo_id, revision=revision or DEFAULT_MODEL_REVISION, recursive=True, endpoint=endpoint)
+            files = self.get_model_files(
+                repo_id,
+                revision=revision or DEFAULT_MODEL_REVISION,
+                recursive=True,
+                endpoint=endpoint
+            )
             file_list = [f['Path'] for f in files]
         else:
             namespace, dataset_name = repo_id.split('/')
             dataset_hub_id, _ = self.get_dataset_id_and_type(dataset_name, namespace, endpoint=endpoint)
-            dataset_info = self.get_dataset_infos(dataset_hub_id, revision or DEFAULT_DATASET_REVISION, recursive='True', endpoint=endpoint)
+            dataset_info = self.get_dataset_infos(
+                dataset_hub_id,
+                revision or DEFAULT_DATASET_REVISION,
+                recursive='True',
+                endpoint=endpoint
+            )
             files = dataset_info.get('Data', {}).get('Files', [])
             file_list = [f['Path'] for f in files]
@@ -2166,16 +2176,16 @@ class HubApi:
         try:
             if repo_type == REPO_TYPE_MODEL:
                 owner, repo_name = repo_id.split('/')
-                url = f"{endpoint}/api/v1/models/{owner}/{repo_name}/file"
+                url = f'{endpoint}/api/v1/models/{owner}/{repo_name}/file'
                 params = {
-                    "Revision": revision or DEFAULT_MODEL_REVISION,
-                    "FilePath": path
+                    'Revision': revision or DEFAULT_MODEL_REVISION,
+                    'FilePath': path
                 }
             else:
                 owner, dataset_name = repo_id.split('/')
-                url = f"{endpoint}/api/v1/datasets/{owner}/{dataset_name}/repo"
+                url = f'{endpoint}/api/v1/datasets/{owner}/{dataset_name}/repo'
                 params = {
-                    "FilePath": path
+                    'FilePath': path
                 }
             r = self.session.delete(url, params=params, cookies=cookies, headers=headers)
             raise_for_http_status(r)
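
For orientation, the DELETE issued by the model branch above has roughly the following shape; the endpoint value, revision, and file path are placeholders, and the real method additionally sends session cookies and builder headers:

    import requests

    endpoint = 'https://www.modelscope.cn'  # assumed default ModelScope endpoint
    owner, repo_name = 'my-namespace', 'my-model'  # placeholders
    url = f'{endpoint}/api/v1/models/{owner}/{repo_name}/file'
    params = {'Revision': 'master', 'FilePath': 'config.json'}  # placeholders
    r = requests.delete(url, params=params)  # real call adds cookies/headers
    r.raise_for_status()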
@@ -2193,7 +2203,6 @@ class HubApi:
         }
-
 class ModelScopeConfig:
     path_credential = expanduser(DEFAULT_CREDENTIALS_PATH)
     COOKIES_FILE_NAME = 'cookies'

View File

@@ -1,13 +1,15 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Optional
+
 import requests
-from typing import Any, Dict, Optional

 from modelscope.hub.errors import raise_for_http_status
-from .types import McpFilter, validate_mcp_filter, validate_filter_params
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()

 # MCP API path suffix
-MCP_API_PATH = "/openapi/v1"
+MCP_API_PATH = '/openapi/v1'

 class McpApi:
@@ -27,15 +29,13 @@ class McpApi:
         self.builder_headers = base_api.builder_headers
         self.headers = base_api.headers
-    def get_mcp_servers(
-            self,
-            token: str,
-            filter: dict = None,
-            page_number: int = 1,
-            page_size: int = 20,
-            search: str = "",
-            endpoint: Optional[str] = None
-    ) -> dict:
+    def get_mcp_servers(self,
+                        token: str,
+                        filter: dict = None,
+                        page_number: int = 1,
+                        page_size: int = 20,
+                        search: str = '',
+                        endpoint: Optional[str] = None) -> dict:
         """
         Get MCP server list
@@ -59,15 +59,15 @@ class McpApi:
         """
         if not endpoint:
             endpoint = self.endpoint
-        url = f"{endpoint}/mcp/servers"
+        url = f'{endpoint}/mcp/servers'
         headers = self.builder_headers(self.headers)
-        headers["Authorization"] = f"Bearer {token}"
+        headers['Authorization'] = f'Bearer {token}'
         body = {
-            "filter": filter or {},
-            "page_number": page_number,
-            "page_size": page_size,
-            "search": search
+            'filter': filter or {},
+            'page_number': page_number,
+            'page_size': page_size,
+            'search': search
         }
         r = self.session.put(url, headers=headers, json=body)
@@ -76,27 +76,26 @@ class McpApi:
         try:
             resp = r.json()
         except requests.exceptions.JSONDecodeError:
-            print("JSON parsing failed")
-            print("Response content:", r.text)
+            logger.error(
+                f'Failed to parse JSON response from MCP server list API: {r.text}'
+            )
             raise
-        data = resp.get("data", {})
-        mcp_server_list = data.get("mcp_server_list", [])
-        server_brief_list = [
-            {"name": item.get("name", ""), "description": item.get("description", "")}
-            for item in mcp_server_list
-        ]
+        data = resp.get('data', {})
+        mcp_server_list = data.get('mcp_server_list', [])
+        server_brief_list = [{
+            'name': item.get('name', ''),
+            'description': item.get('description', '')
+        } for item in mcp_server_list]
         return {
-            "mcp_server_list": mcp_server_list,
-            "total_count": data.get("total_count", 0),
-            "server_brief_list": server_brief_list
+            'mcp_server_list': mcp_server_list,
+            'total_count': data.get('total_count', 0),
+            'server_brief_list': server_brief_list
         }
-    def get_mcp_server_operational(
-            self,
-            token: str,
-            endpoint: Optional[str] = None
-    ) -> dict:
+    def get_mcp_server_operational(self,
+                                   token: str,
+                                   endpoint: Optional[str] = None) -> dict:
         """
         Get user-hosted MCP server list
@@ -112,40 +111,38 @@ class McpApi:
         """
         if not endpoint:
             endpoint = self.endpoint
-        url = f"{endpoint}/mcp/servers/operational"
+        url = f'{endpoint}/mcp/servers/operational'
         headers = self.builder_headers(self.headers)
-        headers["Authorization"] = f"Bearer {token}"
+        headers['Authorization'] = f'Bearer {token}'
         r = self.session.get(url, headers=headers)
         raise_for_http_status(r)
-        print(r.status_code)
         try:
             resp = r.json()
         except requests.exceptions.JSONDecodeError:
-            print("JSON parsing failed")
-            print("Response content:", r.text)
+            logger.error(
+                f'Failed to parse JSON response from MCP server operational API: {r.text}'
+            )
             raise
-        data = resp.get("data", {})
-        mcp_server_list = data.get("mcp_server_list", [])
-        server_brief_list = [
-            {"name": item.get("name", ""), "description": item.get("description", "")}
-            for item in mcp_server_list
-        ]
+        data = resp.get('data', {})
+        mcp_server_list = data.get('mcp_server_list', [])
+        server_brief_list = [{
+            'name': item.get('name', ''),
+            'description': item.get('description', '')
+        } for item in mcp_server_list]
         return {
-            "mcp_server_list": mcp_server_list,
-            "total_count": data.get("total_count", 0),
-            "server_brief_list": server_brief_list
+            'mcp_server_list': mcp_server_list,
+            'total_count': data.get('total_count', 0),
+            'server_brief_list': server_brief_list
         }
-    def get_mcp_server_special(
-            self,
-            server_id: str,
-            token: str,
-            get_operational_url: bool = False,
-            endpoint: Optional[str] = None
-    ) -> dict:
+    def get_mcp_server_special(self,
+                               server_id: str,
+                               token: str,
+                               get_operational_url: bool = False,
+                               endpoint: Optional[str] = None) -> dict:
         """
         Get specific MCP server details
@@ -160,10 +157,12 @@ class McpApi:
         """
         if not endpoint:
             endpoint = self.endpoint
-        url = f"{endpoint}/mcp/servers/{server_id}"
+        url = f'{endpoint}/mcp/servers/{server_id}'
         headers = self.builder_headers(self.headers)
-        headers["Authorization"] = f"Bearer {token}"
-        params = {"get_operational_url": str(get_operational_url).lower()} if get_operational_url else {}
+        headers['Authorization'] = f'Bearer {token}'
+        params = {
+            'get_operational_url': str(get_operational_url).lower()
+        } if get_operational_url else {}
         r = self.session.get(url, headers=headers, params=params)
         raise_for_http_status(r)
@@ -171,7 +170,8 @@ class McpApi:
         try:
             resp = r.json()
         except requests.exceptions.JSONDecodeError:
-            print("JSON parsing failed")
-            print("Response content:", r.text)
+            logger.error(
+                f'Failed to parse JSON response from MCP server special API: {r.text}'
+            )
             raise
-        return resp.get("data", {})
+        return resp.get('data', {})
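
The methods above return plain dicts whose keys are confirmed by their return statements. A minimal usage sketch, assuming McpApi wraps an existing HubApi instance (as the attribute copies in its constructor suggest) and that the module path is modelscope.hub.mcp_api; the token is a placeholder:

    from modelscope.hub.api import HubApi
    from modelscope.hub.mcp_api import McpApi  # assumed module path

    mcp = McpApi(HubApi())  # assumed constructor: wraps an existing HubApi
    result = mcp.get_mcp_servers(token='YOUR_MODELSCOPE_TOKEN', page_size=5)
    for brief in result['server_brief_list']:
        print(brief['name'], '-', brief['description'])
    print('total:', result['total_count'])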

View File

@@ -3,6 +3,7 @@
 # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
 import importlib
 import contextlib
+import inspect
 import os
 import warnings
 from functools import partial
@@ -17,9 +18,9 @@ from datasets import (BuilderConfig, Dataset, DatasetBuilder, DatasetDict,
                       IterableDataset, IterableDatasetDict, Split,
                       VerificationMode, Version, config, data_files)
 from datasets.data_files import (
-    FILES_TO_IGNORE, DataFilesDict, DataFilesList, EmptyDatasetError,
+    FILES_TO_IGNORE, DataFilesDict, EmptyDatasetError,
     _get_data_files_patterns, _is_inside_unrequested_special_dir,
-    _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir, get_metadata_patterns, sanitize_patterns)
+    _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir, sanitize_patterns)
 from datasets.download.streaming_download_manager import (
     _prepare_path_and_storage_options, xbasename, xjoin)
 from datasets.exceptions import DataFilesNotFoundError, DatasetNotFoundError
@@ -37,7 +38,6 @@ from datasets.load import (
     init_dynamic_modules)
 from datasets.naming import camelcase_to_snakecase
 from datasets.packaged_modules import (_EXTENSION_TO_MODULE,
-                                       _MODULE_SUPPORTS_METADATA,
                                        _MODULE_TO_EXTENSIONS,
                                        _PACKAGED_DATASETS_MODULES)
 from datasets.utils import file_utils
@@ -625,38 +625,29 @@ def get_module_without_script(self) -> DatasetModule:
             path=self.name,
             download_config=self.download_config,
         )
-        data_files = data_files.filter_extensions(
-            _MODULE_TO_EXTENSIONS[module_name])
-        # Collect metadata files if the module supports them
-        supports_metadata = module_name in _MODULE_SUPPORTS_METADATA
-        if self.data_files is None and supports_metadata:
-            try:
-                metadata_patterns = get_metadata_patterns(
-                    base_path, download_config=self.download_config)
-            except FileNotFoundError:
-                metadata_patterns = None
-            if metadata_patterns is not None:
-                metadata_data_files_list = DataFilesList.from_patterns(
-                    metadata_patterns,
-                    download_config=self.download_config,
-                    base_path=base_path)
-                if metadata_data_files_list:
-                    data_files = DataFilesDict({
-                        split: data_files_list + metadata_data_files_list
-                        for split, data_files_list in data_files.items()
-                    })
+        if hasattr(data_files, 'filter'):
+            data_files = data_files.filter(extensions=_MODULE_TO_EXTENSIONS[module_name])
+        else:
+            data_files = data_files.filter_extensions(_MODULE_TO_EXTENSIONS[module_name])
         module_path, _ = _PACKAGED_DATASETS_MODULES[module_name]
         if metadata_configs:
-            builder_configs, default_config_name = create_builder_configs_from_metadata_configs(
-                module_path,
-                metadata_configs,
-                base_path=base_path,
-                supports_metadata=supports_metadata,
-                default_builder_kwargs=default_builder_kwargs,
-                download_config=self.download_config,
-            )
+            supports_metadata = module_name in {'imagefolder', 'audiofolder'}
+            create_builder_signature = inspect.signature(create_builder_configs_from_metadata_configs)
+            in_args = {
+                'module_path': module_path,
+                'metadata_configs': metadata_configs,
+                'base_path': base_path,
+                'default_builder_kwargs': default_builder_kwargs,
+                'download_config': self.download_config,
+            }
+            if 'supports_metadata' in create_builder_signature.parameters:
+                in_args['supports_metadata'] = supports_metadata
+            builder_configs, default_config_name = create_builder_configs_from_metadata_configs(**in_args)
         else:
             builder_configs: List[BuilderConfig] = [
                 import_main_class(module_path).BUILDER_CONFIG_CLASS(
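
Both shims above use feature detection rather than version checks: hasattr handles the filter_extensions-to-filter rename on the data-files object, and inspect.signature handles the supports_metadata parameter, which the shim no longer assumes exists. A distilled, generic sketch of the keyword-filtering half (the helper name is ours, not part of the PR):

    import inspect

    def call_with_supported_kwargs(func, **kwargs):
        # Forward only the keyword arguments func explicitly declares;
        # assumes func does not accept arbitrary **kwargs itself.
        accepted = inspect.signature(func).parameters
        return func(**{k: v for k, v in kwargs.items() if k in accepted})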

View File

@@ -1,6 +1,6 @@
 addict
 attrs
-datasets>=3.0.0,<=3.2.0
+datasets>=3.0.0,<=3.6.0
 einops
 oss2
 Pillow

View File

@@ -1,6 +1,6 @@
 addict
 attrs
-datasets>=3.0.0,<=3.2.0
+datasets>=3.0.0,<=3.6.0
 einops
 Pillow
 python-dateutil>=2.1
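
With the pin raised, a quick sanity check that the resolved environment satisfies the new bound (uses packaging, a common transitive dependency; an assumption here):

    import datasets
    from packaging.version import Version

    v = Version(datasets.__version__)
    assert Version('3.0.0') <= v <= Version('3.6.0'), f'unsupported datasets {v}'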