Upgrade datasets (#1393)

* upgrade datasets to 3.6.0

* fix mcp lint
Xingjun.Wang
2025-07-04 10:59:07 +08:00
committed by GitHub
parent 88406c17ad
commit f4217e6183
6 changed files with 117 additions and 117 deletions


@@ -2,13 +2,13 @@
 # yapf: disable
 import datetime
+import fnmatch
 import functools
 import io
 import os
 import pickle
 import platform
 import re
-import fnmatch
 import shutil
 import tempfile
 import uuid
@@ -86,6 +86,7 @@ from modelscope.utils.thread_utils import thread_executor
 logger = get_logger()
+
 class HubApi:
     """Model hub api interface.
     """
@@ -313,7 +314,6 @@ class HubApi:
         else:
             raise_for_http_status(r)
-
     def get_endpoint_for_read(self,
                               repo_id: str,
                               *,
@@ -846,7 +846,7 @@ class HubApi:
                         model_id: str,
                         revision: Optional[str] = DEFAULT_MODEL_REVISION,
                         root: Optional[str] = None,
-                        recursive: Optional[str] = False,
+                        recursive: Optional[bool] = False,
                         use_cookies: Union[bool, CookieJar] = False,
                         headers: Optional[dict] = {},
                         endpoint: Optional[str] = None) -> List[dict]:
@@ -856,7 +856,7 @@ class HubApi:
             model_id (str): The model id
             revision (Optional[str], optional): The branch or tag name.
             root (Optional[str], optional): The root path. Defaults to None.
-            recursive (Optional[str], optional): Is recursive list files. Defaults to False.
+            recursive (Optional[bool], optional): Is recursive list files. Defaults to False.
             use_cookies (Union[bool, CookieJar], optional): If is cookieJar, we will use this cookie, if True,
                 will load cookie from local. Defaults to False.
             headers: request headers
@@ -2144,12 +2144,22 @@ class HubApi:
         # List all files in the repo
         if repo_type == REPO_TYPE_MODEL:
-            files = self.get_model_files(repo_id, revision=revision or DEFAULT_MODEL_REVISION, recursive=True, endpoint=endpoint)
+            files = self.get_model_files(
+                repo_id,
+                revision=revision or DEFAULT_MODEL_REVISION,
+                recursive=True,
+                endpoint=endpoint
+            )
             file_list = [f['Path'] for f in files]
         else:
             namespace, dataset_name = repo_id.split('/')
             dataset_hub_id, _ = self.get_dataset_id_and_type(dataset_name, namespace, endpoint=endpoint)
-            dataset_info = self.get_dataset_infos(dataset_hub_id, revision or DEFAULT_DATASET_REVISION, recursive='True', endpoint=endpoint)
+            dataset_info = self.get_dataset_infos(
+                dataset_hub_id,
+                revision or DEFAULT_DATASET_REVISION,
+                recursive='True',
+                endpoint=endpoint
+            )
             files = dataset_info.get('Data', {}).get('Files', [])
             file_list = [f['Path'] for f in files]
@@ -2166,16 +2176,16 @@ class HubApi:
         try:
             if repo_type == REPO_TYPE_MODEL:
                 owner, repo_name = repo_id.split('/')
-                url = f"{endpoint}/api/v1/models/{owner}/{repo_name}/file"
+                url = f'{endpoint}/api/v1/models/{owner}/{repo_name}/file'
                 params = {
-                    "Revision": revision or DEFAULT_MODEL_REVISION,
-                    "FilePath": path
+                    'Revision': revision or DEFAULT_MODEL_REVISION,
+                    'FilePath': path
                 }
             else:
                 owner, dataset_name = repo_id.split('/')
-                url = f"{endpoint}/api/v1/datasets/{owner}/{dataset_name}/repo"
+                url = f'{endpoint}/api/v1/datasets/{owner}/{dataset_name}/repo'
                 params = {
-                    "FilePath": path
+                    'FilePath': path
                 }
             r = self.session.delete(url, params=params, cookies=cookies, headers=headers)
             raise_for_http_status(r)
@@ -2193,7 +2203,6 @@ class HubApi:
         }
-
 class ModelScopeConfig:
     path_credential = expanduser(DEFAULT_CREDENTIALS_PATH)
     COOKIES_FILE_NAME = 'cookies'
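Among the api.py changes above, note that recursive on get_model_files is now annotated Optional[bool], matching how the flag is actually used. A minimal usage sketch based only on the signature shown in this diff; the repo id below is a placeholder, and a reachable default endpoint with valid credentials is assumed:

from modelscope.hub.api import HubApi

api = HubApi()
# Placeholder repo id; substitute any model you can access.
# 'recursive' now takes a real bool instead of a str-typed flag.
files = api.get_model_files('some-namespace/some-model', recursive=True)
print([f['Path'] for f in files])  # entries are dicts carrying at least 'Path'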


@@ -2,4 +2,4 @@
 from .api import McpApi
-__all__ = ['McpApi']
\ No newline at end of file
+__all__ = ['McpApi']


@@ -1,22 +1,24 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-from typing import Optional
 import requests
+from typing import Any, Dict, Optional
 from modelscope.hub.errors import raise_for_http_status
 from .types import McpFilter, validate_mcp_filter, validate_filter_params
+from modelscope.utils.logger import get_logger
+logger = get_logger()
 # MCP API path suffix
-MCP_API_PATH = "/openapi/v1"
+MCP_API_PATH = '/openapi/v1'
 class McpApi:
     """MCP (Model Context Protocol) API interface class"""
     def __init__(self, base_api):
         """
         Initialize MCP API
         Args:
             base_api: HubApi instance for accessing basic API functionality
         """
@@ -27,18 +29,16 @@ class McpApi:
         self.builder_headers = base_api.builder_headers
         self.headers = base_api.headers
-    def get_mcp_servers(
-        self,
-        token: str,
-        filter: dict = None,
-        page_number: int = 1,
-        page_size: int = 20,
-        search: str = "",
-        endpoint: Optional[str] = None
-    ) -> dict:
+    def get_mcp_servers(self,
+                        token: str,
+                        filter: dict = None,
+                        page_number: int = 1,
+                        page_size: int = 20,
+                        search: str = '',
+                        endpoint: Optional[str] = None) -> dict:
         """
         Get MCP server list
         Args:
             token: Authentication token
             filter: Filter condition dictionary containing the following sub-branches:
@@ -50,7 +50,7 @@ class McpApi:
             page_size: Page size, defaults to 20
             search: Search keyword, defaults to empty string
             endpoint: API endpoint, defaults to MCP-specific endpoint (inherited from HubApi + /openapi/v1)
         Returns:
             dict: Dictionary containing MCP server list
             - mcp_server_list: Detailed MCP server list
@@ -59,51 +59,50 @@ class McpApi:
         """
         if not endpoint:
             endpoint = self.endpoint
-        url = f"{endpoint}/mcp/servers"
+        url = f'{endpoint}/mcp/servers'
         headers = self.builder_headers(self.headers)
-        headers["Authorization"] = f"Bearer {token}"
+        headers['Authorization'] = f'Bearer {token}'
         body = {
-            "filter": filter or {},
-            "page_number": page_number,
-            "page_size": page_size,
-            "search": search
+            'filter': filter or {},
+            'page_number': page_number,
+            'page_size': page_size,
+            'search': search
         }
         r = self.session.put(url, headers=headers, json=body)
         raise_for_http_status(r)
         try:
             resp = r.json()
         except requests.exceptions.JSONDecodeError:
-            print("JSON parsing failed")
-            print("Response content:", r.text)
+            logger.error(
+                f'Failed to parse JSON response from MCP server list API: {r.text}'
+            )
             raise
-        data = resp.get("data", {})
-        mcp_server_list = data.get("mcp_server_list", [])
-        server_brief_list = [
-            {"name": item.get("name", ""), "description": item.get("description", "")}
-            for item in mcp_server_list
-        ]
+        data = resp.get('data', {})
+        mcp_server_list = data.get('mcp_server_list', [])
+        server_brief_list = [{
+            'name': item.get('name', ''),
+            'description': item.get('description', '')
+        } for item in mcp_server_list]
         return {
-            "mcp_server_list": mcp_server_list,
-            "total_count": data.get("total_count", 0),
-            "server_brief_list": server_brief_list
+            'mcp_server_list': mcp_server_list,
+            'total_count': data.get('total_count', 0),
+            'server_brief_list': server_brief_list
         }
-    def get_mcp_server_operational(
-        self,
-        token: str,
-        endpoint: Optional[str] = None
-    ) -> dict:
+    def get_mcp_server_operational(self,
+                                   token: str,
+                                   endpoint: Optional[str] = None) -> dict:
         """
         Get user-hosted MCP server list
         Args:
             token: Authentication token
             endpoint: API endpoint, defaults to MCP-specific endpoint (inherited from HubApi + /openapi/v1)
         Returns:
             dict: Dictionary containing MCP server list
             - mcp_server_list: Detailed MCP server list
@@ -112,58 +111,58 @@ class McpApi:
         """
         if not endpoint:
             endpoint = self.endpoint
-        url = f"{endpoint}/mcp/servers/operational"
+        url = f'{endpoint}/mcp/servers/operational'
         headers = self.builder_headers(self.headers)
-        headers["Authorization"] = f"Bearer {token}"
+        headers['Authorization'] = f'Bearer {token}'
         r = self.session.get(url, headers=headers)
         raise_for_http_status(r)
-        print(r.status_code)
         try:
             resp = r.json()
         except requests.exceptions.JSONDecodeError:
-            print("JSON parsing failed")
-            print("Response content:", r.text)
+            logger.error(
+                f'Failed to parse JSON response from MCP server operational API: {r.text}'
+            )
             raise
-        data = resp.get("data", {})
-        mcp_server_list = data.get("mcp_server_list", [])
-        server_brief_list = [
-            {"name": item.get("name", ""), "description": item.get("description", "")}
-            for item in mcp_server_list
-        ]
+        data = resp.get('data', {})
+        mcp_server_list = data.get('mcp_server_list', [])
+        server_brief_list = [{
+            'name': item.get('name', ''),
+            'description': item.get('description', '')
+        } for item in mcp_server_list]
         return {
-            "mcp_server_list": mcp_server_list,
-            "total_count": data.get("total_count", 0),
-            "server_brief_list": server_brief_list
+            'mcp_server_list': mcp_server_list,
+            'total_count': data.get('total_count', 0),
+            'server_brief_list': server_brief_list
         }
-    def get_mcp_server_special(
-        self,
-        server_id: str,
-        token: str,
-        get_operational_url: bool = False,
-        endpoint: Optional[str] = None
-    ) -> dict:
+    def get_mcp_server_special(self,
+                               server_id: str,
+                               token: str,
+                               get_operational_url: bool = False,
+                               endpoint: Optional[str] = None) -> dict:
         """
         Get specific MCP server details
         Args:
             server_id: Server ID
             token: Authentication token
             get_operational_url: Whether to get operational URL, defaults to False
             endpoint: API endpoint, defaults to MCP-specific endpoint (inherited from HubApi + /openapi/v1)
         Returns:
             dict: Dictionary containing MCP server details
         """
         if not endpoint:
             endpoint = self.endpoint
-        url = f"{endpoint}/mcp/servers/{server_id}"
+        url = f'{endpoint}/mcp/servers/{server_id}'
         headers = self.builder_headers(self.headers)
-        headers["Authorization"] = f"Bearer {token}"
-        params = {"get_operational_url": str(get_operational_url).lower()} if get_operational_url else {}
+        headers['Authorization'] = f'Bearer {token}'
+        params = {
+            'get_operational_url': str(get_operational_url).lower()
+        } if get_operational_url else {}
         r = self.session.get(url, headers=headers, params=params)
         raise_for_http_status(r)
@@ -171,7 +170,8 @@ class McpApi:
         try:
             resp = r.json()
         except requests.exceptions.JSONDecodeError:
-            print("JSON parsing failed")
-            print("Response content:", r.text)
+            logger.error(
+                f'Failed to parse JSON response from MCP server special API: {r.text}'
+            )
             raise
-        return resp.get("data", {})
+        return resp.get('data', {})
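The lint pass above normalizes string quotes, reflows the long signatures into hanging-indent style, and routes JSON-decode failures through the module logger instead of bare print calls. A small usage sketch of the listing API after this change; the McpApi import path is an assumption inferred from the 'from .api import McpApi' re-export shown earlier, and the token is a placeholder:

from modelscope.hub.api import HubApi
# Assumed import location; adjust to wherever the package re-exports McpApi.
from modelscope.hub.mcp.api import McpApi

mcp = McpApi(HubApi())
result = mcp.get_mcp_servers(token='<sdk-token>', page_size=5)
print(result['total_count'])
for brief in result['server_brief_list']:  # name/description pairs built above
    print(brief['name'], '-', brief['description'])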


@@ -3,6 +3,7 @@
 # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
 import importlib
 import contextlib
+import inspect
 import os
 import warnings
 from functools import partial
@@ -17,9 +18,9 @@ from datasets import (BuilderConfig, Dataset, DatasetBuilder, DatasetDict,
                       IterableDataset, IterableDatasetDict, Split,
                       VerificationMode, Version, config, data_files)
 from datasets.data_files import (
-    FILES_TO_IGNORE, DataFilesDict, DataFilesList, EmptyDatasetError,
+    FILES_TO_IGNORE, DataFilesDict, EmptyDatasetError,
     _get_data_files_patterns, _is_inside_unrequested_special_dir,
-    _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir, get_metadata_patterns, sanitize_patterns)
+    _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir, sanitize_patterns)
 from datasets.download.streaming_download_manager import (
     _prepare_path_and_storage_options, xbasename, xjoin)
 from datasets.exceptions import DataFilesNotFoundError, DatasetNotFoundError
@@ -37,7 +38,6 @@ from datasets.load import (
     init_dynamic_modules)
 from datasets.naming import camelcase_to_snakecase
 from datasets.packaged_modules import (_EXTENSION_TO_MODULE,
-                                       _MODULE_SUPPORTS_METADATA,
                                        _MODULE_TO_EXTENSIONS,
                                        _PACKAGED_DATASETS_MODULES)
 from datasets.utils import file_utils
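_MODULE_SUPPORTS_METADATA is no longer exported by datasets.packaged_modules in recent datasets releases, which is why the import is dropped here and the set is inlined in the hunk below. For code that must straddle both old and new datasets versions, a guarded import along these lines would also work (the fallback mirrors the {'imagefolder', 'audiofolder'} set hardcoded in this commit; the exact release that removed the constant is not verified here):

# Feature-detect a private constant that newer datasets releases removed.
try:
    from datasets.packaged_modules import _MODULE_SUPPORTS_METADATA
except ImportError:
    # Fallback for datasets versions without the constant (assumed set).
    _MODULE_SUPPORTS_METADATA = {'imagefolder', 'audiofolder'}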
@@ -625,38 +625,29 @@ def get_module_without_script(self) -> DatasetModule:
         path=self.name,
         download_config=self.download_config,
     )
-    data_files = data_files.filter_extensions(
-        _MODULE_TO_EXTENSIONS[module_name])
-    # Collect metadata files if the module supports them
-    supports_metadata = module_name in _MODULE_SUPPORTS_METADATA
-    if self.data_files is None and supports_metadata:
-        try:
-            metadata_patterns = get_metadata_patterns(
-                base_path, download_config=self.download_config)
-        except FileNotFoundError:
-            metadata_patterns = None
-        if metadata_patterns is not None:
-            metadata_data_files_list = DataFilesList.from_patterns(
-                metadata_patterns,
-                download_config=self.download_config,
-                base_path=base_path)
-            if metadata_data_files_list:
-                data_files = DataFilesDict({
-                    split: data_files_list + metadata_data_files_list
-                    for split, data_files_list in data_files.items()
-                })
+    if hasattr(data_files, 'filter'):
+        data_files = data_files.filter(extensions=_MODULE_TO_EXTENSIONS[module_name])
+    else:
+        data_files = data_files.filter_extensions(_MODULE_TO_EXTENSIONS[module_name])
     module_path, _ = _PACKAGED_DATASETS_MODULES[module_name]
     if metadata_configs:
-        builder_configs, default_config_name = create_builder_configs_from_metadata_configs(
-            module_path,
-            metadata_configs,
-            base_path=base_path,
-            supports_metadata=supports_metadata,
-            default_builder_kwargs=default_builder_kwargs,
-            download_config=self.download_config,
-        )
+        supports_metadata = module_name in {'imagefolder', 'audiofolder'}
+        create_builder_signature = inspect.signature(create_builder_configs_from_metadata_configs)
+        in_args = {
+            'module_path': module_path,
+            'metadata_configs': metadata_configs,
+            'base_path': base_path,
+            'default_builder_kwargs': default_builder_kwargs,
+            'download_config': self.download_config,
+        }
+        if 'supports_metadata' in create_builder_signature.parameters:
+            in_args['supports_metadata'] = supports_metadata
+        builder_configs, default_config_name = create_builder_configs_from_metadata_configs(**in_args)
     else:
         builder_configs: List[BuilderConfig] = [
             import_main_class(module_path).BUILDER_CONFIG_CLASS(
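The replacement block probes create_builder_configs_from_metadata_configs at call time and forwards supports_metadata only when the installed datasets version still accepts that keyword. The same pattern in isolation, as a self-contained sketch (both callee functions here are toy stand-ins, not modelscope or datasets APIs):

import inspect

def call_with_supported_kwargs(func, **kwargs):
    # Forward only the keyword arguments the callee's signature accepts,
    # so parameters removed in newer library versions degrade gracefully.
    accepted = inspect.signature(func).parameters
    return func(**{k: v for k, v in kwargs.items() if k in accepted})

def old_style(path, supports_metadata=False):  # legacy signature
    return path, supports_metadata

def new_style(path):  # the keyword was removed
    return (path,)

print(call_with_supported_kwargs(old_style, path='x', supports_metadata=True))
print(call_with_supported_kwargs(new_style, path='x', supports_metadata=True))

Note the filter is conservative: a callee that takes **kwargs of its own would have its extra arguments dropped as well.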


@@ -1,6 +1,6 @@
 addict
 attrs
-datasets>=3.0.0,<=3.2.0
+datasets>=3.0.0,<=3.6.0
 einops
 oss2
 Pillow


@@ -1,6 +1,6 @@
 addict
 attrs
-datasets>=3.0.0,<=3.2.0
+datasets>=3.0.0,<=3.6.0
 einops
 Pillow
 python-dateutil>=2.1
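Both requirements files now accept any datasets release from 3.0.0 through 3.6.0 inclusive. A quick post-install sanity check, assuming the packaging distribution is importable (it ships as a dependency of datasets itself):

import datasets
from packaging.version import Version

# Verify the installed datasets falls inside the range pinned above.
v = Version(datasets.__version__)
assert Version('3.0.0') <= v <= Version('3.6.0'), f'unsupported datasets {v}'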