fix datasets dependency for snapshot download (#1581)

This commit is contained in:
Xingjun.Wang
2025-12-23 17:26:16 +08:00
committed by GitHub
parent a7f50d4516
commit e4aace1500
5 changed files with 15 additions and 6 deletions

View File

@@ -82,7 +82,8 @@ from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
DownloadChannel, DownloadMode,
Frameworks, ModelFile, Tasks,
VirgoDatasetConfig)
from modelscope.utils.file_utils import get_file_hash, get_file_size
from modelscope.utils.file_utils import (get_file_hash, get_file_size,
is_relative_path)
from modelscope.utils.logger import get_logger
from modelscope.utils.repo_utils import (DATASET_LFS_SUFFIX,
DEFAULT_IGNORE_PATTERNS,
@@ -1461,7 +1462,6 @@ class HubApi:
>>> for commit in commit_history.commits:
... print(f"{commit.short_id}: {commit.title}")
"""
from datasets.utils.file_utils import is_relative_path
if is_relative_path(repo_id) and repo_id.count('/') == 1:
_owner, _dataset_name = repo_id.split('/')
@@ -1520,7 +1520,6 @@ class HubApi:
List: The response containing the dataset repository tree information.
e.g. [{'CommitId': None, 'CommitMessage': '...', 'Size': 0, 'Type': 'tree'}, ...]
"""
from datasets.utils.file_utils import is_relative_path
if is_relative_path(repo_id) and repo_id.count('/') == 1:
_owner, _dataset_name = repo_id.split('/')

View File

@@ -3,10 +3,11 @@
from datasets.download.download_manager import DownloadManager
from datasets.download.streaming_download_manager import \
StreamingDownloadManager
from datasets.utils.file_utils import cached_path, is_relative_path
from datasets.utils.file_utils import cached_path
from modelscope.msdatasets.download.download_config import DataDownloadConfig
from modelscope.msdatasets.utils.oss_utils import OssUtilities
from modelscope.utils.file_utils import is_relative_path
class DataDownloadManager(DownloadManager):

View File

@@ -9,7 +9,6 @@ import numpy as np
from datasets import (Dataset, DatasetDict, Features, IterableDataset,
IterableDatasetDict)
from datasets.packaged_modules import _PACKAGED_DATASETS_MODULES
from datasets.utils.file_utils import is_relative_path
from modelscope.hub.repository import DatasetRepository
from modelscope.msdatasets.context.dataset_context_config import \
@@ -32,6 +31,7 @@ from modelscope.utils.constant import (DEFAULT_DATASET_NAMESPACE,
REPO_TYPE_DATASET, ConfigFields,
DatasetFormations, DownloadMode, Hubs,
ModeKeys, Tasks, UploadMode)
from modelscope.utils.file_utils import is_relative_path
from modelscope.utils.import_utils import is_tf_available, is_torch_available
from modelscope.utils.logger import get_logger

View File

@@ -46,7 +46,6 @@ from datasets.packaged_modules import (_EXTENSION_TO_MODULE,
from datasets.utils import file_utils
from datasets.utils.file_utils import (_raise_if_offline_mode_is_enabled,
cached_path, is_local_path,
is_relative_path,
relative_to_absolute_path)
from datasets.utils.info_utils import is_small_dataset
from datasets.utils.metadata import MetadataConfigs
@@ -68,6 +67,7 @@ from modelscope.msdatasets.utils.hf_file_utils import get_from_cache_ms
from modelscope.utils.config_ds import MS_DATASETS_CACHE
from modelscope.utils.constant import DEFAULT_DATASET_REVISION, REPO_TYPE_DATASET
from modelscope.utils.import_utils import has_attr_in_class
from modelscope.utils.file_utils import is_relative_path
from modelscope.utils.logger import get_logger
logger = get_logger()

View File

@@ -6,6 +6,7 @@ import os
from pathlib import Path
from shutil import Error, copy2, copystat
from typing import BinaryIO, Optional, Union
from urllib.parse import urlparse
# TODO: remove this api, unify to flattened args
@@ -274,3 +275,11 @@ def get_file_hash(
'chunk_nums': len(chunk_hash_list),
'chunk_hash_list': chunk_hash_list,
}
def is_relative_path(url_or_filename: Union[str, os.PathLike]) -> bool:
    """
    Check if a given string or path-like object is a relative path.

    The value counts as relative when it has no URL scheme (per
    ``urllib.parse.urlparse``) and ``os.path.isabs`` does not report it
    as absolute.

    Args:
        url_or_filename: A URL, a filesystem path string, or an
            ``os.PathLike`` object such as ``pathlib.Path``.

    Returns:
        bool: True if the value has an empty URL scheme and is not an
        absolute filesystem path, False otherwise.
    """
    # urlparse only understands str/bytes; convert path-like objects
    # (e.g. pathlib.Path) to their filesystem representation first.
    if isinstance(url_or_filename, os.PathLike):
        url_or_filename = os.fspath(url_or_filename)

    # Anything carrying a scheme (http://, oss://, ...) is a URL, not a path.
    has_scheme = urlparse(url_or_filename).scheme != ''
    if has_scheme:
        return False
    return not os.path.isabs(url_or_filename)