mirror of
https://github.com/modelscope/modelscope.git
synced 2026-02-24 20:19:51 +01:00
fix streaming for youku-mplug and adopt latest datasets
This commit is contained in:
@@ -1,2 +1,2 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from .ms_dataset import MsDataset
|
||||
from modelscope.msdatasets.ms_dataset import MsDataset
|
||||
|
||||
@@ -149,6 +149,7 @@ class NativeIterableDataset(IterableDataset):
|
||||
if isinstance(ex_cache_path, str):
|
||||
ex_cache_path = [ex_cache_path]
|
||||
ret[k] = ex_cache_path
|
||||
ret[k.strip(':FILE')] = v
|
||||
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
|
||||
@@ -6,16 +6,15 @@ from datasets.download.download_config import DownloadConfig
|
||||
|
||||
|
||||
class DataDownloadConfig(DownloadConfig):
|
||||
"""
|
||||
Extends `DownloadConfig` with additional attributes for data download.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.dataset_name: Optional[str] = None
|
||||
self.namespace: Optional[str] = None
|
||||
self.version: Optional[str] = None
|
||||
self.split: Optional[Union[str, list]] = None
|
||||
self.data_dir: Optional[str] = None
|
||||
self.oss_config: Optional[dict] = {}
|
||||
self.meta_args_map: Optional[dict] = {}
|
||||
self.num_proc: int = 4
|
||||
|
||||
def copy(self) -> 'DataDownloadConfig':
|
||||
return self
|
||||
dataset_name: Optional[str] = None
|
||||
namespace: Optional[str] = None
|
||||
version: Optional[str] = None
|
||||
split: Optional[Union[str, list]] = None
|
||||
data_dir: Optional[str] = None
|
||||
oss_config: Optional[dict] = {}
|
||||
meta_args_map: Optional[dict] = {}
|
||||
num_proc: int = 4
|
||||
|
||||
@@ -36,6 +36,26 @@ class DataDownloadManager(DownloadManager):
|
||||
return cached_path(
|
||||
url_or_filename, download_config=download_config)
|
||||
|
||||
def _download_single(self, url_or_filename: str,
                     download_config: DataDownloadConfig) -> str:
    """Download a single file and return the resulting path.

    Note: `_download_single` is the hook adapted to datasets>=2.19.0.

    Args:
        url_or_filename: Location of the file; it is coerced to `str`
            before use.
        download_config: Supplies `oss_config`, `dataset_name`,
            `namespace` and `version` for the OSS helper, and is also
            forwarded to the underlying download call.

    Returns:
        The value returned by `OssUtilities.download` when the path is
        relative, otherwise the value returned by `cached_path`.
    """
    target = str(url_or_filename)

    # The OSS helper is created before the path check, regardless of
    # which branch is taken afterwards.
    oss_client = OssUtilities(
        oss_config=download_config.oss_config,
        dataset_name=download_config.dataset_name,
        namespace=download_config.namespace,
        revision=download_config.version)

    if not is_relative_path(target):
        return cached_path(target, download_config=download_config)

    # Relative paths are fetched as OSS files.
    return oss_client.download(target, download_config=download_config)
|
||||
|
||||
|
||||
class DataStreamingDownloadManager(StreamingDownloadManager):
|
||||
"""The data streaming download manager."""
|
||||
@@ -62,3 +82,21 @@ class DataStreamingDownloadManager(StreamingDownloadManager):
|
||||
else:
|
||||
return cached_path(
|
||||
url_or_filename, download_config=self.download_config)
|
||||
|
||||
def _download_single(self, url_or_filename: str) -> str:
    """Download a single file using this manager's own download config.

    Note: `_download_single` is the hook adapted to datasets>=2.19.0.

    Args:
        url_or_filename: Location of the file; it is coerced to `str`
            before use.

    Returns:
        The value returned by `OssUtilities.download` when the path is
        relative, otherwise the value returned by `cached_path`.
    """
    target = str(url_or_filename)
    config = self.download_config

    # The OSS helper is created before the path check, regardless of
    # which branch is taken afterwards.
    oss_client = OssUtilities(
        oss_config=config.oss_config,
        dataset_name=config.dataset_name,
        namespace=config.namespace,
        revision=config.version)

    if not is_relative_path(target):
        return cached_path(target, download_config=config)

    # Relative paths are fetched as OSS files.
    return oss_client.download(target, download_config=config)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
addict
|
||||
attrs
|
||||
datasets>=2.16.0,<2.19.0
|
||||
datasets>=2.19.0,<2.21.0
|
||||
einops
|
||||
oss2
|
||||
python-dateutil>=2.1
|
||||
|
||||
Reference in New Issue
Block a user