fix streaming for youku-mplug and adopt latest datasets

This commit is contained in:
xingjun.wang
2024-07-20 18:59:19 +08:00
parent ef5f7d6543
commit b4b7e29b28
5 changed files with 52 additions and 14 deletions

View File

@@ -1,2 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .ms_dataset import MsDataset
from modelscope.msdatasets.ms_dataset import MsDataset

View File

@@ -149,6 +149,7 @@ class NativeIterableDataset(IterableDataset):
# Normalise a single downloaded path to a one-element list so the value
# stored under the ':FILE' key always has the same shape.
if isinstance(ex_cache_path, str):
    ex_cache_path = [ex_cache_path]
ret[k] = ex_cache_path
# Keep the original value under the key without its ':FILE' marker.
# `str.strip(':FILE')` would remove any of the characters ':', 'F', 'I',
# 'L', 'E' from BOTH ends (e.g. 'IMAGE:FILE' -> 'MAG'), so the suffix is
# removed explicitly instead.
file_suffix = ':FILE'
plain_key = k[:-len(file_suffix)] if k.endswith(file_suffix) else k
ret[plain_key] = v
except Exception as e:
logger.error(e)

View File

@@ -6,16 +6,15 @@ from datasets.download.download_config import DownloadConfig
class DataDownloadConfig(DownloadConfig):
"""
Extends `DownloadConfig` with additional attributes for data download.
"""
# NOTE(review): this hunk lists the same eight attribute names twice, once
# assigned on `self` inside `__init__` and once as class-level annotated
# assignments further down. Presumably one form replaces the other in this
# commit; confirm against the real file before relying on either.
# The `__init__` shown here takes no arguments and does not call
# `super().__init__()`.
def __init__(self):
self.dataset_name: Optional[str] = None
self.namespace: Optional[str] = None
self.version: Optional[str] = None
self.split: Optional[Union[str, list]] = None
self.data_dir: Optional[str] = None
self.oss_config: Optional[dict] = {}
self.meta_args_map: Optional[dict] = {}
self.num_proc: int = 4
# Returns this very instance, not a new object.
def copy(self) -> 'DataDownloadConfig':
return self
# Class-level form of the same attributes, each with its default value.
dataset_name: Optional[str] = None
namespace: Optional[str] = None
version: Optional[str] = None
split: Optional[Union[str, list]] = None
data_dir: Optional[str] = None
# NOTE(review): if the two `{}` defaults below are plain class attributes,
# each is a single dict object visible through every instance that does not
# rebind the name; confirm that no caller mutates them in place.
oss_config: Optional[dict] = {}
meta_args_map: Optional[dict] = {}
num_proc: int = 4

View File

@@ -36,6 +36,26 @@ class DataDownloadManager(DownloadManager):
return cached_path(
url_or_filename, download_config=download_config)
def _download_single(self, url_or_filename: str,
                     download_config: DataDownloadConfig) -> str:
    """Download one file and return what the chosen downloader returns.

    Note: `_download_single` is adapted to the datasets>=2.19.0.

    Args:
        url_or_filename: Location of the file; converted with ``str()``
            before use.
        download_config: Provides ``oss_config``, ``dataset_name``,
            ``namespace`` and ``version`` for the OSS helper and is
            forwarded to whichever downloader is used.

    Returns:
        The result of ``OssUtilities.download`` when the location is a
        relative path, otherwise the result of ``cached_path``.
    """
    target = str(url_or_filename)

    # The OSS helper is created before the path check, so it is built for
    # both kinds of location.
    oss_client = OssUtilities(
        oss_config=download_config.oss_config,
        dataset_name=download_config.dataset_name,
        namespace=download_config.namespace,
        revision=download_config.version)

    if not is_relative_path(target):
        return cached_path(target, download_config=download_config)

    # Relative paths are fetched from OSS.
    return oss_client.download(target, download_config=download_config)
class DataStreamingDownloadManager(StreamingDownloadManager):
"""The data streaming download manager."""
@@ -62,3 +82,21 @@ class DataStreamingDownloadManager(StreamingDownloadManager):
else:
return cached_path(
url_or_filename, download_config=self.download_config)
def _download_single(self, url_or_filename: str) -> str:
    """Download one file using the manager's own download configuration.

    Note: `_download_single` is adapted to the datasets>=2.19.0.

    Args:
        url_or_filename: Location of the file; converted with ``str()``
            before use.

    Returns:
        The result of ``OssUtilities.download`` when the location is a
        relative path, otherwise the result of ``cached_path``.
    """
    target = str(url_or_filename)
    config = self.download_config

    # The OSS helper is created before the path check, so it is built for
    # both kinds of location.
    oss_client = OssUtilities(
        oss_config=config.oss_config,
        dataset_name=config.dataset_name,
        namespace=config.namespace,
        revision=config.version)

    if not is_relative_path(target):
        return cached_path(target, download_config=self.download_config)

    # Relative paths are fetched from OSS.
    return oss_client.download(target, download_config=self.download_config)

View File

@@ -1,6 +1,6 @@
addict
attrs
datasets>=2.16.0,<2.19.0
datasets>=2.19.0,<2.21.0
einops
oss2
python-dateutil>=2.1