[to #42794773] rename pydataset to msdataset
Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9165402
@@ -1,7 +1,7 @@
-modelscope.pydatasets package
+modelscope.msdatasets package
 =============================

-.. automodule:: modelscope.pydatasets
+.. automodule:: modelscope.msdatasets
    :members:
    :undoc-members:
    :show-inheritance:

@@ -9,10 +9,10 @@ modelscope.pydatasets package
 Submodules
 ----------

-modelscope.pydatasets.py\_dataset module
+modelscope.msdatasets.ms\_dataset module
 ----------------------------------------

-.. automodule:: modelscope.pydatasets.py_dataset
+.. automodule:: modelscope.msdatasets.ms_dataset
    :members:
    :undoc-members:
    :show-inheritance:

@@ -16,7 +16,7 @@ Subpackages
    modelscope.models
    modelscope.pipelines
    modelscope.preprocessors
-   modelscope.pydatasets
+   modelscope.msdatasets
    modelscope.trainers
    modelscope.utils

@@ -3,7 +3,7 @@
 ## Python environment setup

 First, install and configure Anaconda following the [documentation](https://docs.anaconda.com/anaconda/install/).

-After installation, run the following commands to create a Python environment for the maas library.
+After installation, run the following commands to create a Python environment for the modelscope library.
 ```shell
 conda create -n modelscope python=3.6
 conda activate modelscope

@@ -105,15 +105,15 @@ import cv2
 import os.path as osp
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
-from modelscope.pydatasets import PyDataset
+from modelscope.msdatasets import MsDataset

-# Build a PyDataset from image URLs; a local folder also works via input_location = '/dir/to/images'
+# Build an MsDataset from image URLs; a local folder also works via input_location = '/dir/to/images'
 input_location = [
     'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
 ]
-dataset = PyDataset.load(input_location, target='image')
+dataset = MsDataset.load(input_location, target='image')
 img_matting = pipeline(Tasks.image_matting, model='damo/image-matting-person')
-# When the input is a PyDataset, the output is an iterator
+# When the input is an MsDataset, the output is an iterator
 result = img_matting(dataset)
 cv2.imwrite('result.png', next(result)['output_png'])
 print(f'Output written to {osp.abspath("result.png")}')

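Because an MsDataset input makes the pipeline return a generator, the quickstart result can also be drained in a loop rather than via `next()`. A minimal sketch extending the snippet above; output file names are illustrative, not part of the original:

```python
# Iterate the generator produced by img_matting(dataset); each item exposes
# the same 'output_png' payload used in the quickstart above.
for i, item in enumerate(img_matting(dataset)):
    cv2.imwrite(f'result_{i}.png', item['output_png'])
```
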
@@ -187,7 +187,7 @@ def get_file_download_url(model_id: str, file_path: str, revision: str):
     """
     Format file download url according to `model_id`, `revision` and `file_path`.
     e.g., Given `model_id=john/bert`, `revision=master`, `file_path=README.md`,
-    the resulted download url is: https://maas.co/api/v1/models/john/bert/repo?Revision=master&FilePath=README.md
+    the resulted download url is: https://modelscope.co/api/v1/models/john/bert/repo?Revision=master&FilePath=README.md
     """
     download_url_template = '{endpoint}/api/v1/models/{model_id}/repo?Revision={revision}&FilePath={file_path}'
     return download_url_template.format(

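The docstring example can be checked against the template from this hunk directly; the endpoint value below is an assumption used only for illustration:

```python
# Reproduce the docstring example with the template shown in this hunk.
# 'https://modelscope.co' stands in for the configured endpoint.
download_url_template = ('{endpoint}/api/v1/models/{model_id}/repo'
                         '?Revision={revision}&FilePath={file_path}')
url = download_url_template.format(
    endpoint='https://modelscope.co',
    model_id='john/bert',
    revision='master',
    file_path='README.md')
assert url == ('https://modelscope.co/api/v1/models/john/bert/repo'
               '?Revision=master&FilePath=README.md')
```
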
modelscope/msdatasets/__init__.py (new file)
@@ -0,0 +1 @@
+from .ms_dataset import MsDataset

@@ -10,8 +10,8 @@ from datasets.packaged_modules import _PACKAGED_DATASETS_MODULES
 from datasets.utils.file_utils import (is_relative_path,
                                        relative_to_absolute_path)

-from modelscope.pydatasets.config import MS_DATASETS_CACHE
-from modelscope.pydatasets.utils.ms_api import MsApi
+from modelscope.msdatasets.config import MS_DATASETS_CACHE
+from modelscope.msdatasets.utils.ms_api import MsApi
 from modelscope.utils.constant import Hubs
 from modelscope.utils.logger import get_logger

@@ -28,9 +28,9 @@ def format_list(para) -> List:
     return para


-class PyDataset:
+class MsDataset:
     _hf_ds = None  # holds the underlying HuggingFace Dataset
-    """A PyDataset backed by hugging face Dataset."""
+    """A MsDataset backed by hugging face Dataset."""

     def __init__(self, hf_ds: Dataset, target: Optional[str] = None):
         self._hf_ds = hf_ds

@@ -49,7 +49,7 @@ class PyDataset:
     @classmethod
     def from_hf_dataset(cls,
                         hf_ds: Dataset,
-                        target: str = None) -> Union[dict, 'PyDataset']:
+                        target: str = None) -> Union[dict, 'MsDataset']:
         if isinstance(hf_ds, Dataset):
             return cls(hf_ds, target)
         if len(hf_ds.keys()) == 1:

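A usage sketch for the renamed classmethod, mirroring the tests further down; it assumes the `datasets` package is installed and the Hugging Face Hub is reachable:

```python
import datasets as hfdata

from modelscope.msdatasets import MsDataset

# Wrap an existing Hugging Face Dataset in an MsDataset, exactly as the
# test suite below does for the 'beans' train split.
hf_train = hfdata.load_dataset('beans', split='train')
ms_train = MsDataset.from_hf_dataset(hf_train)
```
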
@@ -68,8 +68,8 @@ class PyDataset:
              data_files: Optional[Union[str, Sequence[str],
                                         Mapping[str, Union[str,
                                                            Sequence[str]]]]] = None
-             ) -> Union[dict, 'PyDataset']:
-        """Load a PyDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
+             ) -> Union[dict, 'MsDataset']:
+        """Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
         Args:

             dataset_name (str): Path or name of the dataset.

@@ -82,7 +82,7 @@ class PyDataset:
             hub (Hubs, optional): When loading from a remote hub, where it is from

         Returns:
-            PyDataset (obj:`PyDataset`): PyDataset object for a certain dataset.
+            MsDataset (obj:`MsDataset`): MsDataset object for a certain dataset.
         """
         if hub == Hubs.huggingface:
             dataset = hf_load_dataset(

@@ -92,9 +92,9 @@ class PyDataset:
                 split=split,
                 data_dir=data_dir,
                 data_files=data_files)
-            return PyDataset.from_hf_dataset(dataset, target=target)
+            return MsDataset.from_hf_dataset(dataset, target=target)
         else:
-            return PyDataset._load_ms_dataset(
+            return MsDataset._load_ms_dataset(
                 dataset_name,
                 target=target,
                 subset_name=subset_name,

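A hedged sketch of how the renamed load() dispatches, grounded in the branch above and the tests below: Hubs.huggingface routes through hf_load_dataset, any other hub value falls through to _load_ms_dataset:

```python
from modelscope.msdatasets import MsDataset
from modelscope.utils.constant import Hubs

# Hugging Face branch: loaded via hf_load_dataset, wrapped by from_hf_dataset.
ds_hf = MsDataset.load(
    'glue', subset_name='sst2', split='train', hub=Hubs.huggingface)

# ModelScope branch: call taken verbatim from the test suite below.
ds_ms = MsDataset.load(
    'squad', split='train', target='context', hub=Hubs.modelscope)
```
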
@@ -114,7 +114,7 @@ class PyDataset:
                          data_files: Optional[Union[str, Sequence[str],
                                                     Mapping[str, Union[str,
                                                                        Sequence[str]]]]] = None
-                         ) -> Union[dict, 'PyDataset']:
+                         ) -> Union[dict, 'MsDataset']:
         if isinstance(dataset_name, str):
             use_hf = False
             if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \

@@ -153,7 +153,7 @@ class PyDataset:
         else:
             raise TypeError('path must be a str or a list, but got'
                             f' {type(dataset_name)}')
-        return PyDataset.from_hf_dataset(dataset, target=target)
+        return MsDataset.from_hf_dataset(dataset, target=target)

     def to_torch_dataset_with_processors(
             self,

@@ -4,7 +4,7 @@ from typing import Optional

 import requests

-from modelscope.pydatasets.config import (DOWNLOADED_DATASETS_PATH,
+from modelscope.msdatasets.config import (DOWNLOADED_DATASETS_PATH,
                                           MS_HUB_ENDPOINT)
 from modelscope.utils.logger import get_logger

@@ -6,15 +6,15 @@ from typing import Any, Dict, Generator, List, Union

 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models.base import Model
+from modelscope.msdatasets import MsDataset
 from modelscope.preprocessors import Preprocessor
-from modelscope.pydatasets import PyDataset
 from modelscope.utils.config import Config
 from modelscope.utils.logger import get_logger
 from .outputs import TASK_OUTPUTS
 from .util import is_model, is_official_hub_path

 Tensor = Union['torch.Tensor', 'tf.Tensor']
-Input = Union[str, tuple, PyDataset, 'PIL.Image.Image', 'numpy.ndarray']
+Input = Union[str, tuple, MsDataset, 'PIL.Image.Image', 'numpy.ndarray']
 InputModel = Union[str, Model]

 output_keys = [

@@ -85,7 +85,7 @@ class Pipeline(ABC):
             for ele in input:
                 output.append(self._process_single(ele, *args, **post_kwargs))

-        elif isinstance(input, PyDataset):
+        elif isinstance(input, MsDataset):
             return self._process_iterator(input, *args, **post_kwargs)

         else:

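The dispatch above means the container type decides the return shape: list inputs appear to be processed eagerly element by element, while an MsDataset is handed to _process_iterator. A sketch assuming `img_matting` was built as in the quickstart and `url_a`/`url_b` are image URLs:

```python
# List input: each element processed one by one, results collected eagerly.
results = img_matting([url_a, url_b])

# MsDataset input: a generator comes back and is consumed lazily.
stream = img_matting(MsDataset.load([url_a], target='image'))
first = next(stream)
```
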
modelscope/pydatasets/__init__.py (deleted)
@@ -1 +0,0 @@
-from .py_dataset import PyDataset

@@ -3,10 +3,9 @@ import unittest
 import datasets as hfdata

 from modelscope.models import Model
+from modelscope.msdatasets import MsDataset
 from modelscope.preprocessors import SequenceClassificationPreprocessor
 from modelscope.preprocessors.base import Preprocessor
-from modelscope.pydatasets import PyDataset
 from modelscope.utils.constant import Hubs
 from modelscope.utils.test_utils import require_tf, require_torch, test_level

@@ -31,15 +30,15 @@ class ImgPreprocessor(Preprocessor):
         }


-class PyDatasetTest(unittest.TestCase):
+class MsDatasetTest(unittest.TestCase):

     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_ds_basic(self):
-        ms_ds_full = PyDataset.load('squad')
+        ms_ds_full = MsDataset.load('squad')
         ms_ds_full_hf = hfdata.load_dataset('squad')
-        ms_ds_train = PyDataset.load('squad', split='train')
+        ms_ds_train = MsDataset.load('squad', split='train')
         ms_ds_train_hf = hfdata.load_dataset('squad', split='train')
-        ms_image_train = PyDataset.from_hf_dataset(
+        ms_image_train = MsDataset.from_hf_dataset(
             hfdata.load_dataset('beans', split='train'))
         self.assertEqual(ms_ds_full['train'][0], ms_ds_full_hf['train'][0])
         self.assertEqual(ms_ds_full['validation'][0],

@@ -58,7 +57,7 @@ class PyDatasetTest(unittest.TestCase):
             nlp_model.model_dir,
             first_sequence='context',
             second_sequence=None)
-        ms_ds_train = PyDataset.load('squad', split='train')
+        ms_ds_train = MsDataset.load('squad', split='train')
         pt_dataset = ms_ds_train.to_torch_dataset(preprocessors=preprocessor)
         import torch
         dataloader = torch.utils.data.DataLoader(pt_dataset, batch_size=5)

@@ -75,7 +74,7 @@ class PyDatasetTest(unittest.TestCase):
             nlp_model.model_dir,
             first_sequence='context',
             second_sequence=None)
-        ms_ds_train = PyDataset.load('squad', split='train')
+        ms_ds_train = MsDataset.load('squad', split='train')
         tf_dataset = ms_ds_train.to_tf_dataset(
             batch_size=5,
             shuffle=True,

@@ -86,7 +85,7 @@ class PyDatasetTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     @require_torch
     def test_to_torch_dataset_img(self):
-        ms_image_train = PyDataset.from_hf_dataset(
+        ms_image_train = MsDataset.from_hf_dataset(
             hfdata.load_dataset('beans', split='train'))
         pt_dataset = ms_image_train.to_torch_dataset(
             preprocessors=ImgPreprocessor(

@@ -100,7 +99,7 @@ class PyDatasetTest(unittest.TestCase):
     def test_to_tf_dataset_img(self):
         import tensorflow as tf
         tf.compat.v1.enable_eager_execution()
-        ms_image_train = PyDataset.load('beans', split='train')
+        ms_image_train = MsDataset.load('beans', split='train')
         tf_dataset = ms_image_train.to_tf_dataset(
             batch_size=5,
             shuffle=True,

@@ -8,8 +8,8 @@ import unittest
 import cv2

 from modelscope.fileio import File
+from modelscope.msdatasets import MsDataset
 from modelscope.pipelines import pipeline
-from modelscope.pydatasets import PyDataset
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.test_utils import test_level

@@ -7,8 +7,8 @@ import unittest
 import cv2

 from modelscope.fileio import File
+from modelscope.msdatasets import MsDataset
 from modelscope.pipelines import pipeline
-from modelscope.pydatasets import PyDataset
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.test_utils import test_level

@@ -37,7 +37,7 @@ class ImageMattingTest(unittest.TestCase):
         # alternatively:
         # input_location = '/dir/to/images'

-        dataset = PyDataset.load(input_location, target='image')
+        dataset = MsDataset.load(input_location, target='image')
         img_matting = pipeline(Tasks.image_matting, model=self.model_id)
         # note that for dataset output, the inference-output is a Generator that can be iterated.
         result = img_matting(dataset)

@@ -62,7 +62,7 @@ class ImageMattingTest(unittest.TestCase):

     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_modelscope_dataset(self):
-        dataset = PyDataset.load('beans', split='train', target='image')
+        dataset = MsDataset.load('beans', split='train', target='image')
         img_matting = pipeline(Tasks.image_matting, model=self.model_id)
         result = img_matting(dataset)
         for i in range(10):

@@ -3,9 +3,9 @@ import shutil
 import unittest

 from modelscope.models import Model
+from modelscope.msdatasets import MsDataset
 from modelscope.pipelines import SequenceClassificationPipeline, pipeline
 from modelscope.preprocessors import SequenceClassificationPreprocessor
-from modelscope.pydatasets import PyDataset
 from modelscope.utils.constant import Hubs, Tasks
 from modelscope.utils.test_utils import test_level

@@ -28,7 +28,7 @@ class SequenceClassificationTest(unittest.TestCase):

         print(data)

-    def printDataset(self, dataset: PyDataset):
+    def printDataset(self, dataset: MsDataset):
         for i, r in enumerate(dataset):
             if i > 10:
                 break

@@ -50,7 +50,7 @@ class SequenceClassificationTest(unittest.TestCase):
         text_classification = pipeline(
             task=Tasks.text_classification, model=self.model_id)
         result = text_classification(
-            PyDataset.load(
+            MsDataset.load(
                 'glue',
                 subset_name='sst2',
                 split='train',

@@ -62,7 +62,7 @@ class SequenceClassificationTest(unittest.TestCase):
     def test_run_with_default_model(self):
         text_classification = pipeline(task=Tasks.text_classification)
         result = text_classification(
-            PyDataset.load(
+            MsDataset.load(
                 'glue',
                 subset_name='sst2',
                 split='train',

@@ -78,7 +78,7 @@ class SequenceClassificationTest(unittest.TestCase):
         text_classification = pipeline(
             Tasks.text_classification, model=model, preprocessor=preprocessor)
         # loaded from huggingface dataset
-        dataset = PyDataset.load(
+        dataset = MsDataset.load(
             'glue',
             subset_name='sst2',
             split='train',

@@ -91,7 +91,7 @@ class SequenceClassificationTest(unittest.TestCase):
     def test_run_with_modelscope_dataset(self):
         text_classification = pipeline(task=Tasks.text_classification)
         # loaded from modelscope dataset
-        dataset = PyDataset.load(
+        dataset = MsDataset.load(
             'squad', split='train', target='context', hub=Hubs.modelscope)
         result = text_classification(dataset)
         self.printDataset(result)

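Net effect for downstream code: the migration is a one-line import change plus the class rename, as the test diffs above show. A before/after sketch:

```python
# Before this commit:
#   from modelscope.pydatasets import PyDataset
#   ds = PyDataset.load('squad', split='train')

# After this commit:
from modelscope.msdatasets import MsDataset

ds = MsDataset.load('squad', split='train')
```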