Upgrade datasets (#921)

* del _datasets_server import in hf_dataset_util

* fix streaming for youku-mplug and adopt latest datasets

* fix download config copy

* update ut

* add youku in test_general_datasets

* update UT for general dataset

* adapt to datasets version: 2.19.0 or later

* add assert for youku data UT

* fix disable_tqdm in some functions for 2.19.0 or later

* update get_module_with_script

* set trust_remote_code is True in load_dataset_with_ctx

* update print info

* update requirements for datasets version restriction

* fix _dataset_info

* add pillow

* update comments

* update comment

* reuse _download function in DataDownloadManager

* remove unused code

* update test_run_modelhub in Human3DAnimationTest

* set datasets>=2.18.0
This commit is contained in:
Xingjun.Wang
2024-07-23 22:26:12 +08:00
committed by GitHub
parent 4e2555c5a3
commit 210ab40c54
13 changed files with 163 additions and 102 deletions

View File

@@ -169,17 +169,6 @@ class MsDatasetTest(unittest.TestCase):
'speech_asr_aishell1_trainsets', namespace='speech_asr')
print(next(iter(ms_ds_asr['train'])))
@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@require_torch
def test_to_torch_dataset_img(self):
ms_image_train = MsDataset.load(
'fixtures_image_utils', namespace='damotest', split='test')
pt_dataset = ms_image_train.to_torch_dataset(
preprocessors=ImgPreprocessor(image_path='file'))
import torch
dataloader = torch.utils.data.DataLoader(pt_dataset, batch_size=5)
print(next(iter(dataloader)))
@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@require_tf
def test_to_tf_dataset_img(self):
@@ -229,7 +218,7 @@ class MsDatasetTest(unittest.TestCase):
print(data_example)
assert data_example.values()
@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@unittest.skipUnless(test_level() >= 3, 'skip test in current test level')
def test_streaming_load_img_object(self):
"""Test case for iterating PIL object."""
from PIL.PngImagePlugin import PngImageFile
@@ -238,7 +227,7 @@ class MsDatasetTest(unittest.TestCase):
subset_name='default',
namespace='huizheng',
split='train',
use_streaming=True)
use_streaming=False)
data_example = next(iter(dataset))
print(data_example)
assert data_example.values()
@@ -247,7 +236,8 @@ class MsDatasetTest(unittest.TestCase):
def test_to_ms_dataset(self):
"""Test case for converting huggingface dataset to `MsDataset` instance."""
from datasets.load import load_dataset
hf_dataset = load_dataset('beans', split='train', streaming=True)
hf_dataset = load_dataset(
'AI-Lab-Makerere/beans', split='train', streaming=True)
ms_dataset = MsDataset.to_ms_dataset(hf_dataset)
data_example = next(iter(ms_dataset))
print(data_example)