From 210ab40c546b67ba2de81f8e5fd4db43e9724d91 Mon Sep 17 00:00:00 2001
From: "Xingjun.Wang"
Date: Tue, 23 Jul 2024 22:26:12 +0800
Subject: [PATCH] Upgrade datasets (#921)

* del _datasets_server import in hf_datasets_util
* fix streaming for youku-mplug and adopt latest datasets
* fix download config copy
* update ut
* add youku in test_general_datasets
* update UT for general dataset
* adapt to datasets version: 2.19.0 or later
* add assert for youku data UT
* fix disable_tqdm in some functions for 2.19.0 or later
* update get_module_with_script
* set trust_remote_code to True in load_dataset_with_ctx
* update print info
* update requirements for datasets version restriction
* fix _dataset_info
* add pillow
* update comments
* update comment
* reuse _download function in DataDownloadManager
* remove unused code
* update test_run_modelhub in Human3DAnimationTest
* set datasets>=2.18.0
---
 modelscope/msdatasets/__init__.py             |  2 +-
 modelscope/msdatasets/dataset_cls/dataset.py  |  1 +
 .../msdatasets/download/dataset_builder.py    |  1 +
 .../msdatasets/download/download_config.py    | 20 +++---
 .../msdatasets/download/download_manager.py   |  9 +++
 modelscope/msdatasets/ms_dataset.py           | 21 +++---
 .../msdatasets/utils/hf_datasets_util.py      | 67 +++++++++++------
 modelscope/msdatasets/utils/hf_file_utils.py  | 48 ++++++++++---
 requirements/datasets.txt                     |  3 +-
 requirements/framework.txt                    |  2 +-
 tests/msdatasets/test_general_datasets.py     | 71 +++++++++++--------
 tests/msdatasets/test_ms_dataset.py           | 18 ++---
 tests/pipelines/test_human3d_animation.py     |  2 +-
 13 files changed, 163 insertions(+), 102 deletions(-)

diff --git a/modelscope/msdatasets/__init__.py b/modelscope/msdatasets/__init__.py
index 70200e44..534a0500 100644
--- a/modelscope/msdatasets/__init__.py
+++ b/modelscope/msdatasets/__init__.py
@@ -1,2 +1,2 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-from .ms_dataset import MsDataset
+from modelscope.msdatasets.ms_dataset import MsDataset
diff --git a/modelscope/msdatasets/dataset_cls/dataset.py b/modelscope/msdatasets/dataset_cls/dataset.py
index f9ffd9a7..9c1c7584 100644
--- a/modelscope/msdatasets/dataset_cls/dataset.py
+++ b/modelscope/msdatasets/dataset_cls/dataset.py
@@ -149,6 +149,7 @@ class NativeIterableDataset(IterableDataset):
                 if isinstance(ex_cache_path, str):
                     ex_cache_path = [ex_cache_path]
                 ret[k] = ex_cache_path
+                ret[k.strip(':FILE')] = v
             except Exception as e:
                 logger.error(e)
diff --git a/modelscope/msdatasets/download/dataset_builder.py b/modelscope/msdatasets/download/dataset_builder.py
index 0c5c4154..84563668 100644
--- a/modelscope/msdatasets/download/dataset_builder.py
+++ b/modelscope/msdatasets/download/dataset_builder.py
@@ -330,6 +330,7 @@ class IterableDatasetBuilder(csv.Csv):

         super().__init__(
             cache_dir=self.cache_build_dir,
+            dataset_name=self.dataset_name,
             config_name=self.namespace,
             hash=sub_dir_hash,
             data_files=None,  # TODO: self.meta_data_files,
diff --git a/modelscope/msdatasets/download/download_config.py b/modelscope/msdatasets/download/download_config.py
index 11118f85..0fc95cd9 100644
--- a/modelscope/msdatasets/download/download_config.py
+++ b/modelscope/msdatasets/download/download_config.py
@@ -6,16 +6,18 @@ from datasets.download.download_config import DownloadConfig


 class DataDownloadConfig(DownloadConfig):
+    """
+    Extends `DownloadConfig` with additional attributes for data download.
+ """ - def __init__(self): - self.dataset_name: Optional[str] = None - self.namespace: Optional[str] = None - self.version: Optional[str] = None - self.split: Optional[Union[str, list]] = None - self.data_dir: Optional[str] = None - self.oss_config: Optional[dict] = {} - self.meta_args_map: Optional[dict] = {} - self.num_proc: int = 4 + dataset_name: Optional[str] = None + namespace: Optional[str] = None + version: Optional[str] = None + split: Optional[Union[str, list]] = None + data_dir: Optional[str] = None + oss_config: Optional[dict] = {} + meta_args_map: Optional[dict] = {} + num_proc: int = 4 def copy(self) -> 'DataDownloadConfig': return self diff --git a/modelscope/msdatasets/download/download_manager.py b/modelscope/msdatasets/download/download_manager.py index 4799171a..5e36cdce 100644 --- a/modelscope/msdatasets/download/download_manager.py +++ b/modelscope/msdatasets/download/download_manager.py @@ -36,6 +36,11 @@ class DataDownloadManager(DownloadManager): return cached_path( url_or_filename, download_config=download_config) + def _download_single(self, url_or_filename: str, + download_config: DataDownloadConfig) -> str: + # Note: _download_single function is available for datasets>=2.19.0 + return self._download(url_or_filename, download_config) + class DataStreamingDownloadManager(StreamingDownloadManager): """The data streaming download manager.""" @@ -62,3 +67,7 @@ class DataStreamingDownloadManager(StreamingDownloadManager): else: return cached_path( url_or_filename, download_config=self.download_config) + + def _download_single(self, url_or_filename: str) -> str: + # Note: _download_single function is available for datasets>=2.19.0 + return self._download(url_or_filename) diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index 4b129698..b57a16ac 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -268,17 +268,15 @@ class MsDataset: return dataset_inst # Load from the huggingface hub elif hub == Hubs.huggingface: - dataset_inst = RemoteDataLoaderManager( - dataset_context_config).load_dataset( - RemoteDataLoaderType.HF_DATA_LOADER) - dataset_inst = MsDataset.to_ms_dataset(dataset_inst, target=target) - if isinstance(dataset_inst, MsDataset): - dataset_inst._dataset_context_config = dataset_context_config - if custom_cfg: - dataset_inst.to_custom_dataset( - custom_cfg=custom_cfg, **config_kwargs) - dataset_inst.is_custom = True - return dataset_inst + from datasets import load_dataset + return load_dataset( + dataset_name, + name=subset_name, + split=split, + streaming=use_streaming, + download_mode=download_mode.value, + **config_kwargs) + # Load from the modelscope hub elif hub == Hubs.modelscope: @@ -305,6 +303,7 @@ class MsDataset: token=token, streaming=use_streaming, dataset_info_only=dataset_info_only, + trust_remote_code=True, **config_kwargs) as dataset_res: return dataset_res diff --git a/modelscope/msdatasets/utils/hf_datasets_util.py b/modelscope/msdatasets/utils/hf_datasets_util.py index fca641fc..88b3fc20 100644 --- a/modelscope/msdatasets/utils/hf_datasets_util.py +++ b/modelscope/msdatasets/utils/hf_datasets_util.py @@ -7,7 +7,7 @@ import os import warnings from functools import partial from pathlib import Path -from typing import Dict, Iterable, List, Mapping, Optional, Sequence, Union, Tuple +from typing import Dict, Iterable, List, Mapping, Optional, Sequence, Union, Tuple, Literal from urllib.parse import urlencode @@ -40,7 +40,7 @@ from datasets.packaged_modules import 
     (_EXTENSION_TO_MODULE, _MODULE_SUPPORTS_METADATA, _MODULE_TO_EXTENSIONS,
      _PACKAGED_DATASETS_MODULES)
-from datasets.utils import _datasets_server, file_utils
+from datasets.utils import file_utils
 from datasets.utils.file_utils import (OfflineModeIsEnabled,
                                        _raise_if_offline_mode_is_enabled,
                                        cached_path, is_local_path,
@@ -68,6 +68,26 @@ from modelscope.utils.logger import get_logger

 logger = get_logger()

+ExpandDatasetProperty_T = Literal[
+    'author',
+    'cardData',
+    'citation',
+    'createdAt',
+    'disabled',
+    'description',
+    'downloads',
+    'downloadsAllTime',
+    'gated',
+    'lastModified',
+    'likes',
+    'paperswithcode_id',
+    'private',
+    'siblings',
+    'sha',
+    'tags',
+]
+
+
 def _download_ms(self, url_or_filename: str,
                  download_config: DownloadConfig) -> str:
     url_or_filename = str(url_or_filename)  # for temp val
@@ -97,6 +117,7 @@ def _dataset_info(
     timeout: Optional[float] = None,
     files_metadata: bool = False,
     token: Optional[Union[bool, str]] = None,
+    expand: Optional[List[ExpandDatasetProperty_T]] = None,
 ) -> HfDatasetInfo:
     """
     Get info on one specific dataset on huggingface.co.
@@ -728,19 +749,6 @@ def _download_additional_modules(


 def get_module_with_script(self) -> DatasetModule:
-    if config.HF_DATASETS_TRUST_REMOTE_CODE and self.trust_remote_code is None:
-        warnings.warn(
-            f'The repository for {self.name} contains custom code which must be executed to correctly '
-            f'load the dataset. You can inspect the repository content at https://hf.co/datasets/{self.name}\n'
-            f'You can avoid this message in future by passing the argument `trust_remote_code=True`.\n'
-            f'Passing `trust_remote_code=True` will be mandatory '
-            f'to load this dataset from the next major release of `datasets`.',
-            FutureWarning,
-        )
-    # get script and other files
-    # local_path = self.download_loading_script()
-    # dataset_infos_path = self.download_dataset_infos_file()
-    # dataset_readme_path = self.download_dataset_readme_file()

     _api = HubApi()
     _dataset_name: str = self.name.split('/')[-1]
@@ -1260,8 +1268,9 @@ class DatasetsWrapperHF:
                     path,
                     download_config=download_config,
                     revision=dataset_info.sha).get_module()
-            except _datasets_server.DatasetsServerError:
-                pass
+            except Exception as e:
+                logger.error(e)
+
             # Otherwise we must use the dataset script if the user trusts it
             return HubDatasetModuleFactoryWithScript(
                 path,
@@ -1314,7 +1323,11 @@
 def load_dataset_with_ctx(*args, **kwargs):
     hf_endpoint_origin = config.HF_ENDPOINT
     get_from_cache_origin = file_utils.get_from_cache
-    _download_origin = DownloadManager._download
+
+    # Compatible with datasets 2.18.0
+    _download_origin = DownloadManager._download if hasattr(DownloadManager, '_download') \
+        else DownloadManager._download_single
+
     dataset_info_origin = HfApi.dataset_info
     list_repo_tree_origin = HfApi.list_repo_tree
     get_paths_info_origin = HfApi.get_paths_info
@@ -1324,7 +1337,13 @@ def load_dataset_with_ctx(*args, **kwargs):

         config.HF_ENDPOINT = get_endpoint()
         file_utils.get_from_cache = get_from_cache_ms
-        DownloadManager._download = _download_ms
+
+        # Compatible with datasets 2.18.0
+        if hasattr(DownloadManager, '_download'):
+            DownloadManager._download = _download_ms
+        else:
+            DownloadManager._download_single = _download_ms
+
         HfApi.dataset_info = _dataset_info
         HfApi.list_repo_tree = _list_repo_tree
         HfApi.get_paths_info = _get_paths_info
@@ -1338,12 +1357,16 @@ def load_dataset_with_ctx(*args, **kwargs):
     finally:
         config.HF_ENDPOINT = hf_endpoint_origin
         file_utils.get_from_cache = get_from_cache_origin
-        DownloadManager._download = _download_origin
+
+        # Compatible with datasets 2.18.0
+        if hasattr(DownloadManager, '_download'):
+            DownloadManager._download = _download_origin
+        else:
+            DownloadManager._download_single = _download_origin
+
         HfApi.dataset_info = dataset_info_origin
         HfApi.list_repo_tree = list_repo_tree_origin
         HfApi.get_paths_info = get_paths_info_origin
         data_files.resolve_pattern = resolve_pattern_origin
         HubDatasetModuleFactoryWithoutScript.get_module = get_module_without_script_origin
         HubDatasetModuleFactoryWithScript.get_module = get_module_with_script_origin
-
-        logger.info('Context manager of ms-dataset exited.')
diff --git a/modelscope/msdatasets/utils/hf_file_utils.py b/modelscope/msdatasets/utils/hf_file_utils.py
index f1a4f1f7..b2931f7e 100644
--- a/modelscope/msdatasets/utils/hf_file_utils.py
+++ b/modelscope/msdatasets/utils/hf_file_utils.py
@@ -7,6 +7,7 @@ import os
 import re
 import shutil
 import warnings
+import inspect
 from contextlib import contextmanager
 from functools import partial
 from pathlib import Path
@@ -41,6 +42,7 @@ def get_from_cache_ms(
     ignore_url_params=False,
     storage_options=None,
     download_desc=None,
+    disable_tqdm=False,
 ) -> str:
     """
     Given a URL, look for the corresponding file in the local cache.
@@ -209,18 +211,42 @@ def get_from_cache_ms(
             if scheme == 'ftp':
                 ftp_get(url, temp_file)
             elif scheme not in ('http', 'https'):
-                fsspec_get(url, temp_file, storage_options=storage_options, desc=download_desc)
+                fsspec_get_sig = inspect.signature(fsspec_get)
+                if 'disable_tqdm' in fsspec_get_sig.parameters:
+                    fsspec_get(url,
+                               temp_file,
+                               storage_options=storage_options,
+                               desc=download_desc,
+                               disable_tqdm=disable_tqdm)
+                else:
+                    fsspec_get(url, temp_file, storage_options=storage_options, desc=download_desc)
             else:
-                http_get(
-                    url,
-                    temp_file=temp_file,
-                    proxies=proxies,
-                    resume_size=resume_size,
-                    headers=headers,
-                    cookies=cookies,
-                    max_retries=max_retries,
-                    desc=download_desc,
-                )
+                http_get_sig = inspect.signature(http_get)
+
+                if 'disable_tqdm' in http_get_sig.parameters:
+                    http_get(
+                        url,
+                        temp_file=temp_file,
+                        proxies=proxies,
+                        resume_size=resume_size,
+                        headers=headers,
+                        cookies=cookies,
+                        max_retries=max_retries,
+                        desc=download_desc,
+                        disable_tqdm=disable_tqdm,
+                    )
+                else:
+                    http_get(
+                        url,
+                        temp_file=temp_file,
+                        proxies=proxies,
+                        resume_size=resume_size,
+                        headers=headers,
+                        cookies=cookies,
+                        max_retries=max_retries,
+                        desc=download_desc,
+                    )

     logger.info(f'storing {url} in cache at {cache_path}')
     shutil.move(temp_file.name, cache_path)
diff --git a/requirements/datasets.txt b/requirements/datasets.txt
index 35924919..6ca2d853 100644
--- a/requirements/datasets.txt
+++ b/requirements/datasets.txt
@@ -1,8 +1,9 @@
 addict
 attrs
-datasets>=2.16.0,<2.19.0
+datasets>=2.18.0
 einops
 oss2
+Pillow
 python-dateutil>=2.1
 scipy
 # latest version has some compatible issue.
diff --git a/requirements/framework.txt b/requirements/framework.txt
index c8a4c277..6ca2d853 100644
--- a/requirements/framework.txt
+++ b/requirements/framework.txt
@@ -1,6 +1,6 @@
 addict
 attrs
-datasets>=2.16.0,<2.19.0
+datasets>=2.18.0
 einops
 oss2
 Pillow
diff --git a/tests/msdatasets/test_general_datasets.py b/tests/msdatasets/test_general_datasets.py
index 21ba3f2b..d32daeb9 100644
--- a/tests/msdatasets/test_general_datasets.py
+++ b/tests/msdatasets/test_general_datasets.py
@@ -1,4 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import os
 import unittest

 from modelscope import MsDataset
@@ -7,9 +8,6 @@ from modelscope.utils.test_utils import test_level

 logger = get_logger()

-# Note: MODELSCOPE_DOMAIN is set to 'test.modelscope.cn' in the environment variable
-# TODO: ONLY FOR TEST ENVIRONMENT, to be replaced by the online domain
-
 TEST_INNER_LEVEL = 1


@@ -19,32 +17,33 @@ class GeneralMsDatasetTest(unittest.TestCase):
                          'skip test in current test level')
     def test_return_dataset_info_only(self):
         ds = MsDataset.load(
-            'wangxingjun778test/aya_dataset_mini', dataset_info_only=True)
-        print(f'>>output of test_return_dataset_info_only:\n {ds}')
+            'wangxingjun778/aya_dataset_mini', dataset_info_only=True)
+        logger.info(f'>>output of test_return_dataset_info_only:\n {ds}')

     @unittest.skipUnless(test_level() >= TEST_INNER_LEVEL,
                          'skip test in current test level')
     def test_inner_fashion_mnist(self):
         # inner means the dataset is on the test.modelscope.cn environment
         ds = MsDataset.load(
-            'xxxxtest0004/ms_test_0308_py',
+            'wangxingjun778/ms_test_0308_py',
             subset_name='fashion_mnist',
             split='train')
-        print(f'>>output of test_inner_fashion_mnist:\n {next(iter(ds))}')
+        logger.info(
+            f'>>output of test_inner_fashion_mnist:\n {next(iter(ds))}')

     @unittest.skipUnless(test_level() >= TEST_INNER_LEVEL,
                          'skip test in current test level')
     def test_inner_clue(self):
         ds = MsDataset.load(
-            'wangxingjun778test/clue', subset_name='afqmc', split='train')
-        print(f'>>output of test_inner_clue:\n {next(iter(ds))}')
+            'wangxingjun778/clue', subset_name='afqmc', split='train')
+        logger.info(f'>>output of test_inner_clue:\n {next(iter(ds))}')

     @unittest.skipUnless(test_level() >= TEST_INNER_LEVEL,
                          'skip test in current test level')
     def test_inner_cats_and_dogs_mini(self):
-        ds = MsDataset.load(
-            'wangxingjun778test/cats_and_dogs_mini', split='train')
-        print(f'>>output of test_inner_cats_and_dogs_mini:\n {next(iter(ds))}')
+        ds = MsDataset.load('wangxingjun778/cats_and_dogs_mini', split='train')
+        logger.info(
+            f'>>output of test_inner_cats_and_dogs_mini:\n {next(iter(ds))}')

     @unittest.skipUnless(test_level() >= TEST_INNER_LEVEL,
                          'skip test in current test level')
@@ -53,14 +52,14 @@ class GeneralMsDatasetTest(unittest.TestCase):
         # data/train-xxx-of-xxx.parquet; data/test-xxx-of-xxx.parquet
         # demographics/train-xxx-of-xxx.parquet

-        ds = MsDataset.load(
-            'wangxingjun778test/aya_dataset_mini', split='train')
-        print(f'>>output of test_inner_aya_dataset_mini:\n {next(iter(ds))}')
+        ds = MsDataset.load('wangxingjun778/aya_dataset_mini', split='train')
+        logger.info(
+            f'>>output of test_inner_aya_dataset_mini:\n {next(iter(ds))}')

         ds = MsDataset.load(
-            'wangxingjun778test/aya_dataset_mini', subset_name='demographics')
+            'wangxingjun778/aya_dataset_mini', subset_name='demographics')
         assert next(iter(ds['train']))
-        print(
+        logger.info(
             f">>output of test_inner_aya_dataset_mini:\n {next(iter(ds['train']))}"
         )

@@ -68,36 +67,46 @@ class GeneralMsDatasetTest(unittest.TestCase):
                          'skip test in current test level')
     def test_inner_no_standard_imgs(self):
         infos = MsDataset.load(
-            'xxxxtest0004/png_jpg_txt_test', dataset_info_only=True)
+            'wangxingjun778/png_jpg_txt_test', dataset_info_only=True)
         assert infos['default']

-        ds = MsDataset.load('xxxxtest0004/png_jpg_txt_test', split='train')
-        print(f'>>>output of test_inner_no_standard_imgs: \n{next(iter(ds))}')
-        assert next(iter(ds))
-
-    @unittest.skipUnless(test_level() >= TEST_INNER_LEVEL,
-                         'skip test in current test level')
-    def test_inner_hf_pictures(self):
-        ds = MsDataset.load('xxxxtest0004/hf_Pictures')
-        print(ds)
+        ds = MsDataset.load('wangxingjun778/png_jpg_txt_test', split='train')
+        logger.info(
+            f'>>>output of test_inner_no_standard_imgs: \n{next(iter(ds))}')
         assert next(iter(ds))

     @unittest.skipUnless(test_level() >= 3, 'skip test in current test level')
     def test_inner_speech_yinpin(self):
-        ds = MsDataset.load('xxxxtest0004/hf_lj_speech_yinpin_test')
-        print(ds)
+        ds = MsDataset.load('wangxingjun778/hf_lj_speech_yinpin_test')
+        logger.info(ds)
         assert next(iter(ds))

     @unittest.skipUnless(test_level() >= TEST_INNER_LEVEL,
                          'skip test in current test level')
     def test_inner_yuancheng_picture(self):
         ds = MsDataset.load(
-            'xxxxtest0004/yuancheng_picture',
+            'wangxingjun778/yuancheng_picture',
             subset_name='remote_images',
             split='train')
-        print(next(iter(ds)))
+        logger.info(next(iter(ds)))
         assert next(iter(ds))

+    @unittest.skipUnless(test_level() >= TEST_INNER_LEVEL,
+                         'skip test in current test level')
+    def test_youku_mplug_dataset(self):
+        # To test the Youku-AliceMind dataset with new sdk version
+        ds = MsDataset.load(
+            'modelscope/Youku-AliceMind',
+            subset_name='classification',
+            split='validation',  # Options: train, test, validation
+            use_streaming=True)
+
+        logger.info(next(iter(ds)))
+        data_sample = next(iter(ds))
+
+        assert data_sample['video_id'][0]
+        assert os.path.exists(data_sample['video_id:FILE'][0])
+

 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py
index 83ffd3f8..dfbdfa8c 100644
--- a/tests/msdatasets/test_ms_dataset.py
+++ b/tests/msdatasets/test_ms_dataset.py
@@ -169,17 +169,6 @@ class MsDatasetTest(unittest.TestCase):
             'speech_asr_aishell1_trainsets', namespace='speech_asr')
         print(next(iter(ms_ds_asr['train'])))

-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    @require_torch
-    def test_to_torch_dataset_img(self):
-        ms_image_train = MsDataset.load(
-            'fixtures_image_utils', namespace='damotest', split='test')
-        pt_dataset = ms_image_train.to_torch_dataset(
-            preprocessors=ImgPreprocessor(image_path='file'))
-        import torch
-        dataloader = torch.utils.data.DataLoader(pt_dataset, batch_size=5)
-        print(next(iter(dataloader)))
-
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     @require_tf
     def test_to_tf_dataset_img(self):
@@ -229,7 +218,7 @@ class MsDatasetTest(unittest.TestCase):
         print(data_example)
         assert data_example.values()

-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 3, 'skip test in current test level')
     def test_streaming_load_img_object(self):
         """Test case for iterating PIL object."""
         from PIL.PngImagePlugin import PngImageFile
@@ -238,7 +227,7 @@ class MsDatasetTest(unittest.TestCase):
             subset_name='default',
             namespace='huizheng',
             split='train',
-            use_streaming=True)
+            use_streaming=False)
         data_example = next(iter(dataset))
         print(data_example)
         assert data_example.values()
@@ -247,7 +236,8 @@ class MsDatasetTest(unittest.TestCase):
     def test_to_ms_dataset(self):
         """Test case for converting huggingface dataset to `MsDataset` instance."""
         from datasets.load import load_dataset
-        hf_dataset = load_dataset('beans', split='train', streaming=True)
+        hf_dataset = load_dataset(
+            'AI-Lab-Makerere/beans', split='train', streaming=True)
         ms_dataset = MsDataset.to_ms_dataset(hf_dataset)
         data_example = next(iter(ms_dataset))
         print(data_example)
diff --git a/tests/pipelines/test_human3d_animation.py b/tests/pipelines/test_human3d_animation.py
index 97ee12f4..4236d076 100644
--- a/tests/pipelines/test_human3d_animation.py
+++ b/tests/pipelines/test_human3d_animation.py
@@ -17,7 +17,7 @@ class Human3DAnimationTest(unittest.TestCase):
         human3d = pipeline(self.task, model=self.model_id)
         input = {
             'dataset_id': 'damo/3DHuman_synthetic_dataset',
-            'case_id': '000146',  # 3f2a7538253e42a8
+            'case_id': '000146',  # 3f2a7538253e42a8
             'action_dataset': 'damo/3DHuman_action_dataset',
             'action': 'SwingDancing',
             'save_dir': 'outputs',
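
Note on the version shim in load_dataset_with_ctx above: datasets<2.19.0
exposes DownloadManager._download, while 2.19.0 and later renames it to
_download_single, so the patch saves, replaces, and restores whichever
attribute exists. A minimal standalone sketch of that save/patch/restore
pattern (the replacement downloader `_my_download` is a hypothetical
stand-in for modelscope's `_download_ms`, not part of the patch):

    from datasets import DownloadManager

    def _my_download(self, url_or_filename, download_config):
        # Hypothetical replacement; modelscope routes this to its own hub.
        raise NotImplementedError

    # Pick whichever private hook the installed datasets version provides.
    _attr = '_download' if hasattr(DownloadManager, '_download') else '_download_single'
    _origin = getattr(DownloadManager, _attr)

    setattr(DownloadManager, _attr, _my_download)
    try:
        pass  # load_dataset(...) calls issued here would hit _my_download
    finally:
        setattr(DownloadManager, _attr, _origin)  # always restore the original

Restoring in the `finally` block keeps the monkey-patch scoped to the
context manager even when loading raises, which is why the patch mirrors
the same if/else in both the setup and teardown paths.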
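The same concern drives the inspect.signature checks added to
get_from_cache_ms: http_get and fsspec_get only accept disable_tqdm in
newer datasets releases, so the argument is forwarded only when the
installed signature has it. The idea in isolation (`fetch` is a made-up
downloader used purely for illustration, not a datasets API):

    import inspect

    def fetch(url, desc=None, disable_tqdm=False):
        # Made-up downloader; in the patch this role is played by
        # http_get / fsspec_get from datasets.utils.file_utils.
        print(url, desc, disable_tqdm)

    kwargs = {'desc': 'downloading'}
    if 'disable_tqdm' in inspect.signature(fetch).parameters:
        # Forward the flag only when the installed version accepts it.
        kwargs['disable_tqdm'] = True

    fetch('https://example.com/data.csv', **kwargs)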