From e02a260c93d9044bc93300541a76cf3384d9b2a9 Mon Sep 17 00:00:00 2001
From: "xingjun.wxj"
Date: Fri, 10 Mar 2023 09:03:32 +0800
Subject: [PATCH] Refactor the task_datasets module

Refactor the task_datasets module:
1. Add a new module: modelscope.msdatasets.dataset_cls.custom_datasets.
2. Add a new function: modelscope.msdatasets.ms_dataset.MsDataset.to_custom_dataset().
3. Call to_custom_dataset() in MsDataset.load() to adapt to the new custom_datasets module.
4. Refactor the pipeline for loading custom datasets:
   1) Use only the MsDataset.load() function to load custom datasets.
   2) Combine MsDataset.load() with the EpochBasedTrainer class.
5. Add a new entry function for building datasets in EpochBasedTrainer; see modelscope.trainers.trainer.EpochBasedTrainer.build_dataset().
6. Add a new function to build a custom dataset from the model configuration; see modelscope.trainers.trainer.EpochBasedTrainer.build_dataset_from_cfg().
7. Add a new registry function for building custom datasets; see modelscope.msdatasets.dataset_cls.custom_datasets.builder.build_custom_dataset().
8. Refine the SiameseUIETrainer class to adapt to the new custom_datasets module.
9. Add the TorchCustomDataset class as a superclass for custom dataset classes.
10. Move modules/classes/functions:
    1) Move module msdatasets.audio to custom_datasets
    2) Move module msdatasets.cv to custom_datasets
    3) Move module bad_image_detecting to custom_datasets
    4) Move module damoyolo to custom_datasets
    5) Move module face_2d_keypoints to custom_datasets
    6) Move module hand_2d_keypoints to custom_datasets
    7) Move module human_wholebody_keypoint to custom_datasets
    8) Move module image_classification to custom_datasets
    9) Move module image_inpainting to custom_datasets
    10) Move module image_portrait_enhancement to custom_datasets
    11) Move module image_quality_assessment_degradation to custom_datasets
    12) Move module image_quality_assmessment_mos to custom_datasets
    13) Move class LanguageGuidedVideoSummarizationDataset to custom_datasets
    14) Move class MGeoRankingDataset to custom_datasets
    15) Move module movie_scene_segmentation to custom_datasets
    16) Move module object_detection to custom_datasets
    17) Move module referring_video_object_segmentation to custom_datasets
    18) Move module sidd_image_denoising to custom_datasets
    19) Move module video_frame_interpolation to custom_datasets
    20) Move module video_stabilization to custom_datasets
    21) Move module video_super_resolution to custom_datasets
    22) Move class GoproImageDeblurringDataset to custom_datasets
    23) Move class EasyCVBaseDataset to custom_datasets
    24) Move class ImageInstanceSegmentationCocoDataset to custom_datasets
    25) Move class RedsImageDeblurringDataset to custom_datasets
    26) Move class TextRankingDataset to custom_datasets
    27) Move class VecoDataset to custom_datasets
    28) Move class VideoSummarizationDataset to custom_datasets
11. Delete modules/functions/classes:
    1) Delete module task_datasets
    2) Delete to_task_dataset() in EpochBasedTrainer
    3) Delete build_dataset() in EpochBasedTrainer and add a new function with the same name.
12.
Rename class Datasets to CustomDatasets in metainfo.py Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/11872747 --- docs/source/api/modelscope.msdatasets.cv.rst | 14 - ...msdatasets.dataset_cls.custom_datasets.rst | 41 ++ .../api/modelscope.msdatasets.dataset_cls.rst | 15 + .../api/modelscope.msdatasets.ms_dataset.rst | 1 - modelscope/metainfo.py | 2 +- .../damo/apis/detector_evaluater.py | 4 +- .../damo/apis/detector_inference.py | 2 +- modelscope/msdatasets/__init__.py | 1 - modelscope/msdatasets/audio/__init__.py | 0 modelscope/msdatasets/cv/__init__.py | 3 - .../msdatasets/data_loader/data_loader.py | 7 +- modelscope/msdatasets/dataset_cls/__init__.py | 2 + .../dataset_cls/custom_datasets/__init__.py | 84 +++ .../custom_datasets}/audio/__init__.py | 0 .../custom_datasets}/audio/asr_dataset.py | 0 .../audio/kws_farfield_dataset.py | 1 - .../audio/kws_nearfield_dataset.py | 2 +- .../audio/kws_nearfield_processor.py | 0 .../bad_image_detecting/__init__.py | 0 .../bad_image_detecting_dataset.py | 12 +- .../custom_datasets}/builder.py | 12 +- .../custom_datasets}/damoyolo/__init__.py | 1 + .../custom_datasets}/damoyolo/build.py | 0 .../damoyolo/collate_batch.py | 0 .../damoyolo/datasets/__init__.py | 0 .../damoyolo/datasets/coco.py | 0 .../damoyolo/datasets/mosaic_wrapper.py | 0 .../damoyolo/evaluation/__init__.py | 2 +- .../damoyolo/evaluation/coco/__init__.py | 0 .../damoyolo/evaluation/coco/coco_eval.py | 0 .../damoyolo/samplers/__init__.py | 0 .../damoyolo/samplers/distributed.py | 0 .../samplers/grouped_batch_sampler.py | 0 .../samplers/iteration_based_batch_sampler.py | 0 .../damoyolo/transforms/__init__.py | 0 .../damoyolo/transforms/build.py | 0 .../damoyolo/transforms/transforms.py | 0 .../custom_datasets}/easycv_base.py | 0 .../face_2d_keypoins/__init__.py | 0 .../face_2d_keypoints_dataset.py | 11 +- .../gopro_image_deblurring_dataset.py | 17 +- .../hand_2d_keypoints/__init__.py | 22 + .../hand_2d_keypoints_dataset.py | 11 +- .../human_wholebody_keypoint/__init__.py | 0 .../human_wholebody_keypoint_dataset.py | 11 +- .../image_classification/__init__.py | 0 .../classification_dataset.py | 12 +- .../image_inpainting}/__init__.py | 6 +- .../custom_datasets}/image_inpainting/aug.py | 0 .../image_inpainting_dataset.py | 12 +- ...mage_instance_segmentation_coco_dataset.py | 8 +- .../image_portrait_enhancement/__init__.py | 0 .../image_portrait_enhancement/data_utils.py | 0 .../image_portrait_enhancement_dataset.py | 13 +- .../__init__.py | 0 ..._quality_assessment_degradation_dataset.py | 11 +- .../image_quality_assmessment_mos/__init__.py | 0 .../image_quality_assessment_mos_dataset.py | 12 +- .../image_semantic_segmentation/__init__.py | 0 .../segmentation_dataset.py | 11 +- ...uage_guided_video_summarization_dataset.py | 9 +- .../custom_datasets}/mgeo_ranking_dataset.py | 14 +- .../movie_scene_segmentation/__init__.py | 20 + .../movie_scene_segmentation_dataset.py | 11 +- .../movie_scene_segmentation/sampler.py | 0 .../object_detection/__init__.py | 0 .../object_detection/detection_dataset.py | 25 +- .../ocr_detection/__init__.py | 1 + .../ocr_detection/augmenter.py | 0 .../ocr_detection/data_loader.py | 0 .../ocr_detection/image_dataset.py | 0 .../ocr_detection/measures/__init__.py | 0 .../ocr_detection/measures/iou_evaluator.py | 0 .../ocr_detection/measures/quad_measurer.py | 0 .../ocr_detection/processes/__init__.py | 0 .../ocr_detection/processes/augment_data.py | 0 .../ocr_detection/processes/data_process.py | 0 .../processes/make_border_map.py | 0 
.../processes/make_icdar_data.py | 0 .../processes/make_seg_detection_data.py | 0 .../processes/normalize_image.py | 0 .../processes/random_crop_data.py | 0 .../ocr_recognition_dataset.py | 11 +- .../reds_image_deblurring_dataset.py | 17 +- .../__init__.py | 21 + ...rring_video_object_segmentation_dataset.py | 9 +- .../transformers.py | 0 .../sidd_image_denoising/__init__.py | 0 .../sidd_image_denoising/data_utils.py | 0 .../sidd_image_denoising_dataset.py | 9 +- .../sidd_image_denoising/transforms.py | 0 .../custom_datasets}/text_ranking_dataset.py | 16 +- .../custom_datasets/torch_custom_dataset.py | 51 ++ .../custom_datasets}/veco_dataset.py | 8 +- .../video_frame_interpolation/__init__.py | 0 .../video_frame_interpolation/data_utils.py | 0 .../video_frame_interpolation_dataset.py | 13 +- .../video_stabilization/__init__.py | 0 .../video_stabilization_dataset.py | 9 +- .../video_summarization_dataset.py | 6 +- .../video_super_resolution/__init__.py | 0 .../video_super_resolution_dataset.py | 9 +- modelscope/msdatasets/dataset_cls/dataset.py | 27 +- .../msdatasets/meta/data_meta_config.py | 31 +- .../msdatasets/meta/data_meta_manager.py | 5 +- modelscope/msdatasets/ms_dataset.py | 697 ++++++++++-------- .../msdatasets/task_datasets/__init__.py | 51 -- modelscope/msdatasets/task_datasets/base.py | 48 -- .../image_inpainting/__init__.py | 2 - .../movie_scene_segmentation/__init__.py | 2 - .../__init__.py | 3 - .../task_datasets/torch_base_dataset.py | 64 -- modelscope/msdatasets/utils/dataset_utils.py | 4 +- .../trainers/audio/kws_farfield_trainer.py | 3 +- .../trainers/audio/kws_nearfield_trainer.py | 16 +- .../cv/image_detection_damoyolo_trainer.py | 8 +- .../trainers/cv/ocr_detection_db_trainer.py | 6 +- .../trainers/nlp/siamese_uie_trainer.py | 30 +- modelscope/trainers/nlp_trainer.py | 9 +- modelscope/trainers/trainer.py | 221 +++--- modelscope/utils/ast_utils.py | 2 +- modelscope/utils/constant.py | 5 + tests/msdatasets/test_ms_dataset.py | 37 +- .../test_movie_scene_segmentation.py | 90 ++- tests/run_analysis.py | 2 +- tests/taskdataset/test_veco_dataset.py | 3 +- .../trainers/test_action_detection_trainer.py | 2 +- tests/trainers/test_image_deblur_trainer.py | 2 +- tests/trainers/test_image_denoise_trainer.py | 2 +- ...est_image_instance_segmentation_trainer.py | 2 - ...test_image_portrait_enhancement_trainer.py | 8 +- ...uage_guided_video_summarization_trainer.py | 2 +- tests/trainers/test_siamese_uie_trainer.py | 3 +- .../trainers/test_tinynas_damoyolo_trainer.py | 12 +- .../test_video_summarization_trainer.py | 4 +- 135 files changed, 1158 insertions(+), 867 deletions(-) delete mode 100644 docs/source/api/modelscope.msdatasets.cv.rst create mode 100644 docs/source/api/modelscope.msdatasets.dataset_cls.custom_datasets.rst create mode 100644 docs/source/api/modelscope.msdatasets.dataset_cls.rst delete mode 100644 modelscope/msdatasets/audio/__init__.py delete mode 100644 modelscope/msdatasets/cv/__init__.py create mode 100644 modelscope/msdatasets/dataset_cls/custom_datasets/__init__.py rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/audio/__init__.py (100%) rename modelscope/msdatasets/{ => dataset_cls/custom_datasets}/audio/asr_dataset.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/audio/kws_farfield_dataset.py (99%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/audio/kws_nearfield_dataset.py (98%) rename modelscope/msdatasets/{task_datasets => 
dataset_cls/custom_datasets}/audio/kws_nearfield_processor.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/bad_image_detecting/__init__.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/bad_image_detecting/bad_image_detecting_dataset.py (79%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/builder.py (56%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/damoyolo/__init__.py (75%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/damoyolo/build.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/damoyolo/collate_batch.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/damoyolo/datasets/__init__.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/damoyolo/datasets/coco.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/damoyolo/datasets/mosaic_wrapper.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/damoyolo/evaluation/__init__.py (93%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/damoyolo/evaluation/coco/__init__.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/damoyolo/evaluation/coco/coco_eval.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/damoyolo/samplers/__init__.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/damoyolo/samplers/distributed.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/damoyolo/samplers/grouped_batch_sampler.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/damoyolo/samplers/iteration_based_batch_sampler.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/damoyolo/transforms/__init__.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/damoyolo/transforms/build.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/damoyolo/transforms/transforms.py (100%) rename modelscope/msdatasets/{cv => dataset_cls/custom_datasets}/easycv_base.py (100%) rename modelscope/msdatasets/{cv => dataset_cls/custom_datasets}/face_2d_keypoins/__init__.py (100%) rename modelscope/msdatasets/{cv => dataset_cls/custom_datasets}/face_2d_keypoins/face_2d_keypoints_dataset.py (78%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/gopro_image_deblurring_dataset.py (76%) create mode 100644 modelscope/msdatasets/dataset_cls/custom_datasets/hand_2d_keypoints/__init__.py rename modelscope/msdatasets/{cv => dataset_cls/custom_datasets}/hand_2d_keypoints/hand_2d_keypoints_dataset.py (79%) rename modelscope/msdatasets/{cv => dataset_cls/custom_datasets}/human_wholebody_keypoint/__init__.py (100%) rename modelscope/msdatasets/{cv => dataset_cls/custom_datasets}/human_wholebody_keypoint/human_wholebody_keypoint_dataset.py (79%) rename modelscope/msdatasets/{cv => dataset_cls/custom_datasets}/image_classification/__init__.py (100%) rename modelscope/msdatasets/{cv => dataset_cls/custom_datasets}/image_classification/classification_dataset.py (75%) rename modelscope/msdatasets/{cv/hand_2d_keypoints => dataset_cls/custom_datasets/image_inpainting}/__init__.py (75%) rename modelscope/msdatasets/{task_datasets => 
dataset_cls/custom_datasets}/image_inpainting/aug.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/image_inpainting/image_inpainting_dataset.py (97%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/image_instance_segmentation_coco_dataset.py (98%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/image_portrait_enhancement/__init__.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/image_portrait_enhancement/data_utils.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/image_portrait_enhancement/image_portrait_enhancement_dataset.py (77%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/image_quality_assessment_degradation/__init__.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/image_quality_assessment_degradation/image_quality_assessment_degradation_dataset.py (81%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/image_quality_assmessment_mos/__init__.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/image_quality_assmessment_mos/image_quality_assessment_mos_dataset.py (77%) rename modelscope/msdatasets/{cv => dataset_cls/custom_datasets}/image_semantic_segmentation/__init__.py (100%) rename modelscope/msdatasets/{cv => dataset_cls/custom_datasets}/image_semantic_segmentation/segmentation_dataset.py (81%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/language_guided_video_summarization_dataset.py (94%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/mgeo_ranking_dataset.py (93%) create mode 100644 modelscope/msdatasets/dataset_cls/custom_datasets/movie_scene_segmentation/__init__.py rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/movie_scene_segmentation/movie_scene_segmentation_dataset.py (94%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/movie_scene_segmentation/sampler.py (100%) rename modelscope/msdatasets/{cv => dataset_cls/custom_datasets}/object_detection/__init__.py (100%) rename modelscope/msdatasets/{cv => dataset_cls/custom_datasets}/object_detection/detection_dataset.py (85%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/ocr_detection/__init__.py (78%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/ocr_detection/augmenter.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/ocr_detection/data_loader.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/ocr_detection/image_dataset.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/ocr_detection/measures/__init__.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/ocr_detection/measures/iou_evaluator.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/ocr_detection/measures/quad_measurer.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/ocr_detection/processes/__init__.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/ocr_detection/processes/augment_data.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/ocr_detection/processes/data_process.py (100%) rename modelscope/msdatasets/{task_datasets => 
dataset_cls/custom_datasets}/ocr_detection/processes/make_border_map.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/ocr_detection/processes/make_icdar_data.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/ocr_detection/processes/make_seg_detection_data.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/ocr_detection/processes/normalize_image.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/ocr_detection/processes/random_crop_data.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/ocr_recognition_dataset.py (87%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/reds_image_deblurring_dataset.py (74%) create mode 100644 modelscope/msdatasets/dataset_cls/custom_datasets/referring_video_object_segmentation/__init__.py rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/referring_video_object_segmentation/referring_video_object_segmentation_dataset.py (98%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/referring_video_object_segmentation/transformers.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/sidd_image_denoising/__init__.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/sidd_image_denoising/data_utils.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/sidd_image_denoising/sidd_image_denoising_dataset.py (87%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/sidd_image_denoising/transforms.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/text_ranking_dataset.py (92%) create mode 100644 modelscope/msdatasets/dataset_cls/custom_datasets/torch_custom_dataset.py rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/veco_dataset.py (91%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/video_frame_interpolation/__init__.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/video_frame_interpolation/data_utils.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/video_frame_interpolation/video_frame_interpolation_dataset.py (79%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/video_stabilization/__init__.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/video_stabilization/video_stabilization_dataset.py (71%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/video_summarization_dataset.py (94%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/video_super_resolution/__init__.py (100%) rename modelscope/msdatasets/{task_datasets => dataset_cls/custom_datasets}/video_super_resolution/video_super_resolution_dataset.py (89%) delete mode 100644 modelscope/msdatasets/task_datasets/__init__.py delete mode 100644 modelscope/msdatasets/task_datasets/base.py delete mode 100644 modelscope/msdatasets/task_datasets/image_inpainting/__init__.py delete mode 100644 modelscope/msdatasets/task_datasets/movie_scene_segmentation/__init__.py delete mode 100644 modelscope/msdatasets/task_datasets/referring_video_object_segmentation/__init__.py delete mode 100644 modelscope/msdatasets/task_datasets/torch_base_dataset.py diff --git 
a/docs/source/api/modelscope.msdatasets.cv.rst b/docs/source/api/modelscope.msdatasets.cv.rst deleted file mode 100644 index ef0a8a3b..00000000 --- a/docs/source/api/modelscope.msdatasets.cv.rst +++ /dev/null @@ -1,14 +0,0 @@ -modelscope.msdatasets.cv -================================ - -.. automodule:: modelscope.msdatasets.cv - -.. currentmodule:: modelscope.msdatasets.cv - -.. autosummary:: - :toctree: generated - :nosignatures: - :template: classtemplate.rst - - easycv_base.EasyCVBaseDataset - image_classification.ClsDataset diff --git a/docs/source/api/modelscope.msdatasets.dataset_cls.custom_datasets.rst b/docs/source/api/modelscope.msdatasets.dataset_cls.custom_datasets.rst new file mode 100644 index 00000000..b5a4b0f6 --- /dev/null +++ b/docs/source/api/modelscope.msdatasets.dataset_cls.custom_datasets.rst @@ -0,0 +1,41 @@ +modelscope.msdatasets.dataset_cls.custom_datasets +================================================= + +.. automodule:: modelscope.msdatasets.dataset_cls.custom_datasets + +.. currentmodule:: modelscope.msdatasets.dataset_cls.custom_datasets + + +.. autosummary:: + :toctree: generated + :nosignatures: + :template: classtemplate.rst + + EasyCVBaseDataset + TorchCustomDataset + MovieSceneSegmentationDataset + ImageInstanceSegmentationCocoDataset + GoproImageDeblurringDataset + LanguageGuidedVideoSummarizationDataset + MGeoRankingDataset + RedsImageDeblurringDataset + TextRankingDataset + VecoDataset + VideoSummarizationDataset + BadImageDetectingDataset + ImageInpaintingDataset + ImagePortraitEnhancementDataset + ImageQualityAssessmentDegradationDataset + ImageQualityAssessmentMosDataset + ReferringVideoObjectSegmentationDataset + SiddImageDenoisingDataset + VideoFrameInterpolationDataset + VideoStabilizationDataset + VideoSuperResolutionDataset + SegDataset + FaceKeypointDataset + HandCocoWholeBodyDataset + WholeBodyCocoTopDownDataset + ClsDataset + DetImagesMixDataset + DetDataset diff --git a/docs/source/api/modelscope.msdatasets.dataset_cls.rst b/docs/source/api/modelscope.msdatasets.dataset_cls.rst new file mode 100644 index 00000000..d415b800 --- /dev/null +++ b/docs/source/api/modelscope.msdatasets.dataset_cls.rst @@ -0,0 +1,15 @@ +modelscope.msdatasets.dataset_cls +================================= + +.. automodule:: modelscope.msdatasets.dataset_cls + +.. currentmodule:: modelscope.msdatasets.dataset_cls + + +.. autosummary:: + :toctree: generated + :nosignatures: + :template: classtemplate.rst + + ExternalDataset + NativeIterableDataset diff --git a/docs/source/api/modelscope.msdatasets.ms_dataset.rst b/docs/source/api/modelscope.msdatasets.ms_dataset.rst index 03cc8d97..92df1e89 100644 --- a/docs/source/api/modelscope.msdatasets.ms_dataset.rst +++ b/docs/source/api/modelscope.msdatasets.ms_dataset.rst @@ -10,5 +10,4 @@ modelscope.msdatasets.ms_dataset :nosignatures: :template: classtemplate.rst - MsMapDataset MsDataset diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 9e7e368a..e4059269 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -1137,7 +1137,7 @@ class LR_Schedulers(object): ExponentialWarmup = 'ExponentialWarmup' -class Datasets(object): +class CustomDatasets(object): """ Names for different datasets.
""" ClsDataset = 'ClsDataset' diff --git a/modelscope/models/cv/tinynas_detection/damo/apis/detector_evaluater.py b/modelscope/models/cv/tinynas_detection/damo/apis/detector_evaluater.py index 82ffb567..6ff194f6 100644 --- a/modelscope/models/cv/tinynas_detection/damo/apis/detector_evaluater.py +++ b/modelscope/models/cv/tinynas_detection/damo/apis/detector_evaluater.py @@ -8,8 +8,8 @@ from modelscope.models.cv.tinynas_detection.damo.apis.detector_inference import inference from modelscope.models.cv.tinynas_detection.damo.detectors.detector import \ build_local_model -from modelscope.msdatasets.task_datasets.damoyolo import (build_dataloader, - build_dataset) +from modelscope.msdatasets.dataset_cls.custom_datasets.damoyolo import ( + build_dataloader, build_dataset) def mkdir(path): diff --git a/modelscope/models/cv/tinynas_detection/damo/apis/detector_inference.py b/modelscope/models/cv/tinynas_detection/damo/apis/detector_inference.py index 47c1fb1b..dcd33834 100644 --- a/modelscope/models/cv/tinynas_detection/damo/apis/detector_inference.py +++ b/modelscope/models/cv/tinynas_detection/damo/apis/detector_inference.py @@ -5,7 +5,7 @@ import os import torch from tqdm import tqdm -from modelscope.msdatasets.task_datasets.damoyolo.evaluation import evaluate +from modelscope.msdatasets.dataset_cls.custom_datasets.damoyolo import evaluate from modelscope.utils.logger import get_logger from modelscope.utils.timer import Timer, get_time_str from modelscope.utils.torch_utils import (all_gather, get_world_size, diff --git a/modelscope/msdatasets/__init__.py b/modelscope/msdatasets/__init__.py index 073f9396..70200e44 100644 --- a/modelscope/msdatasets/__init__.py +++ b/modelscope/msdatasets/__init__.py @@ -1,3 +1,2 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from . import cv from .ms_dataset import MsDataset diff --git a/modelscope/msdatasets/audio/__init__.py b/modelscope/msdatasets/audio/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/modelscope/msdatasets/cv/__init__.py b/modelscope/msdatasets/cv/__init__.py deleted file mode 100644 index fad91bcf..00000000 --- a/modelscope/msdatasets/cv/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -from . import (image_classification, image_semantic_segmentation, - object_detection) diff --git a/modelscope/msdatasets/data_loader/data_loader.py b/modelscope/msdatasets/data_loader/data_loader.py index c97151b0..1ef92372 100644 --- a/modelscope/msdatasets/data_loader/data_loader.py +++ b/modelscope/msdatasets/data_loader/data_loader.py @@ -13,6 +13,7 @@ from modelscope.msdatasets.context.dataset_context_config import \ DatasetContextConfig from modelscope.msdatasets.data_files.data_files_manager import \ DataFilesManager +from modelscope.msdatasets.dataset_cls.dataset import ExternalDataset from modelscope.msdatasets.meta.data_meta_manager import DataMetaManager from modelscope.utils.constant import DatasetFormations @@ -62,7 +63,8 @@ class OssDataLoader(BaseDataLoader): self.data_files_builder: Optional[DataFilesManager] = None self.dataset: Optional[Union[Dataset, IterableDataset, DatasetDict, - IterableDatasetDict]] = None + IterableDatasetDict, + ExternalDataset]] = None self.builder: Optional[DatasetBuilder] = None self.data_files_manager: Optional[DataFilesManager] = None @@ -141,7 +143,8 @@ class OssDataLoader(BaseDataLoader): self.builder) def _post_process(self) -> None: - ... 
+ if isinstance(self.dataset, ExternalDataset): + self.dataset.custom_map = self.dataset_context_config.data_meta_config.meta_type_map class MaxComputeDataLoader(BaseDataLoader): diff --git a/modelscope/msdatasets/dataset_cls/__init__.py b/modelscope/msdatasets/dataset_cls/__init__.py index b937315b..a5b2e73d 100644 --- a/modelscope/msdatasets/dataset_cls/__init__.py +++ b/modelscope/msdatasets/dataset_cls/__init__.py @@ -1 +1,3 @@ # Copyright (c) Alibaba, Inc. and its affiliates. + +from .dataset import ExternalDataset, NativeIterableDataset diff --git a/modelscope/msdatasets/dataset_cls/custom_datasets/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/__init__.py new file mode 100644 index 00000000..c8a94b89 --- /dev/null +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/__init__.py @@ -0,0 +1,84 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .easycv_base import EasyCVBaseDataset + from .builder import CUSTOM_DATASETS, build_custom_dataset + from .torch_custom_dataset import TorchCustomDataset + from .movie_scene_segmentation.movie_scene_segmentation_dataset import MovieSceneSegmentationDataset + from .image_instance_segmentation_coco_dataset import ImageInstanceSegmentationCocoDataset + from .gopro_image_deblurring_dataset import GoproImageDeblurringDataset + from .language_guided_video_summarization_dataset import LanguageGuidedVideoSummarizationDataset + from .mgeo_ranking_dataset import MGeoRankingDataset + from .reds_image_deblurring_dataset import RedsImageDeblurringDataset + from .text_ranking_dataset import TextRankingDataset + from .veco_dataset import VecoDataset + from .video_summarization_dataset import VideoSummarizationDataset + from .audio import KWSDataset, KWSDataLoader, kws_nearfield_dataset + from .bad_image_detecting import BadImageDetectingDataset + from .image_inpainting import ImageInpaintingDataset + from .image_portrait_enhancement import ImagePortraitEnhancementDataset + from .image_quality_assessment_degradation import ImageQualityAssessmentDegradationDataset + from .image_quality_assmessment_mos import ImageQualityAssessmentMosDataset + from .referring_video_object_segmentation import ReferringVideoObjectSegmentationDataset + from .sidd_image_denoising import SiddImageDenoisingDataset + from .video_frame_interpolation import VideoFrameInterpolationDataset + from .video_stabilization import VideoStabilizationDataset + from .video_super_resolution import VideoSuperResolutionDataset + from .image_semantic_segmentation import SegDataset + from .face_2d_keypoins import FaceKeypointDataset + from .hand_2d_keypoints import HandCocoWholeBodyDataset + from .human_wholebody_keypoint import WholeBodyCocoTopDownDataset + from .image_classification import ClsDataset + from .object_detection import DetDataset, DetImagesMixDataset + from .ocr_detection import DataLoader, ImageDataset, QuadMeasurer + from .ocr_recognition_dataset import OCRRecognitionDataset +else: + _import_structure = { + 'easycv_base': ['EasyCVBaseDataset'], + 'builder': ['CUSTOM_DATASETS', 'build_custom_dataset'], + 'torch_custom_dataset': ['TorchCustomDataset'], + 'movie_scene_segmentation_dataset': ['MovieSceneSegmentationDataset'], + 'image_instance_segmentation_coco_dataset': + ['ImageInstanceSegmentationCocoDataset'], + 'gopro_image_deblurring_dataset': ['GoproImageDeblurringDataset'], + 'language_guided_video_summarization_dataset': + 
['LanguageGuidedVideoSummarizationDataset'], + 'mgeo_ranking_dataset': ['MGeoRankingDataset'], + 'reds_image_deblurring_dataset': ['RedsImageDeblurringDataset'], + 'text_ranking_dataset': ['TextRankingDataset'], + 'veco_dataset': ['VecoDataset'], + 'video_summarization_dataset': ['VideoSummarizationDataset'], + 'audio': ['KWSDataset', 'KWSDataLoader', 'kws_nearfield_dataset'], + 'bad_image_detecting': ['BadImageDetectingDataset'], + 'image_inpainting': ['ImageInpaintingDataset'], + 'image_portrait_enhancement': ['ImagePortraitEnhancementDataset'], + 'image_quality_assessment_degradation': + ['ImageQualityAssessmentDegradationDataset'], + 'image_quality_assmessment_mos': ['ImageQualityAssessmentMosDataset'], + 'referring_video_object_segmentation': + ['ReferringVideoObjectSegmentationDataset'], + 'sidd_image_denoising': ['SiddImageDenoisingDataset'], + 'video_frame_interpolation': ['VideoFrameInterpolationDataset'], + 'video_stabilization': ['VideoStabilizationDataset'], + 'video_super_resolution': ['VideoSuperResolutionDataset'], + 'image_semantic_segmentation': ['SegDataset'], + 'face_2d_keypoins': ['FaceKeypointDataset'], + 'hand_2d_keypoints': ['HandCocoWholeBodyDataset'], + 'human_wholebody_keypoint': ['WholeBodyCocoTopDownDataset'], + 'image_classification': ['ClsDataset'], + 'object_detection': ['DetDataset', 'DetImagesMixDataset'], + 'ocr_detection': ['DataLoader', 'ImageDataset', 'QuadMeasurer'], + 'ocr_recognition_dataset': ['OCRRecognitionDataset'], + } + + import sys + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/msdatasets/task_datasets/audio/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/audio/__init__.py similarity index 100% rename from modelscope/msdatasets/task_datasets/audio/__init__.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/audio/__init__.py diff --git a/modelscope/msdatasets/audio/asr_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/audio/asr_dataset.py similarity index 100% rename from modelscope/msdatasets/audio/asr_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/audio/asr_dataset.py diff --git a/modelscope/msdatasets/task_datasets/audio/kws_farfield_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/audio/kws_farfield_dataset.py similarity index 99% rename from modelscope/msdatasets/task_datasets/audio/kws_farfield_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/audio/kws_farfield_dataset.py index d4866204..69c95bbd 100644 --- a/modelscope/msdatasets/task_datasets/audio/kws_farfield_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/audio/kws_farfield_dataset.py @@ -5,7 +5,6 @@ import math import os.path import queue import threading -import time import numpy as np import torch diff --git a/modelscope/msdatasets/task_datasets/audio/kws_nearfield_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/audio/kws_nearfield_dataset.py similarity index 98% rename from modelscope/msdatasets/task_datasets/audio/kws_nearfield_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/audio/kws_nearfield_dataset.py index 43f28e01..1b784410 100644 --- a/modelscope/msdatasets/task_datasets/audio/kws_nearfield_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/audio/kws_nearfield_dataset.py @@ -18,7 +18,7 @@ import torch import torch.distributed as dist from torch.utils.data import 
IterableDataset -import modelscope.msdatasets.task_datasets.audio.kws_nearfield_processor as processor +import modelscope.msdatasets.dataset_cls.custom_datasets.audio.kws_nearfield_processor as processor from modelscope.trainers.audio.kws_utils.file_utils import (make_pair, read_lists) from modelscope.utils.logger import get_logger diff --git a/modelscope/msdatasets/task_datasets/audio/kws_nearfield_processor.py b/modelscope/msdatasets/dataset_cls/custom_datasets/audio/kws_nearfield_processor.py similarity index 100% rename from modelscope/msdatasets/task_datasets/audio/kws_nearfield_processor.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/audio/kws_nearfield_processor.py diff --git a/modelscope/msdatasets/task_datasets/bad_image_detecting/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/bad_image_detecting/__init__.py similarity index 100% rename from modelscope/msdatasets/task_datasets/bad_image_detecting/__init__.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/bad_image_detecting/__init__.py diff --git a/modelscope/msdatasets/task_datasets/bad_image_detecting/bad_image_detecting_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/bad_image_detecting/bad_image_detecting_dataset.py similarity index 79% rename from modelscope/msdatasets/task_datasets/bad_image_detecting/bad_image_detecting_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/bad_image_detecting/bad_image_detecting_dataset.py index f3cd9a2f..539b7b25 100644 --- a/modelscope/msdatasets/task_datasets/bad_image_detecting/bad_image_detecting_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/bad_image_detecting/bad_image_detecting_dataset.py @@ -1,12 +1,8 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import cv2 -import numpy as np - from modelscope.metainfo import Models -from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS -from modelscope.msdatasets.task_datasets.torch_base_dataset import \ - TorchTaskDataset +from modelscope.msdatasets.dataset_cls.custom_datasets import ( + CUSTOM_DATASETS, TorchCustomDataset) from modelscope.outputs import OutputKeys from modelscope.preprocessors import LoadImage from modelscope.preprocessors.cv.bad_image_detecting_preprocessor import \ @@ -14,9 +10,9 @@ from modelscope.preprocessors.cv.bad_image_detecting_preprocessor import \ from modelscope.utils.constant import Tasks -@TASK_DATASETS.register_module( +@CUSTOM_DATASETS.register_module( Tasks.bad_image_detecting, module_name=Models.bad_image_detecting) -class BadImageDetectingDataset(TorchTaskDataset): +class BadImageDetectingDataset(TorchCustomDataset): """Paired image dataset for bad image detecting. """ diff --git a/modelscope/msdatasets/task_datasets/builder.py b/modelscope/msdatasets/dataset_cls/custom_datasets/builder.py similarity index 56% rename from modelscope/msdatasets/task_datasets/builder.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/builder.py index 683bec8f..a793ea27 100644 --- a/modelscope/msdatasets/task_datasets/builder.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/builder.py @@ -3,13 +3,13 @@ from modelscope.utils.config import ConfigDict from modelscope.utils.registry import Registry, build_from_cfg -TASK_DATASETS = Registry('task_datasets') +CUSTOM_DATASETS = Registry('custom_datasets') -def build_task_dataset(cfg: ConfigDict, - task_name: str = None, - default_args: dict = None): - """ Build task specific dataset processor given model config dict and the task name. 
+def build_custom_dataset(cfg: ConfigDict, + task_name: str, + default_args: dict = None): + """ Build a custom (user-defined) dataset given the model config and task name. Args: cfg (:obj:`ConfigDict`): config dict for model object. @@ -18,4 +18,4 @@ def build_custom_dataset(cfg: ConfigDict, default_args (dict, optional): Default initialization arguments. """ return build_from_cfg( - cfg, TASK_DATASETS, group_key=task_name, default_args=default_args) + cfg, CUSTOM_DATASETS, group_key=task_name, default_args=default_args) diff --git a/modelscope/msdatasets/task_datasets/damoyolo/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/__init__.py similarity index 75% rename from modelscope/msdatasets/task_datasets/damoyolo/__init__.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/__init__.py index 2a3bccdb..dabde7a4 100644 --- a/modelscope/msdatasets/task_datasets/damoyolo/__init__.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/__init__.py @@ -1,2 +1,3 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from .build import build_dataloader, build_dataset +from .evaluation import evaluate diff --git a/modelscope/msdatasets/task_datasets/damoyolo/build.py b/modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/build.py similarity index 100% rename from modelscope/msdatasets/task_datasets/damoyolo/build.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/build.py diff --git a/modelscope/msdatasets/task_datasets/damoyolo/collate_batch.py b/modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/collate_batch.py similarity index 100% rename from modelscope/msdatasets/task_datasets/damoyolo/collate_batch.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/collate_batch.py diff --git a/modelscope/msdatasets/task_datasets/damoyolo/datasets/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/datasets/__init__.py similarity index 100% rename from modelscope/msdatasets/task_datasets/damoyolo/datasets/__init__.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/datasets/__init__.py diff --git a/modelscope/msdatasets/task_datasets/damoyolo/datasets/coco.py b/modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/datasets/coco.py similarity index 100% rename from modelscope/msdatasets/task_datasets/damoyolo/datasets/coco.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/datasets/coco.py diff --git a/modelscope/msdatasets/task_datasets/damoyolo/datasets/mosaic_wrapper.py b/modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/datasets/mosaic_wrapper.py similarity index 100% rename from modelscope/msdatasets/task_datasets/damoyolo/datasets/mosaic_wrapper.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/datasets/mosaic_wrapper.py diff --git a/modelscope/msdatasets/task_datasets/damoyolo/evaluation/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/evaluation/__init__.py similarity index 93% rename from modelscope/msdatasets/task_datasets/damoyolo/evaluation/__init__.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/evaluation/__init__.py index b121b80b..b12fbf69 100644 --- a/modelscope/msdatasets/task_datasets/damoyolo/evaluation/__init__.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/evaluation/__init__.py @@ -1,6 +1,6 @@ # Copyright © Alibaba, Inc. and its affiliates. 
-from modelscope.msdatasets.task_datasets.damoyolo import datasets +from .. import datasets from .coco import coco_evaluation diff --git a/modelscope/msdatasets/task_datasets/damoyolo/evaluation/coco/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/evaluation/coco/__init__.py similarity index 100% rename from modelscope/msdatasets/task_datasets/damoyolo/evaluation/coco/__init__.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/evaluation/coco/__init__.py diff --git a/modelscope/msdatasets/task_datasets/damoyolo/evaluation/coco/coco_eval.py b/modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/evaluation/coco/coco_eval.py similarity index 100% rename from modelscope/msdatasets/task_datasets/damoyolo/evaluation/coco/coco_eval.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/evaluation/coco/coco_eval.py diff --git a/modelscope/msdatasets/task_datasets/damoyolo/samplers/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/samplers/__init__.py similarity index 100% rename from modelscope/msdatasets/task_datasets/damoyolo/samplers/__init__.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/samplers/__init__.py diff --git a/modelscope/msdatasets/task_datasets/damoyolo/samplers/distributed.py b/modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/samplers/distributed.py similarity index 100% rename from modelscope/msdatasets/task_datasets/damoyolo/samplers/distributed.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/samplers/distributed.py diff --git a/modelscope/msdatasets/task_datasets/damoyolo/samplers/grouped_batch_sampler.py b/modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/samplers/grouped_batch_sampler.py similarity index 100% rename from modelscope/msdatasets/task_datasets/damoyolo/samplers/grouped_batch_sampler.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/samplers/grouped_batch_sampler.py diff --git a/modelscope/msdatasets/task_datasets/damoyolo/samplers/iteration_based_batch_sampler.py b/modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/samplers/iteration_based_batch_sampler.py similarity index 100% rename from modelscope/msdatasets/task_datasets/damoyolo/samplers/iteration_based_batch_sampler.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/samplers/iteration_based_batch_sampler.py diff --git a/modelscope/msdatasets/task_datasets/damoyolo/transforms/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/transforms/__init__.py similarity index 100% rename from modelscope/msdatasets/task_datasets/damoyolo/transforms/__init__.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/transforms/__init__.py diff --git a/modelscope/msdatasets/task_datasets/damoyolo/transforms/build.py b/modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/transforms/build.py similarity index 100% rename from modelscope/msdatasets/task_datasets/damoyolo/transforms/build.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/transforms/build.py diff --git a/modelscope/msdatasets/task_datasets/damoyolo/transforms/transforms.py b/modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/transforms/transforms.py similarity index 100% rename from modelscope/msdatasets/task_datasets/damoyolo/transforms/transforms.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/damoyolo/transforms/transforms.py diff --git 
a/modelscope/msdatasets/cv/easycv_base.py b/modelscope/msdatasets/dataset_cls/custom_datasets/easycv_base.py similarity index 100% rename from modelscope/msdatasets/cv/easycv_base.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/easycv_base.py diff --git a/modelscope/msdatasets/cv/face_2d_keypoins/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/face_2d_keypoins/__init__.py similarity index 100% rename from modelscope/msdatasets/cv/face_2d_keypoins/__init__.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/face_2d_keypoins/__init__.py diff --git a/modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/face_2d_keypoins/face_2d_keypoints_dataset.py similarity index 78% rename from modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/face_2d_keypoins/face_2d_keypoints_dataset.py index 2f2e03ef..9f55901f 100644 --- a/modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/face_2d_keypoins/face_2d_keypoints_dataset.py @@ -1,15 +1,16 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from easycv.datasets.face import FaceKeypointDataset as _FaceKeypointDataset -from modelscope.metainfo import Datasets -from modelscope.msdatasets.cv.easycv_base import EasyCVBaseDataset -from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.metainfo import CustomDatasets +from modelscope.msdatasets.dataset_cls.custom_datasets import CUSTOM_DATASETS +from modelscope.msdatasets.dataset_cls.custom_datasets.easycv_base import \ + EasyCVBaseDataset from modelscope.utils.constant import Tasks -@TASK_DATASETS.register_module( +@CUSTOM_DATASETS.register_module( group_key=Tasks.face_2d_keypoints, - module_name=Datasets.Face2dKeypointsDataset) + module_name=CustomDatasets.Face2dKeypointsDataset) class FaceKeypointDataset(EasyCVBaseDataset, _FaceKeypointDataset): """EasyCV dataset for face 2d keypoints. 
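The hunks above show the new registration pattern end to end. For reference, the following sketch shows how a user-defined dataset would plug into the new registry and builder; it is not part of this patch. MyPairedDataset, its samples argument, and the config values are illustrative placeholders, and the sketch assumes TorchCustomDataset behaves like a plain torch Dataset and that build_from_cfg() instantiates the class named by cfg.type:

    from modelscope.msdatasets.dataset_cls.custom_datasets import (
        CUSTOM_DATASETS, TorchCustomDataset, build_custom_dataset)
    from modelscope.utils.config import ConfigDict
    from modelscope.utils.constant import Tasks


    @CUSTOM_DATASETS.register_module(
        group_key=Tasks.image_deblurring, module_name='MyPairedDataset')
    class MyPairedDataset(TorchCustomDataset):
        """Hypothetical paired-image dataset, for illustration only."""

        def __init__(self, samples=None, **kwargs):
            self.samples = samples or []

        def __len__(self):
            return len(self.samples)

        def __getitem__(self, index):
            return self.samples[index]


    # The builder resolves the class registered under (task_name, cfg.type)
    # and passes the remaining config entries to its constructor.
    cfg = ConfigDict(
        type='MyPairedDataset',
        samples=[{'input': 'blurry.png', 'target': 'sharp.png'}])
    dataset = build_custom_dataset(cfg, task_name=Tasks.image_deblurring)
    assert len(dataset) == 1
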
diff --git a/modelscope/msdatasets/task_datasets/gopro_image_deblurring_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/gopro_image_deblurring_dataset.py similarity index 76% rename from modelscope/msdatasets/task_datasets/gopro_image_deblurring_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/gopro_image_deblurring_dataset.py index fb621551..408b8ffe 100644 --- a/modelscope/msdatasets/task_datasets/gopro_image_deblurring_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/gopro_image_deblurring_dataset.py @@ -3,14 +3,13 @@ import cv2 import numpy as np -from modelscope.metainfo import Datasets -from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS -from modelscope.msdatasets.task_datasets.sidd_image_denoising.data_utils import ( +from modelscope.metainfo import CustomDatasets +from modelscope.msdatasets.dataset_cls.custom_datasets import ( + CUSTOM_DATASETS, TorchCustomDataset) +from modelscope.msdatasets.dataset_cls.custom_datasets.sidd_image_denoising.data_utils import ( img2tensor, padding) -from modelscope.msdatasets.task_datasets.sidd_image_denoising.transforms import ( +from modelscope.msdatasets.dataset_cls.custom_datasets.sidd_image_denoising.transforms import ( augment, paired_random_crop) -from modelscope.msdatasets.task_datasets.torch_base_dataset import \ - TorchTaskDataset from modelscope.utils.constant import Tasks @@ -18,9 +17,9 @@ def default_loader(path): return cv2.imread(path, cv2.IMREAD_UNCHANGED).astype(np.float32) / 255.0 -@TASK_DATASETS.register_module( - Tasks.image_deblurring, module_name=Datasets.PairedDataset) -class GoproImageDeblurringDataset(TorchTaskDataset): +@CUSTOM_DATASETS.register_module( + Tasks.image_deblurring, module_name=CustomDatasets.PairedDataset) +class GoproImageDeblurringDataset(TorchCustomDataset): """Paired image dataset for image restoration. """ diff --git a/modelscope/msdatasets/dataset_cls/custom_datasets/hand_2d_keypoints/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/hand_2d_keypoints/__init__.py new file mode 100644 index 00000000..3af670e3 --- /dev/null +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/hand_2d_keypoints/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .hand_2d_keypoints_dataset import HandCocoWholeBodyDataset + +else: + _import_structure = { + 'hand_2d_keypoints_dataset': ['HandCocoWholeBodyDataset'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/msdatasets/cv/hand_2d_keypoints/hand_2d_keypoints_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/hand_2d_keypoints/hand_2d_keypoints_dataset.py similarity index 79% rename from modelscope/msdatasets/cv/hand_2d_keypoints/hand_2d_keypoints_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/hand_2d_keypoints/hand_2d_keypoints_dataset.py index 89ee0bb8..c6163715 100644 --- a/modelscope/msdatasets/cv/hand_2d_keypoints/hand_2d_keypoints_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/hand_2d_keypoints/hand_2d_keypoints_dataset.py @@ -2,15 +2,16 @@ from easycv.datasets.pose import \ HandCocoWholeBodyDataset as _HandCocoWholeBodyDataset -from modelscope.metainfo import Datasets -from modelscope.msdatasets.cv.easycv_base import EasyCVBaseDataset -from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.metainfo import CustomDatasets +from modelscope.msdatasets.dataset_cls.custom_datasets import CUSTOM_DATASETS +from modelscope.msdatasets.dataset_cls.custom_datasets.easycv_base import \ + EasyCVBaseDataset from modelscope.utils.constant import Tasks -@TASK_DATASETS.register_module( +@CUSTOM_DATASETS.register_module( group_key=Tasks.hand_2d_keypoints, - module_name=Datasets.HandCocoWholeBodyDataset) + module_name=CustomDatasets.HandCocoWholeBodyDataset) class HandCocoWholeBodyDataset(EasyCVBaseDataset, _HandCocoWholeBodyDataset): """EasyCV dataset for human hand 2d keypoints. 
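Each migrated subpackage keeps the lazy-import boilerplate shown in the new hand_2d_keypoints/__init__.py above, so importing modelscope.msdatasets.dataset_cls.custom_datasets stays cheap until a class is actually used. A sketch of the same convention for a hypothetical new subpackage; my_new_dataset and MyNewDataset are placeholder names, not part of this patch:

    # my_new_dataset/__init__.py (illustrative only)
    from typing import TYPE_CHECKING

    from modelscope.utils.import_utils import LazyImportModule

    if TYPE_CHECKING:
        from .my_new_dataset import MyNewDataset
    else:
        # Map submodule name -> public symbols, resolved on first access.
        _import_structure = {'my_new_dataset': ['MyNewDataset']}

        import sys

        sys.modules[__name__] = LazyImportModule(
            __name__,
            globals()['__file__'],
            _import_structure,
            module_spec=__spec__,
            extra_objects={},
        )
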
diff --git a/modelscope/msdatasets/cv/human_wholebody_keypoint/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/human_wholebody_keypoint/__init__.py similarity index 100% rename from modelscope/msdatasets/cv/human_wholebody_keypoint/__init__.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/human_wholebody_keypoint/__init__.py diff --git a/modelscope/msdatasets/cv/human_wholebody_keypoint/human_wholebody_keypoint_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/human_wholebody_keypoint/human_wholebody_keypoint_dataset.py similarity index 79% rename from modelscope/msdatasets/cv/human_wholebody_keypoint/human_wholebody_keypoint_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/human_wholebody_keypoint/human_wholebody_keypoint_dataset.py index fc9469f2..59c97af8 100644 --- a/modelscope/msdatasets/cv/human_wholebody_keypoint/human_wholebody_keypoint_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/human_wholebody_keypoint/human_wholebody_keypoint_dataset.py @@ -2,15 +2,16 @@ from easycv.datasets.pose import \ WholeBodyCocoTopDownDataset as _WholeBodyCocoTopDownDataset -from modelscope.metainfo import Datasets -from modelscope.msdatasets.cv.easycv_base import EasyCVBaseDataset -from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.metainfo import CustomDatasets +from modelscope.msdatasets.dataset_cls.custom_datasets import CUSTOM_DATASETS +from modelscope.msdatasets.dataset_cls.custom_datasets.easycv_base import \ + EasyCVBaseDataset from modelscope.utils.constant import Tasks -@TASK_DATASETS.register_module( +@CUSTOM_DATASETS.register_module( group_key=Tasks.human_wholebody_keypoint, - module_name=Datasets.HumanWholeBodyKeypointDataset) + module_name=CustomDatasets.HumanWholeBodyKeypointDataset) class WholeBodyCocoTopDownDataset(EasyCVBaseDataset, _WholeBodyCocoTopDownDataset): """EasyCV dataset for human whole body 2d keypoints. diff --git a/modelscope/msdatasets/cv/image_classification/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/image_classification/__init__.py similarity index 100% rename from modelscope/msdatasets/cv/image_classification/__init__.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/image_classification/__init__.py diff --git a/modelscope/msdatasets/cv/image_classification/classification_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/image_classification/classification_dataset.py similarity index 75% rename from modelscope/msdatasets/cv/image_classification/classification_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/image_classification/classification_dataset.py index ba73e472..386810c7 100644 --- a/modelscope/msdatasets/cv/image_classification/classification_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/image_classification/classification_dataset.py @@ -1,14 +1,16 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
from easycv.datasets.classification import ClsDataset as _ClsDataset -from modelscope.metainfo import Datasets -from modelscope.msdatasets.cv.easycv_base import EasyCVBaseDataset -from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.metainfo import CustomDatasets +from modelscope.msdatasets.dataset_cls.custom_datasets import CUSTOM_DATASETS +from modelscope.msdatasets.dataset_cls.custom_datasets.easycv_base import \ + EasyCVBaseDataset from modelscope.utils.constant import Tasks -@TASK_DATASETS.register_module( - group_key=Tasks.image_classification, module_name=Datasets.ClsDataset) +@CUSTOM_DATASETS.register_module( + group_key=Tasks.image_classification, + module_name=CustomDatasets.ClsDataset) class ClsDataset(_ClsDataset): """EasyCV dataset for classification. diff --git a/modelscope/msdatasets/cv/hand_2d_keypoints/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/image_inpainting/__init__.py similarity index 75% rename from modelscope/msdatasets/cv/hand_2d_keypoints/__init__.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/image_inpainting/__init__.py index 5c1c72c1..0c9552bd 100644 --- a/modelscope/msdatasets/cv/hand_2d_keypoints/__init__.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/image_inpainting/__init__.py @@ -4,13 +4,11 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .hand_2d_keypoints_dataset import Hand2DKeypointDataset - + from .image_inpainting_dataset import ImageInpaintingDataset else: _import_structure = { - 'hand_2d_keypoints_dataset': ['Hand2DKeypointDataset'] + 'image_inpainting_dataset': ['ImageInpaintingDataset'], } - import sys sys.modules[__name__] = LazyImportModule( diff --git a/modelscope/msdatasets/task_datasets/image_inpainting/aug.py b/modelscope/msdatasets/dataset_cls/custom_datasets/image_inpainting/aug.py similarity index 100% rename from modelscope/msdatasets/task_datasets/image_inpainting/aug.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/image_inpainting/aug.py diff --git a/modelscope/msdatasets/task_datasets/image_inpainting/image_inpainting_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/image_inpainting/image_inpainting_dataset.py similarity index 97% rename from modelscope/msdatasets/task_datasets/image_inpainting/image_inpainting_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/image_inpainting/image_inpainting_dataset.py index 057b8f88..c7040c86 100644 --- a/modelscope/msdatasets/task_datasets/image_inpainting/image_inpainting_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/image_inpainting/image_inpainting_dataset.py @@ -3,20 +3,16 @@ Part of the implementation is borrowed and modified from LaMa, publicly available at https://github.com/saic-mdal/lama """ import glob -import os import os.path as osp from enum import Enum import albumentations as A import cv2 -import json import numpy as np -import torch from modelscope.metainfo import Models -from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS -from modelscope.msdatasets.task_datasets.torch_base_dataset import \ - TorchTaskDataset +from modelscope.msdatasets.dataset_cls.custom_datasets import ( + CUSTOM_DATASETS, TorchCustomDataset) from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger from .aug import IAAAffine2, IAAPerspective2 @@ -296,9 +292,9 @@ def get_transforms(test_mode, out_size): return transform 
-@TASK_DATASETS.register_module( +@CUSTOM_DATASETS.register_module( Tasks.image_inpainting, module_name=Models.image_inpainting) -class ImageInpaintingDataset(TorchTaskDataset): +class ImageInpaintingDataset(TorchCustomDataset): def __init__(self, **kwargs): split_config = kwargs['split_config'] diff --git a/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/image_instance_segmentation_coco_dataset.py similarity index 98% rename from modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/image_instance_segmentation_coco_dataset.py index 1c7bc249..4dd1af5a 100644 --- a/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/image_instance_segmentation_coco_dataset.py @@ -6,9 +6,9 @@ import numpy as np from pycocotools.coco import COCO from modelscope.metainfo import Models +from modelscope.msdatasets.dataset_cls.custom_datasets import ( + CUSTOM_DATASETS, TorchCustomDataset) from modelscope.utils.constant import Tasks -from .builder import TASK_DATASETS -from .torch_base_dataset import TorchTaskDataset DATASET_STRUCTURE = { 'train': { @@ -22,10 +22,10 @@ DATASET_STRUCTURE = { } -@TASK_DATASETS.register_module( +@CUSTOM_DATASETS.register_module( module_name=Models.cascade_mask_rcnn_swin, group_key=Tasks.image_segmentation) -class ImageInstanceSegmentationCocoDataset(TorchTaskDataset): +class ImageInstanceSegmentationCocoDataset(TorchCustomDataset): """Coco-style dataset for image instance segmentation. Args: diff --git a/modelscope/msdatasets/task_datasets/image_portrait_enhancement/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/image_portrait_enhancement/__init__.py similarity index 100% rename from modelscope/msdatasets/task_datasets/image_portrait_enhancement/__init__.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/image_portrait_enhancement/__init__.py diff --git a/modelscope/msdatasets/task_datasets/image_portrait_enhancement/data_utils.py b/modelscope/msdatasets/dataset_cls/custom_datasets/image_portrait_enhancement/data_utils.py similarity index 100% rename from modelscope/msdatasets/task_datasets/image_portrait_enhancement/data_utils.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/image_portrait_enhancement/data_utils.py diff --git a/modelscope/msdatasets/task_datasets/image_portrait_enhancement/image_portrait_enhancement_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/image_portrait_enhancement/image_portrait_enhancement_dataset.py similarity index 77% rename from modelscope/msdatasets/task_datasets/image_portrait_enhancement/image_portrait_enhancement_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/image_portrait_enhancement/image_portrait_enhancement_dataset.py index 58d40778..d2c03408 100644 --- a/modelscope/msdatasets/task_datasets/image_portrait_enhancement/image_portrait_enhancement_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/image_portrait_enhancement/image_portrait_enhancement_dataset.py @@ -3,10 +3,9 @@ import cv2 import numpy as np -from modelscope.metainfo import Datasets, Models -from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS -from modelscope.msdatasets.task_datasets.torch_base_dataset import \ - TorchTaskDataset +from modelscope.metainfo import CustomDatasets +from 
modelscope.msdatasets.dataset_cls.custom_datasets import ( + CUSTOM_DATASETS, TorchCustomDataset) from modelscope.utils.constant import Tasks from .data_utils import img2tensor @@ -15,9 +14,9 @@ def default_loader(path): return cv2.imread(path, cv2.IMREAD_COLOR).astype(np.float32) / 255.0 -@TASK_DATASETS.register_module( - Tasks.image_portrait_enhancement, module_name=Datasets.PairedDataset) -class ImagePortraitEnhancementDataset(TorchTaskDataset): +@CUSTOM_DATASETS.register_module( + Tasks.image_portrait_enhancement, module_name=CustomDatasets.PairedDataset) +class ImagePortraitEnhancementDataset(TorchCustomDataset): """Paired image dataset for image portrait enhancement. """ diff --git a/modelscope/msdatasets/task_datasets/image_quality_assessment_degradation/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/image_quality_assessment_degradation/__init__.py similarity index 100% rename from modelscope/msdatasets/task_datasets/image_quality_assessment_degradation/__init__.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/image_quality_assessment_degradation/__init__.py diff --git a/modelscope/msdatasets/task_datasets/image_quality_assessment_degradation/image_quality_assessment_degradation_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/image_quality_assessment_degradation/image_quality_assessment_degradation_dataset.py similarity index 81% rename from modelscope/msdatasets/task_datasets/image_quality_assessment_degradation/image_quality_assessment_degradation_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/image_quality_assessment_degradation/image_quality_assessment_degradation_dataset.py index 75826065..06f0453e 100644 --- a/modelscope/msdatasets/task_datasets/image_quality_assessment_degradation/image_quality_assessment_degradation_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/image_quality_assessment_degradation/image_quality_assessment_degradation_dataset.py @@ -1,21 +1,18 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import cv2 -import numpy as np from torchvision import transforms from modelscope.metainfo import Models -from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS -from modelscope.msdatasets.task_datasets.torch_base_dataset import \ - TorchTaskDataset +from modelscope.msdatasets.dataset_cls.custom_datasets import ( + CUSTOM_DATASETS, TorchCustomDataset) from modelscope.preprocessors import LoadImage from modelscope.utils.constant import Tasks -@TASK_DATASETS.register_module( +@CUSTOM_DATASETS.register_module( Tasks.image_quality_assessment_degradation, module_name=Models.image_quality_assessment_degradation) -class ImageQualityAssessmentDegradationDataset(TorchTaskDataset): +class ImageQualityAssessmentDegradationDataset(TorchCustomDataset): """Paired image dataset for image quality assessment degradation. 
""" diff --git a/modelscope/msdatasets/task_datasets/image_quality_assmessment_mos/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/image_quality_assmessment_mos/__init__.py similarity index 100% rename from modelscope/msdatasets/task_datasets/image_quality_assmessment_mos/__init__.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/image_quality_assmessment_mos/__init__.py diff --git a/modelscope/msdatasets/task_datasets/image_quality_assmessment_mos/image_quality_assessment_mos_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/image_quality_assmessment_mos/image_quality_assessment_mos_dataset.py similarity index 77% rename from modelscope/msdatasets/task_datasets/image_quality_assmessment_mos/image_quality_assessment_mos_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/image_quality_assmessment_mos/image_quality_assessment_mos_dataset.py index 3d8ed297..28c163eb 100644 --- a/modelscope/msdatasets/task_datasets/image_quality_assmessment_mos/image_quality_assessment_mos_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/image_quality_assmessment_mos/image_quality_assessment_mos_dataset.py @@ -1,20 +1,16 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import cv2 -import numpy as np - from modelscope.metainfo import Models -from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS -from modelscope.msdatasets.task_datasets.torch_base_dataset import \ - TorchTaskDataset +from modelscope.msdatasets.dataset_cls.custom_datasets import ( + CUSTOM_DATASETS, TorchCustomDataset) from modelscope.preprocessors.cv import ImageQualityAssessmentMosPreprocessor from modelscope.utils.constant import Tasks -@TASK_DATASETS.register_module( +@CUSTOM_DATASETS.register_module( Tasks.image_quality_assessment_mos, module_name=Models.image_quality_assessment_mos) -class ImageQualityAssessmentMosDataset(TorchTaskDataset): +class ImageQualityAssessmentMosDataset(TorchCustomDataset): """Paired image dataset for image quality assessment mos. """ diff --git a/modelscope/msdatasets/cv/image_semantic_segmentation/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/image_semantic_segmentation/__init__.py similarity index 100% rename from modelscope/msdatasets/cv/image_semantic_segmentation/__init__.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/image_semantic_segmentation/__init__.py diff --git a/modelscope/msdatasets/cv/image_semantic_segmentation/segmentation_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/image_semantic_segmentation/segmentation_dataset.py similarity index 81% rename from modelscope/msdatasets/cv/image_semantic_segmentation/segmentation_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/image_semantic_segmentation/segmentation_dataset.py index b1316e2e..71e7c42b 100644 --- a/modelscope/msdatasets/cv/image_semantic_segmentation/segmentation_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/image_semantic_segmentation/segmentation_dataset.py @@ -1,14 +1,15 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
from easycv.datasets.segmentation import SegDataset as _SegDataset -from modelscope.metainfo import Datasets -from modelscope.msdatasets.cv.easycv_base import EasyCVBaseDataset -from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.metainfo import CustomDatasets +from modelscope.msdatasets.dataset_cls.custom_datasets import CUSTOM_DATASETS +from modelscope.msdatasets.dataset_cls.custom_datasets.easycv_base import \ + EasyCVBaseDataset from modelscope.utils.constant import Tasks -@TASK_DATASETS.register_module( - group_key=Tasks.image_segmentation, module_name=Datasets.SegDataset) +@CUSTOM_DATASETS.register_module( + group_key=Tasks.image_segmentation, module_name=CustomDatasets.SegDataset) class SegDataset(EasyCVBaseDataset, _SegDataset): """EasyCV dataset for Semantic segmentation. For more details, please refer to : diff --git a/modelscope/msdatasets/task_datasets/language_guided_video_summarization_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/language_guided_video_summarization_dataset.py similarity index 94% rename from modelscope/msdatasets/task_datasets/language_guided_video_summarization_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/language_guided_video_summarization_dataset.py index 94313e15..756d0050 100644 --- a/modelscope/msdatasets/task_datasets/language_guided_video_summarization_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/language_guided_video_summarization_dataset.py @@ -25,16 +25,15 @@ import numpy as np import torch from modelscope.metainfo import Models -from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS -from modelscope.msdatasets.task_datasets.torch_base_dataset import \ - TorchTaskDataset +from modelscope.msdatasets.dataset_cls.custom_datasets import ( + CUSTOM_DATASETS, TorchCustomDataset) from modelscope.utils.constant import Tasks -@TASK_DATASETS.register_module( +@CUSTOM_DATASETS.register_module( Tasks.language_guided_video_summarization, module_name=Models.language_guided_video_summarization) -class LanguageGuidedVideoSummarizationDataset(TorchTaskDataset): +class LanguageGuidedVideoSummarizationDataset(TorchCustomDataset): def __init__(self, mode, opt, root_dir): self.mode = mode diff --git a/modelscope/msdatasets/task_datasets/mgeo_ranking_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/mgeo_ranking_dataset.py similarity index 93% rename from modelscope/msdatasets/task_datasets/mgeo_ranking_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/mgeo_ranking_dataset.py index 9adccd7c..536451ae 100644 --- a/modelscope/msdatasets/task_datasets/mgeo_ranking_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/mgeo_ranking_dataset.py @@ -1,24 +1,20 @@ # Copyright (c) Alibaba, Inc. and its affiliates.
import random -from dataclasses import dataclass -from typing import Any, Dict, List, Tuple, Union +from typing import Any, List, Union import json import torch -from datasets import Dataset, IterableDataset, concatenate_datasets from torch.utils.data import ConcatDataset -from transformers import DataCollatorWithPadding from modelscope.metainfo import Models +from modelscope.msdatasets.dataset_cls.custom_datasets import ( + CUSTOM_DATASETS, TorchCustomDataset) from modelscope.utils.constant import ModeKeys, Tasks -from .base import TaskDataset -from .builder import TASK_DATASETS -from .torch_base_dataset import TorchTaskDataset -@TASK_DATASETS.register_module( +@CUSTOM_DATASETS.register_module( group_key=Tasks.text_ranking, module_name=Models.mgeo) -class MGeoRankingDataset(TorchTaskDataset): +class MGeoRankingDataset(TorchCustomDataset): def __init__(self, datasets: Union[Any, List[Any]], diff --git a/modelscope/msdatasets/dataset_cls/custom_datasets/movie_scene_segmentation/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/movie_scene_segmentation/__init__.py new file mode 100644 index 00000000..6157e9e8 --- /dev/null +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/movie_scene_segmentation/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .movie_scene_segmentation_dataset import MovieSceneSegmentationDataset +else: + _import_structure = { + 'movie_scene_segmentation_dataset': ['MovieSceneSegmentationDataset'], + } + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py similarity index 94% rename from modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py index 49991b11..041976dd 100644 --- a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py @@ -10,9 +10,8 @@ import torch from torchvision.datasets.folder import pil_loader from modelscope.metainfo import Models -from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS -from modelscope.msdatasets.task_datasets.torch_base_dataset import \ - TorchTaskDataset +from modelscope.msdatasets.dataset_cls.custom_datasets.builder import \ + CUSTOM_DATASETS from modelscope.utils.constant import Tasks from . import sampler @@ -30,9 +29,9 @@ DATASET_STRUCTURE = { } -@TASK_DATASETS.register_module( - Tasks.movie_scene_segmentation, module_name=Models.resnet50_bert) -class MovieSceneSegmentationDataset(TorchTaskDataset): +@CUSTOM_DATASETS.register_module( + group_key=Tasks.movie_scene_segmentation, module_name=Models.resnet50_bert) +class MovieSceneSegmentationDataset(torch.utils.data.Dataset): """dataset for movie scene segmentation. 
Args: diff --git a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/sampler.py b/modelscope/msdatasets/dataset_cls/custom_datasets/movie_scene_segmentation/sampler.py similarity index 100% rename from modelscope/msdatasets/task_datasets/movie_scene_segmentation/sampler.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/movie_scene_segmentation/sampler.py diff --git a/modelscope/msdatasets/cv/object_detection/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/object_detection/__init__.py similarity index 100% rename from modelscope/msdatasets/cv/object_detection/__init__.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/object_detection/__init__.py diff --git a/modelscope/msdatasets/cv/object_detection/detection_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/object_detection/detection_dataset.py similarity index 85% rename from modelscope/msdatasets/cv/object_detection/detection_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/object_detection/detection_dataset.py index c7e45eea..66c11f64 100644 --- a/modelscope/msdatasets/cv/object_detection/detection_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/object_detection/detection_dataset.py @@ -1,20 +1,21 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import os.path as osp from easycv.datasets.detection import DetDataset as _DetDataset from easycv.datasets.detection import \ DetImagesMixDataset as _DetImagesMixDataset -from modelscope.metainfo import Datasets -from modelscope.msdatasets.cv.easycv_base import EasyCVBaseDataset -from modelscope.msdatasets.task_datasets import TASK_DATASETS +from modelscope.metainfo import CustomDatasets +from modelscope.msdatasets.dataset_cls.custom_datasets import CUSTOM_DATASETS +from modelscope.msdatasets.dataset_cls.custom_datasets.easycv_base import \ + EasyCVBaseDataset from modelscope.utils.constant import Tasks -@TASK_DATASETS.register_module( - group_key=Tasks.image_object_detection, module_name=Datasets.DetDataset) -@TASK_DATASETS.register_module( - group_key=Tasks.image_segmentation, module_name=Datasets.DetDataset) +@CUSTOM_DATASETS.register_module( + group_key=Tasks.image_object_detection, + module_name=CustomDatasets.DetDataset) +@CUSTOM_DATASETS.register_module( + group_key=Tasks.image_segmentation, module_name=CustomDatasets.DetDataset) class DetDataset(EasyCVBaseDataset, _DetDataset): """EasyCV dataset for object detection. For more details, please refer to https://github.com/alibaba/EasyCV/blob/master/easycv/datasets/detection/raw.py . @@ -47,12 +48,12 @@ class DetDataset(EasyCVBaseDataset, _DetDataset): _DetDataset.__init__(self, *args, **kwargs) -@TASK_DATASETS.register_module( +@CUSTOM_DATASETS.register_module( group_key=Tasks.image_object_detection, - module_name=Datasets.DetImagesMixDataset) -@TASK_DATASETS.register_module( + module_name=CustomDatasets.DetImagesMixDataset) +@CUSTOM_DATASETS.register_module( group_key=Tasks.domain_specific_object_detection, - module_name=Datasets.DetImagesMixDataset) + module_name=CustomDatasets.DetImagesMixDataset) class DetImagesMixDataset(EasyCVBaseDataset, _DetImagesMixDataset): """EasyCV dataset for object detection, a wrapper of multiple images mixed dataset. 
Suitable for training on multiple images mixed data augmentation like diff --git a/modelscope/msdatasets/task_datasets/ocr_detection/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/__init__.py similarity index 78% rename from modelscope/msdatasets/task_datasets/ocr_detection/__init__.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/__init__.py index 5afd1ded..6a3847b9 100644 --- a/modelscope/msdatasets/task_datasets/ocr_detection/__init__.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/__init__.py @@ -1,3 +1,4 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from .data_loader import DataLoader from .image_dataset import ImageDataset +from .measures import QuadMeasurer diff --git a/modelscope/msdatasets/task_datasets/ocr_detection/augmenter.py b/modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/augmenter.py similarity index 100% rename from modelscope/msdatasets/task_datasets/ocr_detection/augmenter.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/augmenter.py diff --git a/modelscope/msdatasets/task_datasets/ocr_detection/data_loader.py b/modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/data_loader.py similarity index 100% rename from modelscope/msdatasets/task_datasets/ocr_detection/data_loader.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/data_loader.py diff --git a/modelscope/msdatasets/task_datasets/ocr_detection/image_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/image_dataset.py similarity index 100% rename from modelscope/msdatasets/task_datasets/ocr_detection/image_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/image_dataset.py diff --git a/modelscope/msdatasets/task_datasets/ocr_detection/measures/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/measures/__init__.py similarity index 100% rename from modelscope/msdatasets/task_datasets/ocr_detection/measures/__init__.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/measures/__init__.py diff --git a/modelscope/msdatasets/task_datasets/ocr_detection/measures/iou_evaluator.py b/modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/measures/iou_evaluator.py similarity index 100% rename from modelscope/msdatasets/task_datasets/ocr_detection/measures/iou_evaluator.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/measures/iou_evaluator.py diff --git a/modelscope/msdatasets/task_datasets/ocr_detection/measures/quad_measurer.py b/modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/measures/quad_measurer.py similarity index 100% rename from modelscope/msdatasets/task_datasets/ocr_detection/measures/quad_measurer.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/measures/quad_measurer.py diff --git a/modelscope/msdatasets/task_datasets/ocr_detection/processes/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/processes/__init__.py similarity index 100% rename from modelscope/msdatasets/task_datasets/ocr_detection/processes/__init__.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/processes/__init__.py diff --git a/modelscope/msdatasets/task_datasets/ocr_detection/processes/augment_data.py b/modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/processes/augment_data.py similarity index 100% rename from 
modelscope/msdatasets/task_datasets/ocr_detection/processes/augment_data.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/processes/augment_data.py diff --git a/modelscope/msdatasets/task_datasets/ocr_detection/processes/data_process.py b/modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/processes/data_process.py similarity index 100% rename from modelscope/msdatasets/task_datasets/ocr_detection/processes/data_process.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/processes/data_process.py diff --git a/modelscope/msdatasets/task_datasets/ocr_detection/processes/make_border_map.py b/modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/processes/make_border_map.py similarity index 100% rename from modelscope/msdatasets/task_datasets/ocr_detection/processes/make_border_map.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/processes/make_border_map.py diff --git a/modelscope/msdatasets/task_datasets/ocr_detection/processes/make_icdar_data.py b/modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/processes/make_icdar_data.py similarity index 100% rename from modelscope/msdatasets/task_datasets/ocr_detection/processes/make_icdar_data.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/processes/make_icdar_data.py diff --git a/modelscope/msdatasets/task_datasets/ocr_detection/processes/make_seg_detection_data.py b/modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/processes/make_seg_detection_data.py similarity index 100% rename from modelscope/msdatasets/task_datasets/ocr_detection/processes/make_seg_detection_data.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/processes/make_seg_detection_data.py diff --git a/modelscope/msdatasets/task_datasets/ocr_detection/processes/normalize_image.py b/modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/processes/normalize_image.py similarity index 100% rename from modelscope/msdatasets/task_datasets/ocr_detection/processes/normalize_image.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/processes/normalize_image.py diff --git a/modelscope/msdatasets/task_datasets/ocr_detection/processes/random_crop_data.py b/modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/processes/random_crop_data.py similarity index 100% rename from modelscope/msdatasets/task_datasets/ocr_detection/processes/random_crop_data.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/ocr_detection/processes/random_crop_data.py diff --git a/modelscope/msdatasets/task_datasets/ocr_recognition_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/ocr_recognition_dataset.py similarity index 87% rename from modelscope/msdatasets/task_datasets/ocr_recognition_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/ocr_recognition_dataset.py index 8be657f0..bc9cd3ca 100644 --- a/modelscope/msdatasets/task_datasets/ocr_recognition_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/ocr_recognition_dataset.py @@ -9,9 +9,10 @@ import torch from PIL import Image from modelscope.metainfo import Models -from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS -from modelscope.msdatasets.task_datasets.torch_base_dataset import \ - TorchTaskDataset +from modelscope.msdatasets.dataset_cls.custom_datasets.builder import \ + CUSTOM_DATASETS +from 
modelscope.msdatasets.dataset_cls.custom_datasets.torch_custom_dataset import \ + TorchCustomDataset from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger @@ -29,9 +30,9 @@ def Q2B(uchar): return chr(inside_code) -@TASK_DATASETS.register_module( +@CUSTOM_DATASETS.register_module( Tasks.ocr_recognition, module_name=Models.ocr_recognition) -class OCRRecognitionDataset(TorchTaskDataset): +class OCRRecognitionDataset(TorchCustomDataset): def __init__(self, **kwargs): split_config = kwargs['split_config'] diff --git a/modelscope/msdatasets/task_datasets/reds_image_deblurring_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/reds_image_deblurring_dataset.py similarity index 74% rename from modelscope/msdatasets/task_datasets/reds_image_deblurring_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/reds_image_deblurring_dataset.py index 17b731bc..b03c1d06 100644 --- a/modelscope/msdatasets/task_datasets/reds_image_deblurring_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/reds_image_deblurring_dataset.py @@ -3,14 +3,13 @@ import cv2 import numpy as np -from modelscope.metainfo import Datasets -from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS -from modelscope.msdatasets.task_datasets.sidd_image_denoising.data_utils import ( +from modelscope.metainfo import CustomDatasets +from modelscope.msdatasets.dataset_cls.custom_datasets import ( + CUSTOM_DATASETS, TorchCustomDataset) +from modelscope.msdatasets.dataset_cls.custom_datasets.sidd_image_denoising.data_utils import ( img2tensor, padding) -from modelscope.msdatasets.task_datasets.sidd_image_denoising.transforms import ( +from modelscope.msdatasets.dataset_cls.custom_datasets.sidd_image_denoising.transforms import ( augment, paired_random_crop) -from modelscope.msdatasets.task_datasets.torch_base_dataset import \ - TorchTaskDataset from modelscope.utils.constant import Tasks @@ -18,9 +17,9 @@ def default_loader(path): return cv2.imread(path, cv2.IMREAD_UNCHANGED).astype(np.float32) / 255.0 -@TASK_DATASETS.register_module( - Tasks.image_deblurring, module_name=Datasets.PairedDataset) -class RedsImageDeblurringDataset(TorchTaskDataset): +@CUSTOM_DATASETS.register_module( + Tasks.image_deblurring, module_name=CustomDatasets.PairedDataset) +class RedsImageDeblurringDataset(TorchCustomDataset): """Paired image dataset for image restoration. """ diff --git a/modelscope/msdatasets/dataset_cls/custom_datasets/referring_video_object_segmentation/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/referring_video_object_segmentation/__init__.py new file mode 100644 index 00000000..7349e494 --- /dev/null +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/referring_video_object_segmentation/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .referring_video_object_segmentation_dataset import ReferringVideoObjectSegmentationDataset +else: + _import_structure = { + 'referring_video_object_segmentation_dataset': + ['ReferringVideoObjectSegmentationDataset'], + } + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/msdatasets/task_datasets/referring_video_object_segmentation/referring_video_object_segmentation_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/referring_video_object_segmentation/referring_video_object_segmentation_dataset.py similarity index 98% rename from modelscope/msdatasets/task_datasets/referring_video_object_segmentation/referring_video_object_segmentation_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/referring_video_object_segmentation/referring_video_object_segmentation_dataset.py index 8b6d22a4..4493fd96 100644 --- a/modelscope/msdatasets/task_datasets/referring_video_object_segmentation/referring_video_object_segmentation_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/referring_video_object_segmentation/referring_video_object_segmentation_dataset.py @@ -18,9 +18,8 @@ from tqdm import tqdm from modelscope.metainfo import Models from modelscope.models.cv.referring_video_object_segmentation.utils import \ nested_tensor_from_videos_list -from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS -from modelscope.msdatasets.task_datasets.torch_base_dataset import \ - TorchTaskDataset +from modelscope.msdatasets.dataset_cls.custom_datasets import ( + CUSTOM_DATASETS, TorchCustomDataset) from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger from .
import transformers as T @@ -33,10 +32,10 @@ def get_image_id(video_id, frame_idx, ref_instance_a2d_id): return image_id -@TASK_DATASETS.register_module( +@CUSTOM_DATASETS.register_module( Tasks.referring_video_object_segmentation, module_name=Models.referring_video_object_segmentation) -class ReferringVideoObjectSegmentationDataset(TorchTaskDataset): +class ReferringVideoObjectSegmentationDataset(TorchCustomDataset): def __init__(self, **kwargs): split_config = kwargs['split_config'] diff --git a/modelscope/msdatasets/task_datasets/referring_video_object_segmentation/transformers.py b/modelscope/msdatasets/dataset_cls/custom_datasets/referring_video_object_segmentation/transformers.py similarity index 100% rename from modelscope/msdatasets/task_datasets/referring_video_object_segmentation/transformers.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/referring_video_object_segmentation/transformers.py diff --git a/modelscope/msdatasets/task_datasets/sidd_image_denoising/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/sidd_image_denoising/__init__.py similarity index 100% rename from modelscope/msdatasets/task_datasets/sidd_image_denoising/__init__.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/sidd_image_denoising/__init__.py diff --git a/modelscope/msdatasets/task_datasets/sidd_image_denoising/data_utils.py b/modelscope/msdatasets/dataset_cls/custom_datasets/sidd_image_denoising/data_utils.py similarity index 100% rename from modelscope/msdatasets/task_datasets/sidd_image_denoising/data_utils.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/sidd_image_denoising/data_utils.py diff --git a/modelscope/msdatasets/task_datasets/sidd_image_denoising/sidd_image_denoising_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/sidd_image_denoising/sidd_image_denoising_dataset.py similarity index 87% rename from modelscope/msdatasets/task_datasets/sidd_image_denoising/sidd_image_denoising_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/sidd_image_denoising/sidd_image_denoising_dataset.py index 3f0cdae0..9369b991 100644 --- a/modelscope/msdatasets/task_datasets/sidd_image_denoising/sidd_image_denoising_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/sidd_image_denoising/sidd_image_denoising_dataset.py @@ -4,9 +4,8 @@ import cv2 import numpy as np from modelscope.metainfo import Models -from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS -from modelscope.msdatasets.task_datasets.torch_base_dataset import \ - TorchTaskDataset +from modelscope.msdatasets.dataset_cls.custom_datasets import ( + CUSTOM_DATASETS, TorchCustomDataset) from modelscope.utils.constant import Tasks from .data_utils import img2tensor, padding from .transforms import augment, paired_random_crop @@ -16,9 +15,9 @@ def default_loader(path): return cv2.imread(path, cv2.IMREAD_UNCHANGED).astype(np.float32) / 255.0 -@TASK_DATASETS.register_module( +@CUSTOM_DATASETS.register_module( Tasks.image_denoising, module_name=Models.nafnet) -class SiddImageDenoisingDataset(TorchTaskDataset): +class SiddImageDenoisingDataset(TorchCustomDataset): """Paired image dataset for image restoration. 
""" diff --git a/modelscope/msdatasets/task_datasets/sidd_image_denoising/transforms.py b/modelscope/msdatasets/dataset_cls/custom_datasets/sidd_image_denoising/transforms.py similarity index 100% rename from modelscope/msdatasets/task_datasets/sidd_image_denoising/transforms.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/sidd_image_denoising/transforms.py diff --git a/modelscope/msdatasets/task_datasets/text_ranking_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/text_ranking_dataset.py similarity index 92% rename from modelscope/msdatasets/task_datasets/text_ranking_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/text_ranking_dataset.py index 19f07110..46c64bbf 100644 --- a/modelscope/msdatasets/task_datasets/text_ranking_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/text_ranking_dataset.py @@ -1,25 +1,21 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import random -from dataclasses import dataclass -from typing import Any, Dict, List, Tuple, Union +from typing import Any, List, Union import torch -from datasets import Dataset, IterableDataset, concatenate_datasets from torch.utils.data import ConcatDataset -from transformers import DataCollatorWithPadding from modelscope.metainfo import Models +from modelscope.msdatasets.dataset_cls.custom_datasets import ( + CUSTOM_DATASETS, TorchCustomDataset) from modelscope.utils.constant import ModeKeys, Tasks -from .base import TaskDataset -from .builder import TASK_DATASETS -from .torch_base_dataset import TorchTaskDataset -@TASK_DATASETS.register_module( +@CUSTOM_DATASETS.register_module( group_key=Tasks.text_ranking, module_name=Models.bert) -@TASK_DATASETS.register_module( +@CUSTOM_DATASETS.register_module( group_key=Tasks.sentence_embedding, module_name=Models.bert) -class TextRankingDataset(TorchTaskDataset): +class TextRankingDataset(TorchCustomDataset): def __init__(self, datasets: Union[Any, List[Any]], diff --git a/modelscope/msdatasets/dataset_cls/custom_datasets/torch_custom_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/torch_custom_dataset.py new file mode 100644 index 00000000..54ad55b7 --- /dev/null +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/torch_custom_dataset.py @@ -0,0 +1,51 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, List, Union + +import torch.utils.data +from torch.utils.data import ConcatDataset as TorchConcatDataset + +from modelscope.utils.constant import ModeKeys + + +class TorchCustomDataset(torch.utils.data.Dataset): + """The custom dataset base class for all the torch-based task processors. + """ + + def __init__(self, + datasets: Union[Any, List[Any]], + mode=ModeKeys.TRAIN, + preprocessor=None, + **kwargs): + self.trainer = None + self.mode = mode + self.preprocessor = preprocessor + self._inner_dataset = self.prepare_dataset(datasets) + + def __getitem__(self, index) -> Any: + return self.preprocessor( + self._inner_dataset[index] + ) if self.preprocessor else self._inner_dataset[index] + + def __len__(self): + return len(self._inner_dataset) + + def prepare_dataset(self, datasets: Union[Any, List[Any]]) -> Any: + """Prepare a dataset. + + User can process the input datasets in a whole dataset perspective. + This method gives a default implementation of datasets merging, user can override this + method to write custom logics. + + Args: + datasets: The original dataset(s) + + Returns: A single dataset, which may be created after merging. 
+ + """ + if isinstance(datasets, List): + if len(datasets) == 1: + return datasets[0] + elif len(datasets) > 1: + return TorchConcatDataset(datasets) + else: + return datasets diff --git a/modelscope/msdatasets/task_datasets/veco_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/veco_dataset.py similarity index 91% rename from modelscope/msdatasets/task_datasets/veco_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/veco_dataset.py index df7c6483..047849bc 100644 --- a/modelscope/msdatasets/task_datasets/veco_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/veco_dataset.py @@ -5,13 +5,13 @@ import numpy as np from datasets import Dataset, IterableDataset, concatenate_datasets from modelscope.metainfo import Models +from modelscope.msdatasets.dataset_cls.custom_datasets import ( + CUSTOM_DATASETS, TorchCustomDataset) from modelscope.utils.constant import Tasks -from .builder import TASK_DATASETS -from .torch_base_dataset import TorchTaskDataset -@TASK_DATASETS.register_module(module_name=Models.veco, group_key=Tasks.nli) -class VecoDataset(TorchTaskDataset): +@CUSTOM_DATASETS.register_module(module_name=Models.veco, group_key=Tasks.nli) +class VecoDataset(TorchCustomDataset): def __init__(self, datasets: Union[Any, List[Any]], diff --git a/modelscope/msdatasets/task_datasets/video_frame_interpolation/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/video_frame_interpolation/__init__.py similarity index 100% rename from modelscope/msdatasets/task_datasets/video_frame_interpolation/__init__.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/video_frame_interpolation/__init__.py diff --git a/modelscope/msdatasets/task_datasets/video_frame_interpolation/data_utils.py b/modelscope/msdatasets/dataset_cls/custom_datasets/video_frame_interpolation/data_utils.py similarity index 100% rename from modelscope/msdatasets/task_datasets/video_frame_interpolation/data_utils.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/video_frame_interpolation/data_utils.py diff --git a/modelscope/msdatasets/task_datasets/video_frame_interpolation/video_frame_interpolation_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/video_frame_interpolation/video_frame_interpolation_dataset.py similarity index 79% rename from modelscope/msdatasets/task_datasets/video_frame_interpolation/video_frame_interpolation_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/video_frame_interpolation/video_frame_interpolation_dataset.py index 44b965a7..6f47906d 100644 --- a/modelscope/msdatasets/task_datasets/video_frame_interpolation/video_frame_interpolation_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/video_frame_interpolation/video_frame_interpolation_dataset.py @@ -1,16 +1,13 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-from collections import defaultdict - import cv2 import numpy as np import torch from modelscope.metainfo import Models -from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS -from modelscope.msdatasets.task_datasets.torch_base_dataset import \ - TorchTaskDataset -from modelscope.msdatasets.task_datasets.video_frame_interpolation.data_utils import ( +from modelscope.msdatasets.dataset_cls.custom_datasets import ( + CUSTOM_DATASETS, TorchCustomDataset) +from modelscope.msdatasets.dataset_cls.custom_datasets.video_frame_interpolation.data_utils import ( img2tensor, img_padding) from modelscope.utils.constant import Tasks @@ -19,10 +16,10 @@ def default_loader(path): return cv2.imread(path, cv2.IMREAD_UNCHANGED).astype(np.float32) -@TASK_DATASETS.register_module( +@CUSTOM_DATASETS.register_module( Tasks.video_frame_interpolation, module_name=Models.video_frame_interpolation) -class VideoFrameInterpolationDataset(TorchTaskDataset): +class VideoFrameInterpolationDataset(TorchCustomDataset): """Dataset for video frame-interpolation. """ diff --git a/modelscope/msdatasets/task_datasets/video_stabilization/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/video_stabilization/__init__.py similarity index 100% rename from modelscope/msdatasets/task_datasets/video_stabilization/__init__.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/video_stabilization/__init__.py diff --git a/modelscope/msdatasets/task_datasets/video_stabilization/video_stabilization_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/video_stabilization/video_stabilization_dataset.py similarity index 71% rename from modelscope/msdatasets/task_datasets/video_stabilization/video_stabilization_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/video_stabilization/video_stabilization_dataset.py index b0e6bdef..a0e0604c 100644 --- a/modelscope/msdatasets/task_datasets/video_stabilization/video_stabilization_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/video_stabilization/video_stabilization_dataset.py @@ -1,15 +1,14 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from modelscope.metainfo import Models -from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS -from modelscope.msdatasets.task_datasets.torch_base_dataset import \ - TorchTaskDataset +from modelscope.msdatasets.dataset_cls.custom_datasets import ( + CUSTOM_DATASETS, TorchCustomDataset) from modelscope.utils.constant import Tasks -@TASK_DATASETS.register_module( +@CUSTOM_DATASETS.register_module( Tasks.video_stabilization, module_name=Models.video_stabilization) -class VideoStabilizationDataset(TorchTaskDataset): +class VideoStabilizationDataset(TorchCustomDataset): """Paired video dataset for video stabilization. 
""" diff --git a/modelscope/msdatasets/task_datasets/video_summarization_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/video_summarization_dataset.py similarity index 94% rename from modelscope/msdatasets/task_datasets/video_summarization_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/video_summarization_dataset.py index 02639be8..4d6e0155 100644 --- a/modelscope/msdatasets/task_datasets/video_summarization_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/video_summarization_dataset.py @@ -8,11 +8,11 @@ import json import numpy as np import torch -from modelscope.msdatasets.task_datasets.torch_base_dataset import \ - TorchTaskDataset +from modelscope.msdatasets.dataset_cls.custom_datasets import \ + TorchCustomDataset -class VideoSummarizationDataset(TorchTaskDataset): +class VideoSummarizationDataset(TorchCustomDataset): def __init__(self, mode, opt, root_dir): self.mode = mode diff --git a/modelscope/msdatasets/task_datasets/video_super_resolution/__init__.py b/modelscope/msdatasets/dataset_cls/custom_datasets/video_super_resolution/__init__.py similarity index 100% rename from modelscope/msdatasets/task_datasets/video_super_resolution/__init__.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/video_super_resolution/__init__.py diff --git a/modelscope/msdatasets/task_datasets/video_super_resolution/video_super_resolution_dataset.py b/modelscope/msdatasets/dataset_cls/custom_datasets/video_super_resolution/video_super_resolution_dataset.py similarity index 89% rename from modelscope/msdatasets/task_datasets/video_super_resolution/video_super_resolution_dataset.py rename to modelscope/msdatasets/dataset_cls/custom_datasets/video_super_resolution/video_super_resolution_dataset.py index 69faa527..86e07db1 100644 --- a/modelscope/msdatasets/task_datasets/video_super_resolution/video_super_resolution_dataset.py +++ b/modelscope/msdatasets/dataset_cls/custom_datasets/video_super_resolution/video_super_resolution_dataset.py @@ -7,9 +7,8 @@ import numpy as np import torch from modelscope.metainfo import Models -from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS -from modelscope.msdatasets.task_datasets.torch_base_dataset import \ - TorchTaskDataset +from modelscope.msdatasets.dataset_cls.custom_datasets import ( + CUSTOM_DATASETS, TorchCustomDataset) from modelscope.utils.constant import Tasks @@ -42,9 +41,9 @@ def img2tensor(imgs, bgr2rgb=True, float32=True): return _totensor(imgs, bgr2rgb, float32) -@TASK_DATASETS.register_module( +@CUSTOM_DATASETS.register_module( Tasks.video_super_resolution, module_name=Models.real_basicvsr) -class VideoSuperResolutionDataset(TorchTaskDataset): +class VideoSuperResolutionDataset(TorchCustomDataset): """single video dataset for video super-resolution. 
""" diff --git a/modelscope/msdatasets/dataset_cls/dataset.py b/modelscope/msdatasets/dataset_cls/dataset.py index 57ee8150..4acf51b1 100644 --- a/modelscope/msdatasets/dataset_cls/dataset.py +++ b/modelscope/msdatasets/dataset_cls/dataset.py @@ -14,15 +14,19 @@ logger = get_logger() class ExternalDataset(object): + """Dataset class for custom datasets.""" def __init__(self, split_path_dict, config_kwargs): self.split_path_dict = split_path_dict self.config_kwargs = copy.deepcopy(config_kwargs) self.config_kwargs.update({'split_config': split_path_dict}) - self.ext_dataset = None + # dataset for specific extensions + self.spec_extension_dataset = None self.split_data_files = {k: [] for k, _ in split_path_dict.items()} - file_ext = '' + self.custom_map = {} + # the extension of file + file_ext = '' for split_name, split_dir in split_path_dict.items(): if isinstance(split_dir, str) and os.path.isdir(split_dir): split_file_names = os.listdir(split_dir) @@ -52,25 +56,27 @@ class ExternalDataset(object): if file_ext: file_ext = EXTENSIONS_TO_LOAD.get(file_ext) - self.ext_dataset = datasets.load_dataset( + self.spec_extension_dataset = datasets.load_dataset( file_ext, data_files=self.split_data_files, **config_kwargs) def __len__(self): - return len(self.split_path_dict - ) if not self.ext_dataset else self.ext_dataset.__len__() + return len( + self.split_path_dict + ) if not self.spec_extension_dataset else self.spec_extension_dataset.__len__( + ) def __getitem__(self, item): - if not self.ext_dataset: + if not self.spec_extension_dataset: return self.split_path_dict.get(item) else: - return self.ext_dataset.__getitem__(item) + return self.spec_extension_dataset.__getitem__(item) def __iter__(self): - if not self.ext_dataset: + if not self.spec_extension_dataset: for k, v in self.split_path_dict.items(): yield k, v else: - for k, v in self.ext_dataset.items(): + for k, v in self.spec_extension_dataset.items(): yield k, v @@ -99,3 +105,6 @@ class NativeIterableDataset(IterableDataset): entity = ret yield entity + + def __len__(self): + return 1 diff --git a/modelscope/msdatasets/meta/data_meta_config.py b/modelscope/msdatasets/meta/data_meta_config.py index 401a8e14..7f97108b 100644 --- a/modelscope/msdatasets/meta/data_meta_config.py +++ b/modelscope/msdatasets/meta/data_meta_config.py @@ -2,7 +2,35 @@ class DataMetaConfig(object): - """Modelscope data-meta config class.""" + """Modelscope data-meta config class. + + Attributes: + dataset_scripts(str): The local path of dataset scripts. + dataset_formation(:obj:`enum.Enum`): Dataset formation, refer to modelscope.utils.constant.DatasetFormations. + meta_cache_dir(str): Meta cache path. + meta_data_files(dict): Meta data mapping, Example: {'test': 'https://xxx/mytest.csv'} + zip_data_files(dict): Data files mapping, Example: {'test': 'pictures.zip'} + meta_args_map(dict): Meta arguments mapping, Example: {'test': {'file': 'pictures.zip'}, ...} + target_dataset_structure(dict): Dataset Structure, like + { + "default":{ + "train":{ + "meta":"my_train.csv", + "file":"pictures.zip" + } + }, + "subsetA":{ + "test":{ + "meta":"mytest.csv", + "file":"pictures.zip" + } + } + } + dataset_py_script(str): The python script path of dataset. 
+ meta_type_map(dict): The custom dataset mapping in meta data, + Example: {"type": "MovieSceneSegmentationCustomDataset", + "preprocessor": "movie-scene-segmentation-preprocessor"} + """ def __init__(self): self.dataset_scripts = None @@ -13,3 +41,4 @@ class DataMetaConfig(object): self.meta_args_map = None self.target_dataset_structure = None self.dataset_py_script = None + self.meta_type_map = {} diff --git a/modelscope/msdatasets/meta/data_meta_manager.py b/modelscope/msdatasets/meta/data_meta_manager.py index bba46e84..d90b8d5e 100644 --- a/modelscope/msdatasets/meta/data_meta_manager.py +++ b/modelscope/msdatasets/meta/data_meta_manager.py @@ -75,7 +75,7 @@ class DataMetaManager(object): elif download_mode == DownloadMode.FORCE_REDOWNLOAD: # Clean meta-files if os.path.exists(meta_cache_dir) and os.listdir(meta_cache_dir): - shutil.rmtree(meta_cache_dir) + shutil.rmtree(meta_cache_dir, ignore_errors=True) # Re-download meta-files with FileLock(lock_file=lock_file_path): os.makedirs(meta_cache_dir, exist_ok=True) @@ -129,12 +129,13 @@ class DataMetaManager(object): else: target_subset_name, target_dataset_structure = get_target_dataset_structure( dataset_json, subset_name, split) - meta_map, file_map, args_map = get_dataset_files( + meta_map, file_map, args_map, type_map = get_dataset_files( target_dataset_structure, dataset_name, namespace, version) data_meta_config.meta_data_files = meta_map data_meta_config.zip_data_files = file_map data_meta_config.meta_args_map = args_map + data_meta_config.meta_type_map = type_map data_meta_config.target_dataset_structure = target_dataset_structure self.dataset_context_config.data_meta_config = data_meta_config diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index f1c40e12..06f47874 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -16,19 +16,27 @@ from modelscope.msdatasets.context.dataset_context_config import \ from modelscope.msdatasets.data_loader.data_loader_manager import ( LocalDataLoaderManager, LocalDataLoaderType, RemoteDataLoaderManager, RemoteDataLoaderType) +from modelscope.msdatasets.dataset_cls.custom_datasets.builder import \ + build_custom_dataset from modelscope.msdatasets.dataset_cls.dataset import (ExternalDataset, NativeIterableDataset) -from modelscope.msdatasets.task_datasets.builder import build_task_dataset from modelscope.msdatasets.utils.delete_utils import DatasetDeleteManager from modelscope.msdatasets.utils.upload_utils import DatasetUploadManager -from modelscope.utils.config import ConfigDict +from modelscope.preprocessors import build_preprocessor +from modelscope.utils.config import Config, ConfigDict from modelscope.utils.config_ds import MS_DATASETS_CACHE from modelscope.utils.constant import (DEFAULT_DATASET_NAMESPACE, - DEFAULT_DATASET_REVISION, DownloadMode, - Hubs, UploadMode) + DEFAULT_DATASET_REVISION, ConfigFields, + DownloadMode, Hubs, ModeKeys, Tasks, + UploadMode) from modelscope.utils.import_utils import is_tf_available, is_torch_available from modelscope.utils.logger import get_logger +try: + from tensorflow.data import Dataset as TfDataset +except Exception as e: + print(e) + logger = get_logger() @@ -53,6 +61,7 @@ class MsDataset: """ # the underlying huggingface Dataset _hf_ds = None + _dataset_context_config: DatasetContextConfig = None def __init__(self, ds_instance: Union[Dataset, IterableDataset, ExternalDataset], @@ -63,6 +72,7 @@ class MsDataset: f'"target" must be a column of the 
dataset({list(self._hf_ds.features.keys())}, but got {target}' ) self.target = target + self.is_custom = False def __iter__(self): for item in self._hf_ds: @@ -77,10 +87,10 @@ def __len__(self): if isinstance(self._hf_ds, IterableDataset) or isinstance( self._hf_ds, NativeIterableDataset): - logger.error( - f'object of type `{self._hf_ds.__class__.__name__}` has no __len__()' + logger.warning( + f'object of type `{self._hf_ds.__class__.__name__}` has default length 1' ) - return None + return 1 return len(self._hf_ds) @property @@ -163,6 +173,7 @@ REUSE_DATASET_IF_EXISTS, cache_dir: Optional[str] = MS_DATASETS_CACHE, use_streaming: Optional[bool] = False, + custom_cfg: Optional[Config] = Config(), **config_kwargs, ) -> Union[dict, 'MsDataset', NativeIterableDataset]: """Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset. @@ -191,6 +202,8 @@ use_streaming (bool, Optional): If set to True, no need to download all data files. Instead, it streams the data progressively, and returns NativeIterableDataset or a dict of NativeIterableDataset. + custom_cfg (Config, Optional): Model configuration; this can be used for custom datasets. + See https://modelscope.cn/docs/Configuration%E8%AF%A6%E8%A7%A3 **config_kwargs (additional keyword arguments): Keyword arguments to be passed Returns: @@ -245,305 +258,44 @@ dataset_inst = LocalDataLoaderManager( dataset_context_config).load_dataset( LocalDataLoaderType.HF_DATA_LOADER) - return MsDataset.to_ms_dataset(dataset_inst, target=target) + dataset_inst = MsDataset.to_ms_dataset(dataset_inst, target=target) + if isinstance(dataset_inst, MsDataset): + dataset_inst._dataset_context_config = dataset_context_config + if custom_cfg: + dataset_inst.to_custom_dataset( + custom_cfg=custom_cfg, **config_kwargs) + dataset_inst.is_custom = True + return dataset_inst # Load from the huggingface hub elif hub == Hubs.huggingface: dataset_inst = RemoteDataLoaderManager( dataset_context_config).load_dataset( RemoteDataLoaderType.HF_DATA_LOADER) - return MsDataset.to_ms_dataset(dataset_inst, target=target) + dataset_inst = MsDataset.to_ms_dataset(dataset_inst, target=target) + dataset_inst._dataset_context_config = dataset_context_config + if custom_cfg: + dataset_inst.to_custom_dataset( + custom_cfg=custom_cfg, **config_kwargs) + dataset_inst.is_custom = True + return dataset_inst # Load from the modelscope hub elif hub == Hubs.modelscope: - dataset_inst = RemoteDataLoaderManager( - dataset_context_config).load_dataset( - RemoteDataLoaderType.MS_DATA_LOADER) - return MsDataset.to_ms_dataset(dataset_inst, target=target) + remote_dataloader_manager = RemoteDataLoaderManager( + dataset_context_config) + dataset_inst = remote_dataloader_manager.load_dataset( + RemoteDataLoaderType.MS_DATA_LOADER) + dataset_inst = MsDataset.to_ms_dataset(dataset_inst, target=target) + if isinstance(dataset_inst, MsDataset): + dataset_inst._dataset_context_config = remote_dataloader_manager.dataset_context_config + if custom_cfg: + dataset_inst.to_custom_dataset( + custom_cfg=custom_cfg, **config_kwargs) + dataset_inst.is_custom = True + return dataset_inst else: raise 'Please adjust input args to specify a loading mode, we support following scenes: ' \ 'loading from local disk, huggingface hub and modelscope hub.'
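Note on the hunk above: MsDataset.load() is now the single entry point for custom datasets; when custom_cfg is passed, the loaded instance is converted in place through to_custom_dataset() and flagged with is_custom = True. A minimal usage sketch of that flow follows; the dataset id and configuration fields are illustrative placeholders, not part of this patch:

    from modelscope.msdatasets import MsDataset
    from modelscope.utils.config import Config

    # Hypothetical model configuration; in practice this is the model's
    # configuration.json, which names a registered custom dataset type.
    custom_cfg = Config({
        'task': 'movie-scene-segmentation',
        'dataset': {'type': 'MovieSceneSegmentationDataset'},
    })

    # With custom_cfg given, load() calls to_custom_dataset() internally,
    # so the returned object can be handed to the trainer directly.
    ds = MsDataset.load(
        'movie_scene_seg_toydata',  # placeholder dataset id
        split='train',
        custom_cfg=custom_cfg)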
- def to_torch_dataset_with_processors( - self, - preprocessors: Union[Callable, List[Callable]], - columns: Union[str, List[str]] = None, - to_tensor: bool = True, - ): - import torch - preprocessor_list = preprocessors if isinstance( - preprocessors, list) else [preprocessors] - - columns = format_list(columns) - - columns = [ - key for key in self._hf_ds.features.keys() if key in columns - ] - retained_columns = [] - if to_tensor: - sample = next(iter(self._hf_ds)) - - sample_res = {k: np.array(sample[k]) for k in columns} - for processor in preprocessor_list: - sample_res.update( - {k: np.array(v) - for k, v in processor(sample).items()}) - - def is_numpy_number(value): - return np.issubdtype(value.dtype, np.integer) or np.issubdtype( - value.dtype, np.floating) - - for k in sample_res.keys(): - if not is_numpy_number(sample_res[k]): - logger.warning( - f'Data of column {k} is non-numeric, will be removed') - continue - retained_columns.append(k) - - class MsMapDataset(torch.utils.data.Dataset): - - def __init__(self, dataset: Iterable, preprocessor_list, - retained_columns, columns, to_tensor): - super(MsDataset).__init__() - self.dataset = dataset - self.preprocessor_list = preprocessor_list - self.to_tensor = to_tensor - self.retained_columns = retained_columns - self.columns = columns - - def __len__(self): - return len(self.dataset) - - def type_converter(self, x): - import torch - if self.to_tensor and not isinstance(x, torch.Tensor): - return torch.tensor(x) - else: - return x - - def __getitem__(self, index): - item_dict = self.dataset[index] - res = { - k: self.type_converter(item_dict[k]) - for k in self.columns - if (not self.to_tensor) or k in self.retained_columns - } - for preprocessor in self.preprocessor_list: - res.update({ - k: self.type_converter(v) - for k, v in preprocessor(item_dict).items() - if (not self.to_tensor) or k in self.retained_columns - }) - return res - - return MsMapDataset(self._hf_ds, preprocessor_list, retained_columns, - columns, to_tensor) - - def to_torch_dataset( - self, - columns: Union[str, List[str]] = None, - preprocessors: Union[Callable, List[Callable]] = None, - task_name: str = None, - task_data_config: ConfigDict = None, - to_tensor: bool = True, - **format_kwargs, - ): - """Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to - torch.utils.data.DataLoader. - - Args: - preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process - every sample of the dataset. The output type of processors is dict, and each (numeric) field of the dict - will be used as a field of torch.utils.data.Dataset. - columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only if - `to_tensor` is True). If the preprocessor is None, the arg columns must have at least one column. - If the `preprocessors` is not None, the output fields of processors will also be added. - task_name (str, default None): task name, refer to :obj:`Tasks` for more details - task_data_config (ConfigDict, default None): config dict for model object. - to_tensor (bool, default None): whether convert the data types of dataset column(s) to torch.tensor or not. - format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`. 
- - Returns: - :class:`tf.data.Dataset` - - """ - if not is_torch_available(): - raise ImportError( - 'The function to_torch_dataset requires pytorch to be installed' - ) - if isinstance(self._hf_ds, ExternalDataset): - task_data_config.update({'preprocessor': preprocessors}) - task_data_config.update(self._hf_ds.config_kwargs) - return build_task_dataset(task_data_config, task_name) - if preprocessors is not None: - return self.to_torch_dataset_with_processors( - preprocessors, columns=columns, to_tensor=to_tensor) - else: - self._hf_ds.reset_format() - self._hf_ds.set_format( - type='torch', columns=columns, format_kwargs=format_kwargs) - return self._hf_ds - - def to_tf_dataset_with_processors( - self, - batch_size: int, - shuffle: bool, - preprocessors: Union[Callable, List[Callable]], - drop_remainder: bool = None, - prefetch: bool = True, - label_cols: Union[str, List[str]] = None, - columns: Union[str, List[str]] = None, - ): - preprocessor_list = preprocessors if isinstance( - preprocessors, list) else [preprocessors] - - label_cols = format_list(label_cols) - columns = format_list(columns) - cols_to_retain = list(set(label_cols + columns)) - retained_columns = [ - key for key in self._hf_ds.features.keys() if key in cols_to_retain - ] - import tensorflow as tf - tf_dataset = tf.data.Dataset.from_tensor_slices( - np.arange(len(self._hf_ds), dtype=np.int64)) - if shuffle: - tf_dataset = tf_dataset.shuffle(buffer_size=len(self._hf_ds)) - - def func(i, return_dict=False): - i = int(i) - res = {k: np.array(self._hf_ds[i][k]) for k in retained_columns} - for preprocessor in preprocessor_list: - # TODO preprocessor output may have the same key - res.update({ - k: np.array(v) - for k, v in preprocessor(self._hf_ds[i]).items() - }) - if return_dict: - return res - return tuple(list(res.values())) - - sample_res = func(0, True) - - @tf.function(input_signature=[tf.TensorSpec(None, tf.int64)]) - def fetch_function(i): - output = tf.numpy_function( - func, - inp=[i], - Tout=[ - tf.dtypes.as_dtype(val.dtype) - for val in sample_res.values() - ], - ) - return {key: output[i] for i, key in enumerate(sample_res)} - - from tensorflow.data.experimental import AUTOTUNE - tf_dataset = tf_dataset.map( - fetch_function, num_parallel_calls=AUTOTUNE) - if label_cols: - - def split_features_and_labels(input_batch): - labels = { - key: tensor - for key, tensor in input_batch.items() if key in label_cols - } - if len(input_batch) == 1: - input_batch = next(iter(input_batch.values())) - if len(labels) == 1: - labels = next(iter(labels.values())) - return input_batch, labels - - tf_dataset = tf_dataset.map(split_features_and_labels) - - elif len(columns) == 1: - tf_dataset = tf_dataset.map(lambda x: next(iter(x.values()))) - if batch_size > 1: - tf_dataset = tf_dataset.batch( - batch_size, drop_remainder=drop_remainder) - - if prefetch: - tf_dataset = tf_dataset.prefetch(AUTOTUNE) - return tf_dataset - - def to_tf_dataset( - self, - batch_size: int, - shuffle: bool, - preprocessors: Union[Callable, List[Callable]] = None, - columns: Union[str, List[str]] = None, - collate_fn: Callable = None, - drop_remainder: bool = None, - collate_fn_args: Dict[str, Any] = None, - label_cols: Union[str, List[str]] = None, - prefetch: bool = True, - ): - """Create a tf.data.Dataset from the MS Dataset. This tf.data.Dataset can be passed to tf methods like - model.fit() or model.predict(). - - Args: - batch_size (int): Number of samples in a single batch. - shuffle(bool): Shuffle the dataset order. 
- preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process - every sample of the dataset. The output type of processors is dict, and each field of the dict will be - used as a field of the tf.data. Dataset. If the `preprocessors` is None, the `collate_fn` - shouldn't be None. - columns (str or List[str], default None): Dataset column(s) to be loaded. If the preprocessor is None, - the arg columns must have at least one column. If the `preprocessors` is not None, the output fields of - processors will also be added. - collate_fn(Callable, default None): A callable object used to collect lists of samples into a batch. If - the `preprocessors` is None, the `collate_fn` shouldn't be None. - drop_remainder(bool, default None): Drop the last incomplete batch when loading. - collate_fn_args (Dict, optional): A `dict` of arguments to be passed to the`collate_fn`. - label_cols (str or List[str], defalut None): Dataset column(s) to load as labels. - prefetch (bool, default True): Prefetch data. - - Returns: - :class:`tf.data.Dataset` - - """ - if not is_tf_available(): - raise ImportError( - 'The function to_tf_dataset requires Tensorflow to be installed.' - ) - if preprocessors is not None: - return self.to_tf_dataset_with_processors( - batch_size, - shuffle, - preprocessors, - drop_remainder=drop_remainder, - prefetch=prefetch, - label_cols=label_cols, - columns=columns) - - if collate_fn is None: - logger.error( - 'The `preprocessors` and the `collate_fn` should`t be both None.' - ) - return None - self._hf_ds.reset_format() - return self._hf_ds.to_tf_dataset( - columns, - batch_size, - shuffle, - collate_fn, - drop_remainder=drop_remainder, - collate_fn_args=collate_fn_args, - label_cols=label_cols, - prefetch=prefetch) - - def to_hf_dataset(self) -> Dataset: - self._hf_ds.reset_format() - return self._hf_ds - - def remap_columns(self, column_mapping: Dict[str, str]) -> Dataset: - """ - Rename columns and return the underlying hf dataset directly - TODO: support native MsDataset column rename. - Args: - column_mapping: the mapping of the original and new column names - Returns: - underlying hf dataset - """ - self._hf_ds.reset_format() - return self._hf_ds.rename_columns(column_mapping) - @staticmethod def upload( object_name: str, @@ -695,3 +447,358 @@ class MsDataset: resp_msg = _delete_manager.delete(object_name=object_name) logger.info(f'Object {object_name} successfully removed!') return resp_msg + + def to_torch_dataset( + self, + columns: Union[str, List[str]] = None, + preprocessors: Union[Callable, List[Callable]] = None, + task_name: str = None, + data_config: ConfigDict = None, + to_tensor: bool = True, + **format_kwargs, + ): + """Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to + torch.utils.data.DataLoader. + + Args: + preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process + every sample of the dataset. The output type of processors is dict, and each (numeric) field of the dict + will be used as a field of torch.utils.data.Dataset. + columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only if + `to_tensor` is True). If the preprocessor is None, the arg columns must have at least one column. + If the `preprocessors` is not None, the output fields of processors will also be added. 
+ task_name (str, default None): task name, refer to :obj:`Tasks` for more details + data_config (ConfigDict, default None): config dict for model object. + Attributes of ConfigDict: + `preprocessor` (Callable, List[Callable], optional): preprocessors to deal with dataset + `type` (str): the type of task + `split_config` (dict, optional): get the split config for ExternalDataset + `test_mode` (bool, optional): whether it is in test mode or not + to_tensor (bool, default True): whether to convert the data types of dataset column(s) to torch.Tensor or not. + format_kwargs: A `dict` of arguments to be passed to `torch.tensor`. + + Returns: + :class:`torch.utils.data.Dataset` + + """ + if not is_torch_available(): + raise ImportError( + 'The function to_torch_dataset requires pytorch to be installed' + ) + if isinstance(self._hf_ds, ExternalDataset): + data_config.update({'preprocessor': preprocessors}) + data_config.update(self._hf_ds.config_kwargs) + return build_custom_dataset(data_config, task_name) + if preprocessors is not None: + return self._to_torch_dataset_with_processors( + preprocessors, columns=columns, to_tensor=to_tensor) + else: + self._hf_ds.reset_format() + self._hf_ds.set_format( + type='torch', columns=columns, format_kwargs=format_kwargs) + return self._hf_ds + + def to_tf_dataset( + self, + batch_size: int, + shuffle: bool, + preprocessors: Union[Callable, List[Callable]] = None, + columns: Union[str, List[str]] = None, + collate_fn: Callable = None, + drop_remainder: bool = None, + collate_fn_args: Dict[str, Any] = None, + label_cols: Union[str, List[str]] = None, + prefetch: bool = True, + ): + """Create a tf.data.Dataset from the MS Dataset. This tf.data.Dataset can be passed to tf methods like + model.fit() or model.predict(). + + Args: + batch_size (int): Number of samples in a single batch. + shuffle(bool): Shuffle the dataset order. + preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process + every sample of the dataset. The output type of processors is dict, and each field of the dict will be + used as a field of the tf.data.Dataset. If the `preprocessors` is None, the `collate_fn` + shouldn't be None. + columns (str or List[str], default None): Dataset column(s) to be loaded. If the preprocessor is None, + the arg columns must have at least one column. If the `preprocessors` is not None, the output fields of + processors will also be added. + collate_fn(Callable, default None): A callable object used to collect lists of samples into a batch. If + the `preprocessors` is None, the `collate_fn` shouldn't be None. + drop_remainder(bool, default None): Drop the last incomplete batch when loading. + collate_fn_args (Dict, optional): A `dict` of arguments to be passed to the `collate_fn`. + label_cols (str or List[str], default None): Dataset column(s) to load as labels. + prefetch (bool, default True): Prefetch data. + + Returns: + :class:`tf.data.Dataset` + + """ + if not is_tf_available(): + raise ImportError( + 'The function to_tf_dataset requires Tensorflow to be installed.' + ) + if preprocessors is not None: + return self._to_tf_dataset_with_processors( + batch_size, + shuffle, + preprocessors, + drop_remainder=drop_remainder, + prefetch=prefetch, + label_cols=label_cols, + columns=columns) + + if collate_fn is None: + logger.error( + 'The `preprocessors` and the `collate_fn` shouldn\'t both be None.' 
+ ) + return None + self._hf_ds.reset_format() + return self._hf_ds.to_tf_dataset( + columns, + batch_size, + shuffle, + collate_fn, + drop_remainder=drop_remainder, + collate_fn_args=collate_fn_args, + label_cols=label_cols, + prefetch=prefetch) + + def to_hf_dataset(self) -> Dataset: + self._hf_ds.reset_format() + return self._hf_ds + + def remap_columns(self, column_mapping: Dict[str, str]) -> Dataset: + """ + Rename columns and return the underlying hf dataset directly + TODO: support native MsDataset column rename. + Args: + column_mapping: the mapping of the original and new column names + Returns: + underlying hf dataset + """ + self._hf_ds.reset_format() + return self._hf_ds.rename_columns(column_mapping) + + def _to_torch_dataset_with_processors( + self, + preprocessors: Union[Callable, List[Callable]], + columns: Union[str, List[str]] = None, + to_tensor: bool = True, + ): + preprocessor_list = preprocessors if isinstance( + preprocessors, list) else [preprocessors] + + columns = format_list(columns) + + columns = [ + key for key in self._hf_ds.features.keys() if key in columns + ] + retained_columns = [] + if to_tensor: + sample = next(iter(self._hf_ds)) + + sample_res = {k: np.array(sample[k]) for k in columns} + for processor in preprocessor_list: + sample_res.update( + {k: np.array(v) + for k, v in processor(sample).items()}) + + def is_numpy_number(value): + return np.issubdtype(value.dtype, np.integer) or np.issubdtype( + value.dtype, np.floating) + + for k in sample_res.keys(): + if not is_numpy_number(sample_res[k]): + logger.warning( + f'Data of column {k} is non-numeric, will be removed') + continue + retained_columns.append(k) + + import torch + + class MsMapDataset(torch.utils.data.Dataset): + + def __init__(self, dataset: Iterable, preprocessor_list, + retained_columns, columns, to_tensor): + super().__init__() + self.dataset = dataset + self.preprocessor_list = preprocessor_list + self.to_tensor = to_tensor + self.retained_columns = retained_columns + self.columns = columns + + def __len__(self): + return len(self.dataset) + + def type_converter(self, x): + if self.to_tensor and not isinstance(x, torch.Tensor): + return torch.tensor(x) + else: + return x + + def __getitem__(self, index): + item_dict = self.dataset[index] + res = { + k: self.type_converter(item_dict[k]) + for k in self.columns + if (not self.to_tensor) or k in self.retained_columns + } + for preprocessor in self.preprocessor_list: + res.update({ + k: self.type_converter(v) + for k, v in preprocessor(item_dict).items() + if (not self.to_tensor) or k in self.retained_columns + }) + return res + + return MsMapDataset(self._hf_ds, preprocessor_list, retained_columns, + columns, to_tensor) + + def _to_tf_dataset_with_processors( + self, + batch_size: int, + shuffle: bool, + preprocessors: Union[Callable, List[Callable]], + drop_remainder: bool = None, + prefetch: bool = True, + label_cols: Union[str, List[str]] = None, + columns: Union[str, List[str]] = None, + ): + preprocessor_list = preprocessors if isinstance( + preprocessors, list) else [preprocessors] + + label_cols = format_list(label_cols) + columns = format_list(columns) + cols_to_retain = list(set(label_cols + columns)) + retained_columns = [ + key for key in self._hf_ds.features.keys() if key in cols_to_retain + ] + import tensorflow as tf + tf_dataset = tf.data.Dataset.from_tensor_slices( + np.arange(len(self._hf_ds), dtype=np.int64)) + if shuffle: + tf_dataset = tf_dataset.shuffle(buffer_size=len(self._hf_ds)) + + def func(i, return_dict=False): + i = int(i) + res = 
{k: np.array(self._hf_ds[i][k]) for k in retained_columns} + for preprocessor in preprocessor_list: + # TODO preprocessor output may have the same key + res.update({ + k: np.array(v) + for k, v in preprocessor(self._hf_ds[i]).items() + }) + if return_dict: + return res + return tuple(list(res.values())) + + sample_res = func(0, True) + + @tf.function(input_signature=[tf.TensorSpec(None, tf.int64)]) + def fetch_function(i): + output = tf.numpy_function( + func, + inp=[i], + Tout=[ + tf.dtypes.as_dtype(val.dtype) + for val in sample_res.values() + ], + ) + return {key: output[i] for i, key in enumerate(sample_res)} + + from tensorflow.data.experimental import AUTOTUNE + tf_dataset = tf_dataset.map( + fetch_function, num_parallel_calls=AUTOTUNE) + if label_cols: + + def split_features_and_labels(input_batch): + labels = { + key: tensor + for key, tensor in input_batch.items() if key in label_cols + } + if len(input_batch) == 1: + input_batch = next(iter(input_batch.values())) + if len(labels) == 1: + labels = next(iter(labels.values())) + return input_batch, labels + + tf_dataset = tf_dataset.map(split_features_and_labels) + + elif len(columns) == 1: + tf_dataset = tf_dataset.map(lambda x: next(iter(x.values()))) + if batch_size > 1: + tf_dataset = tf_dataset.batch( + batch_size, drop_remainder=drop_remainder) + + if prefetch: + tf_dataset = tf_dataset.prefetch(AUTOTUNE) + return tf_dataset + + def to_custom_dataset(self, + custom_cfg: Config, + preprocessor=None, + mode=None, + **kwargs): + """Convert the underlying dataset to a task-specific custom dataset, based on the given model configuration and preprocessor. + + Args: + custom_cfg (Config): The model configuration for custom datasets. + preprocessor (Preprocessor, Optional): Preprocessor for data samples. + mode (str, Optional): See modelscope.utils.constant.ModeKeys + + Returns: + None. The conversion is in place: the converted dataset replaces `self._hf_ds` and can be + accessed via the `ds_instance` property. + """ + + if not is_torch_available(): + raise ImportError( + 'The function to_custom_dataset requires pytorch to be installed' + ) + if not custom_cfg: + return + + # Set the flag that it has been converted to custom dataset + self.is_custom = True + + # Check mode + if mode is None: + if 'mode' in kwargs: + mode = kwargs.get('mode') + + # Parse cfg + ds_cfg_key = 'train' if mode == ModeKeys.TRAIN else 'val' + data_cfg = custom_cfg.safe_get(f'dataset.{ds_cfg_key}') + if data_cfg is None: + data_cfg = ConfigDict(type=custom_cfg.model.type) if hasattr( + custom_cfg, ConfigFields.model) else ConfigDict(type=None) + data_cfg.update(dict(mode=mode)) + + # Get preprocessors from custom_cfg + task_name = custom_cfg.task + if 'task' in kwargs: + task_name = kwargs.pop('task') + field_name = Tasks.find_field_by_task(task_name) + if 'field' in kwargs: + field_name = kwargs.pop('field') + if preprocessor is None and hasattr(custom_cfg, 'preprocessor'): + preprocessor_cfg = custom_cfg.preprocessor + if preprocessor_cfg: + preprocessor = build_preprocessor(preprocessor_cfg, field_name) + + # Build custom dataset + if isinstance(self._hf_ds, ExternalDataset): + data_cfg.update(dict(preprocessor=preprocessor)) + data_cfg.update(self._hf_ds.config_kwargs) + self._hf_ds = build_custom_dataset( + cfg=data_cfg, task_name=custom_cfg.task) + return + + if preprocessor is not None: + to_tensor = kwargs.get('to_tensor', True) + self._hf_ds = self._to_torch_dataset_with_processors( + preprocessors=preprocessor, to_tensor=to_tensor) + else: + self._hf_ds.reset_format() + self._hf_ds.set_format(type='torch') + return diff --git a/modelscope/msdatasets/task_datasets/__init__.py 
b/modelscope/msdatasets/task_datasets/__init__.py deleted file mode 100644 index 8c8cbdf2..00000000 --- a/modelscope/msdatasets/task_datasets/__init__.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -from typing import TYPE_CHECKING - -from modelscope.utils.import_utils import LazyImportModule, is_torch_available - -if TYPE_CHECKING: - from .base import TaskDataset - from .builder import TASK_DATASETS, build_task_dataset - from .torch_base_dataset import TorchTaskDataset - from .veco_dataset import VecoDataset - from .image_instance_segmentation_coco_dataset import ImageInstanceSegmentationCocoDataset - from .movie_scene_segmentation import MovieSceneSegmentationDataset - from .video_summarization_dataset import VideoSummarizationDataset - from .language_guided_video_summarization_dataset import LanguageGuidedVideoSummarizationDataset - from .image_inpainting import ImageInpaintingDataset - from .ocr_recognition_dataset import OCRRecognitionDataset - from .text_ranking_dataset import TextRankingDataset - from .referring_video_object_segmentation import ReferringVideoObjectSegmentationDataset - from .bad_image_detecting import BadImageDetectingDataset - -else: - _import_structure = { - 'base': ['TaskDataset'], - 'builder': ['TASK_DATASETS', 'build_task_dataset'], - 'torch_base_dataset': ['TorchTaskDataset'], - 'text_ranking_dataset': ['TextRankingDataset'], - 'veco_dataset': ['VecoDataset'], - 'image_instance_segmentation_coco_dataset': - ['ImageInstanceSegmentationCocoDataset'], - 'video_summarization_dataset': ['VideoSummarizationDataset'], - 'language_guided_video_summarization_dataset': - ['LanguageGuidedVideoSummarizationDataset'], - 'movie_scene_segmentation': ['MovieSceneSegmentationDataset'], - 'image_inpainting': ['ImageInpaintingDataset'], - 'ocr_recognition_dataset': ['OCRRecognitionDataset'], - 'sidd_image_denoising_dataset': ['SiddImageDenoisingDataset'], - 'image_portrait_enhancement_dataset': - ['ImagePortraitEnhancementDataset'], - 'referring_video_object_segmentation': - ['ReferringVideoObjectSegmentationDataset'], - 'bad_image_detecting': ['BadImageDetectingDataset'], - } - import sys - - sys.modules[__name__] = LazyImportModule( - __name__, - globals()['__file__'], - _import_structure, - module_spec=__spec__, - extra_objects={}, - ) diff --git a/modelscope/msdatasets/task_datasets/base.py b/modelscope/msdatasets/task_datasets/base.py deleted file mode 100644 index 39b791b1..00000000 --- a/modelscope/msdatasets/task_datasets/base.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -from abc import ABC, abstractmethod -from typing import Any, List, Tuple, Union - - -class TaskDataset(ABC): - """The task dataset base class for all the task specific dataset processors. - """ - - def __init__(self, - datasets: Union[Any, List[Any]], - mode, - preprocessor=None, - **kwargs): - super().__init__() - self.mode = mode - self.preprocessor = preprocessor - self._inner_dataset = self.prepare_dataset(datasets) - - @abstractmethod - def prepare_dataset(self, datasets: Union[Any, List[Any]]) -> Any: - """Prepare a dataset. - - User can process the input datasets in a whole dataset perspective. - This method also helps to merge several datasets to one. - - Args: - datasets: The original dataset(s) - - Returns: A single dataset, which may be created after merging. - - """ - pass - - @abstractmethod - def prepare_sample(self, data): - """Preprocess the data fetched from the inner_dataset. 
- - If the preprocessor is None, the original data will be returned, else the preprocessor will be called. - User can override this method to implement custom logics. - - Args: - data: The data fetched from the dataset. - - Returns: The processed data. - - """ - pass diff --git a/modelscope/msdatasets/task_datasets/image_inpainting/__init__.py b/modelscope/msdatasets/task_datasets/image_inpainting/__init__.py deleted file mode 100644 index 732a1bd7..00000000 --- a/modelscope/msdatasets/task_datasets/image_inpainting/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -from .image_inpainting_dataset import ImageInpaintingDataset diff --git a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/__init__.py b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/__init__.py deleted file mode 100644 index b1bc40f8..00000000 --- a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -from .movie_scene_segmentation_dataset import MovieSceneSegmentationDataset diff --git a/modelscope/msdatasets/task_datasets/referring_video_object_segmentation/__init__.py b/modelscope/msdatasets/task_datasets/referring_video_object_segmentation/__init__.py deleted file mode 100644 index 7c1b724e..00000000 --- a/modelscope/msdatasets/task_datasets/referring_video_object_segmentation/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -from .referring_video_object_segmentation_dataset import \ - ReferringVideoObjectSegmentationDataset diff --git a/modelscope/msdatasets/task_datasets/torch_base_dataset.py b/modelscope/msdatasets/task_datasets/torch_base_dataset.py deleted file mode 100644 index 4d82b741..00000000 --- a/modelscope/msdatasets/task_datasets/torch_base_dataset.py +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Any, List, Tuple, Union - -from torch.utils.data import ConcatDataset, Dataset - -from .base import TaskDataset - - -class TorchTaskDataset(TaskDataset, Dataset): - """The task dataset base class for all the torch-based task processors. - - This base class is enough for most cases, except there are procedures which can not be executed in - preprocessors and Datasets like dataset merging. - """ - - def __init__(self, - datasets: Union[Any, List[Any]], - mode, - preprocessor=None, - **kwargs): - TaskDataset.__init__(self, datasets, mode, preprocessor, **kwargs) - self.trainer = None - - def __getitem__(self, index) -> Any: - return self.prepare_sample(self._inner_dataset[index]) - - def __len__(self): - return len(self._inner_dataset) - - def prepare_dataset(self, datasets: Union[Any, List[Any]]) -> Any: - """Prepare a dataset. - - User can process the input datasets in a whole dataset perspective. - This method gives a default implementation of datasets merging, user can override this - method to write custom logics. - - Args: - datasets: The original dataset(s) - - Returns: A single dataset, which may be created after merging. - - """ - if isinstance(datasets, List): - if len(datasets) == 1: - return datasets[0] - elif len(datasets) > 1: - return ConcatDataset(datasets) - else: - return datasets - - def prepare_sample(self, data): - """Preprocess the data fetched from the inner_dataset. - - If the preprocessor is None, the original data will be returned, else the preprocessor will be called. - User can override this method to implement custom logics. 
- - Args: - data: The data fetched from the dataset. - - Returns: The processed data. - - """ - return self.preprocessor( - data) if self.preprocessor is not None else data diff --git a/modelscope/msdatasets/utils/dataset_utils.py b/modelscope/msdatasets/utils/dataset_utils.py index 4c80af7d..dde044d5 100644 --- a/modelscope/msdatasets/utils/dataset_utils.py +++ b/modelscope/msdatasets/utils/dataset_utils.py @@ -184,9 +184,11 @@ def get_dataset_files(subset_split_into: dict, meta_map = defaultdict(dict) file_map = defaultdict(dict) args_map = defaultdict(dict) + custom_type_map = defaultdict(dict) modelscope_api = HubApi() for split, info in subset_split_into.items(): + custom_type_map[split] = info.get('custom', '') meta_map[split] = modelscope_api.get_dataset_file_url( info.get('meta', ''), dataset_name, namespace, revision) if info.get('file'): @@ -221,4 +223,4 @@ def get_dataset_files(subset_split_into: dict, if contains_dir(file_map): file_map = get_split_objects_map(file_map, objects) - return meta_map, file_map, args_map + return meta_map, file_map, args_map, custom_type_map diff --git a/modelscope/trainers/audio/kws_farfield_trainer.py b/modelscope/trainers/audio/kws_farfield_trainer.py index 508517a7..205947b7 100644 --- a/modelscope/trainers/audio/kws_farfield_trainer.py +++ b/modelscope/trainers/audio/kws_farfield_trainer.py @@ -12,7 +12,8 @@ from torch import optim as optim from modelscope.metainfo import Trainers from modelscope.models import Model, TorchModel -from modelscope.msdatasets.task_datasets.audio import KWSDataLoader, KWSDataset +from modelscope.msdatasets.dataset_cls.custom_datasets.audio import ( + KWSDataLoader, KWSDataset) from modelscope.trainers.base import BaseTrainer from modelscope.trainers.builder import TRAINERS from modelscope.utils.audio.audio_utils import update_conf diff --git a/modelscope/trainers/audio/kws_nearfield_trainer.py b/modelscope/trainers/audio/kws_nearfield_trainer.py index bf00c435..5e63e87e 100644 --- a/modelscope/trainers/audio/kws_nearfield_trainer.py +++ b/modelscope/trainers/audio/kws_nearfield_trainer.py @@ -1,42 +1,30 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
import copy import datetime -import math import os -import random import re -import sys -from shutil import copyfile from typing import Callable, Dict, Optional -import numpy as np import torch -import torch.distributed as dist -import torch.nn.functional as F import yaml from tensorboardX import SummaryWriter from torch import nn as nn from torch import optim as optim -from torch.distributed import ReduceOp -from torch.nn.utils import clip_grad_norm_ from torch.utils.data import DataLoader from modelscope.metainfo import Trainers from modelscope.models import Model, TorchModel -from modelscope.msdatasets.task_datasets.audio.kws_nearfield_dataset import \ +from modelscope.msdatasets.dataset_cls.custom_datasets.audio.kws_nearfield_dataset import \ kws_nearfield_dataset from modelscope.trainers.base import BaseTrainer from modelscope.trainers.builder import TRAINERS -from modelscope.utils.audio.audio_utils import update_conf from modelscope.utils.checkpoint import load_checkpoint, save_checkpoint from modelscope.utils.config import Config from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile -from modelscope.utils.data_utils import to_device from modelscope.utils.device import create_device from modelscope.utils.logger import get_logger from modelscope.utils.torch_utils import (get_dist_info, get_local_rank, - init_dist, is_master, - set_random_seed) + init_dist, set_random_seed) from .kws_utils.batch_utils import executor_cv, executor_test, executor_train from .kws_utils.det_utils import compute_det from .kws_utils.file_utils import query_tokens_id, read_lexicon, read_token diff --git a/modelscope/trainers/cv/image_detection_damoyolo_trainer.py b/modelscope/trainers/cv/image_detection_damoyolo_trainer.py index 734c8915..c8081ee0 100644 --- a/modelscope/trainers/cv/image_detection_damoyolo_trainer.py +++ b/modelscope/trainers/cv/image_detection_damoyolo_trainer.py @@ -1,11 +1,9 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-import copy import datetime import math import os -import os.path as osp import time -from typing import Callable, Dict, Optional +from typing import Dict import torch import torch.distributed as dist @@ -25,8 +23,8 @@ from modelscope.models.cv.tinynas_detection.damo.detectors.detector import ( build_ddp_model, build_local_model) from modelscope.models.cv.tinynas_detection.damo.utils import ( cosine_scheduler, ema_model) -from modelscope.msdatasets.task_datasets.damoyolo import (build_dataloader, - build_dataset) +from modelscope.msdatasets.dataset_cls.custom_datasets.damoyolo import ( + build_dataloader, build_dataset) from modelscope.trainers.base import BaseTrainer from modelscope.trainers.builder import TRAINERS from modelscope.utils.checkpoint import save_checkpoint diff --git a/modelscope/trainers/cv/ocr_detection_db_trainer.py b/modelscope/trainers/cv/ocr_detection_db_trainer.py index 2967ffb0..3a9d51aa 100644 --- a/modelscope/trainers/cv/ocr_detection_db_trainer.py +++ b/modelscope/trainers/cv/ocr_detection_db_trainer.py @@ -19,10 +19,8 @@ from modelscope.models.cv.ocr_detection.modules.dbnet import (DBModel, DBModel_v2) from modelscope.models.cv.ocr_detection.utils import (boxes_from_bitmap, polygons_from_bitmap) -from modelscope.msdatasets.task_datasets.ocr_detection import (DataLoader, - ImageDataset) -from modelscope.msdatasets.task_datasets.ocr_detection.measures import \ - QuadMeasurer +from modelscope.msdatasets.dataset_cls.custom_datasets.ocr_detection import ( + DataLoader, ImageDataset, QuadMeasurer) from modelscope.trainers.base import BaseTrainer from modelscope.trainers.builder import TRAINERS from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile diff --git a/modelscope/trainers/nlp/siamese_uie_trainer.py b/modelscope/trainers/nlp/siamese_uie_trainer.py index af95006e..e3289976 100644 --- a/modelscope/trainers/nlp/siamese_uie_trainer.py +++ b/modelscope/trainers/nlp/siamese_uie_trainer.py @@ -106,17 +106,21 @@ class SiameseUIETrainer(EpochBasedTrainer): seed=seed, **kwargs) - def to_task_dataset(self, - datasets: Union[Dataset, List[Dataset]], - mode: str, - task_data_config: Config = None, - preprocessor: Optional[Preprocessor] = None, - **kwargs): - if mode == 'train': + def build_dataset(self, + datasets: Union[torch.utils.data.Dataset, MsDataset, + List[torch.utils.data.Dataset]], + model_cfg: Config, + mode: str, + preprocessor: Optional[Preprocessor] = None, + **kwargs): + if mode == ModeKeys.TRAIN: datasets = self.load_dataset(datasets) - # print('****self.train_dataset*******', self.train_dataset[0]) - return super().to_task_dataset(datasets, mode, task_data_config, - preprocessor, **kwargs) + return super(SiameseUIETrainer, self).build_dataset( + datasets=datasets, + model_cfg=self.cfg, + mode=mode, + preprocessor=preprocessor, + **kwargs) def get_train_dataloader(self): """ Builder torch dataloader for training. @@ -125,12 +129,6 @@ class SiameseUIETrainer(EpochBasedTrainer): the config for data.train in configuration file, or subclass and override this method (or `get_train_dataloader` in a subclass. 
""" - if self.train_dataset is None: - train_data = self.cfg.dataset.train - self.train_dataset = self.build_dataset( - train_data, - mode=ModeKeys.TRAIN, - preprocessor=self.train_preprocessor) self.train_dataset.preprocessor = None data_loader = self._build_dataloader_with_dataset( self.train_dataset, diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py index bbdd080f..455fc907 100644 --- a/modelscope/trainers/nlp_trainer.py +++ b/modelscope/trainers/nlp_trainer.py @@ -150,7 +150,7 @@ class VecoTrainer(NlpEpochBasedTrainer): """Veco evaluates the datasets one by one. """ - from modelscope.msdatasets.task_datasets import VecoDataset + from modelscope.msdatasets.dataset_cls.custom_datasets import VecoDataset if checkpoint_path is not None: from modelscope.trainers.hooks import LoadCheckpointHook LoadCheckpointHook.load_checkpoint(checkpoint_path, self) @@ -159,9 +159,10 @@ class VecoTrainer(NlpEpochBasedTrainer): metric_values = {} if self.eval_dataset is None: - val_data = self.cfg.dataset.val - self.eval_dataset = self.build_dataset( - val_data, mode=ModeKeys.EVAL) + self.eval_dataset = self.build_dataset_from_cfg( + model_cfg=self.cfg, + mode=self._mode, + preprocessor=self.eval_preprocessor) idx = 0 dataset_cnt = 1 diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 8cddbeae..7779b1a5 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -20,10 +20,11 @@ from modelscope.metrics import build_metric, task_default_metrics from modelscope.metrics.prediction_saving_wrapper import \ PredictionSavingWrapper from modelscope.models.base import Model, TorchModel +from modelscope.msdatasets.dataset_cls.custom_datasets import \ + TorchCustomDataset +from modelscope.msdatasets.dataset_cls.custom_datasets.builder import \ + build_custom_dataset from modelscope.msdatasets.ms_dataset import MsDataset -from modelscope.msdatasets.task_datasets.builder import build_task_dataset -from modelscope.msdatasets.task_datasets.torch_base_dataset import \ - TorchTaskDataset from modelscope.outputs import ModelOutputBase from modelscope.preprocessors.base import Preprocessor from modelscope.trainers.hooks.builder import HOOKS @@ -126,6 +127,10 @@ class EpochBasedTrainer(BaseTrainer): self._stop_training = False self._compile = kwargs.get('compile', False) + self.train_dataloader = None + self.eval_dataloader = None + self.data_loader = None + if isinstance(model, str): third_party = kwargs.get(ThirdParty.KEY) if third_party is not None: @@ -178,6 +183,21 @@ class EpochBasedTrainer(BaseTrainer): self.logger = get_logger( log_file=log_file, log_level=self.cfg.get('log_level', 'INFO')) + # Get train datasets + self.train_dataset = self.build_dataset( + datasets=train_dataset, + model_cfg=self.cfg, + mode=ModeKeys.TRAIN, + preprocessor=self.train_preprocessor, + **kwargs) + # Get evaluation datasets + self.eval_dataset = self.build_dataset( + datasets=eval_dataset, + model_cfg=self.cfg, + mode=ModeKeys.EVAL, + preprocessor=self.eval_preprocessor, + **kwargs) + self.train_data_collator, self.eval_data_collator = self.get_data_collator( data_collator, remove_unused_data=kwargs.get('remove_unused_data', False)) @@ -226,19 +246,6 @@ class EpochBasedTrainer(BaseTrainer): self._dist = self.is_dp_group_available() and dist.get_world_size( self.dp_group) > 1 - self.train_dataset = self.to_task_dataset( - train_dataset, - mode=ModeKeys.TRAIN, - task_data_config=self.cfg.safe_get('dataset.train'), - preprocessor=self.train_preprocessor, 
- **kwargs) - self.eval_dataset = self.to_task_dataset( - eval_dataset, - mode=ModeKeys.EVAL, - task_data_config=self.cfg.safe_get('dataset.val'), - preprocessor=self.eval_preprocessor, - **kwargs) - self.metrics = self.get_metrics() if not self.parallel_groups: @@ -466,85 +473,108 @@ class EpochBasedTrainer(BaseTrainer): else: return _get_data_len(self.eval_dataloader) - def to_task_dataset(self, - datasets: Union[Dataset, List[Dataset]], - mode: str, - task_data_config: Config = None, - preprocessor: Optional[Preprocessor] = None, - **kwargs): - """Build the task specific dataset processor for this trainer. + def build_dataset(self, + datasets: Union[Dataset, MsDataset, List[Dataset]], + model_cfg: Config, + mode: str, + preprocessor: Optional[Preprocessor] = None, + **kwargs): + """Build input datasets by given model configuration and preprocessor. - Returns: The task dataset processor for the task. If no result for the very model-type and task, - the default TaskDataset will be returned. + Args: + datasets (Union[Dataset, MsDataset, List[Dataset]]): The input datasets. + model_cfg (Config): The model configuration. + mode (str): `train`, `eval` or `inference`. See modelscope.utils.constant.ModeKeys + preprocessor (Preprocessor, Optional): The preprocessor for input data samples. + + Returns: + Preprocessed datasets. """ try: - to_tensor = kwargs.get('to_tensor', True) if not datasets: - return datasets - if isinstance(datasets, TorchTaskDataset): + return EpochBasedTrainer.build_dataset_from_cfg( + model_cfg=model_cfg, mode=mode, preprocessor=preprocessor) + + if isinstance(datasets, TorchCustomDataset): return datasets elif isinstance(datasets, MsDataset): - if task_data_config is None: - # adapt to some special models - task_data_config = ConfigDict( - type=self.cfg.model.type) if hasattr( - self.cfg, ConfigFields.model) else ConfigDict( - type=None) - task_data_config.update(dict(mode=mode)) - return datasets.to_torch_dataset( - task_data_config=task_data_config, - task_name=self.cfg.task, - preprocessors=preprocessor, - to_tensor=to_tensor) + if not datasets.is_custom: + datasets.to_custom_dataset( + custom_cfg=model_cfg, + preprocessor=preprocessor, + mode=mode, + **kwargs) + return datasets.ds_instance elif isinstance(datasets, List) and isinstance( datasets[0], MsDataset): - if task_data_config is None: - # adapt to some special models - task_data_config = ConfigDict( - type=self.cfg.model.type) if hasattr( - self.cfg, ConfigFields.model) else ConfigDict( - type=None) - task_data_config.update(dict(mode=mode)) - datasets = [ - d.to_torch_dataset( - task_data_config=task_data_config, - task_name=self.cfg.task, - preprocessors=preprocessor, - to_tensor=to_tensor) for d in datasets - ] - cfg = ConfigDict( - type=self.cfg.model.type, mode=mode, datasets=datasets) - task_dataset = build_task_dataset(cfg, self.cfg.task) - task_dataset.trainer = self - return task_dataset + custom_datasets = [] + for dataset in datasets: + if not dataset.is_custom: + dataset.to_custom_dataset( + custom_cfg=model_cfg, + preprocessor=preprocessor, + mode=mode, + **kwargs) + custom_datasets.append(dataset.ds_instance) + torch_custom_dataset = TorchCustomDataset( + datasets=custom_datasets, + mode=mode, + preprocessor=None, + **kwargs) + torch_custom_dataset.trainer = self + return torch_custom_dataset else: - if task_data_config is None: + dataset_mode_key = 'train' if mode == ModeKeys.TRAIN else 'val' + data_config = model_cfg.safe_get(f'dataset.{dataset_mode_key}') + if data_config is None: # adapt to 
some special models - task_data_config = {} + data_config = {} # avoid adding non-string values (datasets, preprocessors) into the cfg - task_data_build_config = ConfigDict( - type=self.cfg.model.type, + data_build_config = ConfigDict( + type=model_cfg.model.type, mode=mode, datasets=datasets, preprocessor=preprocessor) - task_data_build_config.update(task_data_config) - task_dataset = build_task_dataset(task_data_build_config, - self.cfg.task) - task_dataset.trainer = self - return task_dataset - except Exception: + data_build_config.update(data_config) + custom_dataset = build_custom_dataset(data_build_config, + model_cfg.task) + custom_dataset.trainer = self + return custom_dataset + except Exception as e: + self.logger.error(f'build_dataset error: {e}') if isinstance(datasets, (List, Tuple)) or preprocessor is not None: - task_dataset = TorchTaskDataset( + custom_dataset = TorchCustomDataset( datasets, mode=mode, preprocessor=preprocessor, - **(dict(type=self.cfg.model.type) if hasattr( - self.cfg, 'model') else {})) - task_dataset.trainer = self - return task_dataset + **(dict(type=model_cfg.model.type) if hasattr( + model_cfg, 'model') else {})) + custom_dataset.trainer = self + return custom_dataset else: return datasets + @staticmethod + def build_dataset_from_cfg(model_cfg: Config, + mode: str, + preprocessor: Preprocessor = None): + dataset = None + dataset_name = model_cfg.safe_get('dataset.name') + subset_name = model_cfg.safe_get('dataset.subset', default='default') + split_name = model_cfg.safe_get(f'dataset.split_{mode}') + if not dataset_name or not split_name: + return dataset + dataset = MsDataset.load( + dataset_name=dataset_name, + subset_name=subset_name, + split=split_name, + custom_cfg=model_cfg) + if not dataset.is_custom: + dataset.to_custom_dataset( + custom_cfg=model_cfg, preprocessor=preprocessor, mode=mode) + + return dataset.ds_instance + def build_preprocessor(self) -> Tuple[Preprocessor, Preprocessor]: """Build train and eval preprocessor. @@ -667,7 +697,7 @@ class EpochBasedTrainer(BaseTrainer): checkpoint_path, self, strict=strict) self.model.eval() self._mode = ModeKeys.EVAL - predict_dataloader = self.get_predict_data_loader(predict_datasets) + predict_dataloader = self.get_predict_dataloader(predict_datasets) metric_classes = [PredictionSavingWrapper(saving_fn=saving_fn)] for m in metric_classes: @@ -836,11 +866,7 @@ class EpochBasedTrainer(BaseTrainer): (or `get_train_dataloader` in a subclass. """ if self.train_dataset is None: - train_data = self.cfg.dataset.train - self.train_dataset = self.build_dataset( - train_data, - mode=ModeKeys.TRAIN, - preprocessor=self.train_preprocessor) + raise ValueError('The train_dataset cannot be None.') data_loader = self._build_dataloader_with_dataset( self.train_dataset, dist=self._dist, @@ -857,11 +883,7 @@ class EpochBasedTrainer(BaseTrainer): pass """ if self.eval_dataset is None: - val_data = self.cfg.dataset.val - self.eval_dataset = self.build_dataset( - val_data, - mode=ModeKeys.EVAL, - preprocessor=self.eval_preprocessor) + raise ValueError('The eval_dataset cannot be None.') default_config = {'shuffle': False} default_config.update(self.cfg.evaluation.get('dataloader', {})) @@ -873,15 +895,16 @@ class EpochBasedTrainer(BaseTrainer): **default_config) return data_loader - def get_predict_data_loader(self, predict_datasets: Union[Dataset, - List[Dataset]]): + def get_predict_dataloader(self, predict_datasets: Union[Dataset, + List[Dataset]]): """ Builder torch dataloader for prediction with the config of evaluation. 
Args: predict_datasets(Union[Dataset, List[Dataset]]): The datasets used to predict ground truth. """ - dataset = self.to_task_dataset( - predict_datasets, + dataset = self.build_dataset( + datasets=predict_datasets, + model_cfg=self.cfg, mode=ModeKeys.EVAL, preprocessor=self.eval_preprocessor) @@ -895,26 +918,6 @@ class EpochBasedTrainer(BaseTrainer): **default_config) return data_loader - def build_dataset(self, data_cfg, mode, preprocessor=None): - """ Build torch dataset object using data config - """ - # TODO: support MsDataset load for cv - if hasattr(data_cfg, 'name'): - dataset_name = data_cfg.pop('name') - dataset = MsDataset.load( - dataset_name=dataset_name, - **data_cfg, - ) - cfg = ConfigDict(type=self.cfg.model.type, mode=mode) - torch_dataset = dataset.to_torch_dataset( - task_data_config=cfg, - task_name=self.cfg.task, - preprocessors=preprocessor) - else: - torch_dataset = build_task_dataset(data_cfg, self.cfg.task) - dataset = self.to_task_dataset(torch_dataset, mode) - return dataset - def build_optimizer(self, cfg: ConfigDict, default_args: dict = None): try: return build_optimizer( diff --git a/modelscope/utils/ast_utils.py b/modelscope/utils/ast_utils.py index 7f078467..76f15e56 100644 --- a/modelscope/utils/ast_utils.py +++ b/modelscope/utils/ast_utils.py @@ -16,7 +16,7 @@ import json from modelscope import __version__ from modelscope.fileio.file import LocalStorage -from modelscope.metainfo import (Datasets, Heads, Hooks, LR_Schedulers, +from modelscope.metainfo import (CustomDatasets, Heads, Hooks, LR_Schedulers, Metrics, Models, Optimizers, Pipelines, Preprocessors, TaskModels, Trainers) from modelscope.utils.constant import Fields, Tasks diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index bedafa0c..f2623db4 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -526,3 +526,8 @@ class DistributedParallelType(object): DP = 'data_parallel' TP = 'tensor_model_parallel' PP = 'pipeline_model_parallel' + + +class DatasetTensorflowConfig: + BATCH_SIZE = 'batch_size' + DEFAULT_BATCH_SIZE_VALUE = 5 diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py index 51074bca..8ded9a46 100644 --- a/tests/msdatasets/test_ms_dataset.py +++ b/tests/msdatasets/test_ms_dataset.py @@ -3,12 +3,16 @@ import hashlib import os import unittest +from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.msdatasets import MsDataset -from modelscope.msdatasets.audio.asr_dataset import ASRDataset +from modelscope.msdatasets.dataset_cls.custom_datasets.audio.asr_dataset import \ + ASRDataset from modelscope.preprocessors import TextClassificationTransformersPreprocessor from modelscope.preprocessors.base import Preprocessor -from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE, DownloadMode +from modelscope.utils.config import Config +from modelscope.utils.constant import (DEFAULT_DATASET_NAMESPACE, DownloadMode, + ModelFile) from modelscope.utils.test_utils import require_tf, require_torch, test_level @@ -68,6 +72,7 @@ class MsDatasetTest(unittest.TestCase): ms_ds_train = MsDataset.load('movie_scene_seg_toydata', split='train') print(ms_ds_train._hf_ds.config_kwargs) assert next(iter(ms_ds_train.config_kwargs['split_config'].values())) + assert next(iter(ms_ds_train)) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_coco(self): @@ -260,6 +265,34 @@ class MsDatasetTest(unittest.TestCase): print(data_example) assert 
data_example.values() + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_to_custom_dataset_movie_scene_toydata(self): + from modelscope.msdatasets.dataset_cls.custom_datasets.movie_scene_segmentation import \ + MovieSceneSegmentationDataset + from modelscope.msdatasets.dataset_cls.dataset import ExternalDataset + + model_id = 'damo/cv_resnet50-bert_video-scene-segmentation_movienet' + cache_path = snapshot_download(model_id) + config_path = os.path.join(cache_path, ModelFile.CONFIGURATION) + cfg = Config.from_file(config_path) + + # ds_test.ds_instance got object 'MovieSceneSegmentationDataset' when the custom_cfg is not none. + ds_test_1 = MsDataset.load( + 'modelscope/movie_scene_seg_toydata', + split='test', + custom_cfg=cfg, + test_mode=True) + assert ds_test_1.is_custom + assert isinstance(ds_test_1.ds_instance, MovieSceneSegmentationDataset) + + # ds_test.ds_instance got object 'ExternalDataset' when the custom_cfg is none. (by default) + ds_test_2 = MsDataset.load( + 'modelscope/movie_scene_seg_toydata', + split='test', + custom_cfg=None) + assert not ds_test_2.is_custom + assert isinstance(ds_test_2.ds_instance, ExternalDataset) + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_movie_scene_segmentation.py b/tests/pipelines/test_movie_scene_segmentation.py index affd5140..0ac8b716 100644 --- a/tests/pipelines/test_movie_scene_segmentation.py +++ b/tests/pipelines/test_movie_scene_segmentation.py @@ -1,8 +1,15 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import os +import tempfile import unittest +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metainfo import Trainers +from modelscope.msdatasets import MsDataset from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks +from modelscope.trainers import build_trainer +from modelscope.utils.config import Config, ConfigDict +from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level @@ -13,6 +20,12 @@ class MovieSceneSegmentationTest(unittest.TestCase, DemoCompatibilityCheck): self.task = Tasks.movie_scene_segmentation self.model_id = 'damo/cv_resnet50-bert_video-scene-segmentation_movienet' + cache_path = snapshot_download(self.model_id) + config_path = os.path.join(cache_path, ModelFile.CONFIGURATION) + self.cfg = Config.from_file(config_path) + + self.tmp_dir = tempfile.TemporaryDirectory().name + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_movie_scene_segmentation(self): input_location = 'data/test/videos/movie_scene_segmentation_test_video.mp4' @@ -24,6 +37,81 @@ class MovieSceneSegmentationTest(unittest.TestCase, DemoCompatibilityCheck): else: raise ValueError('process error') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_movie_scene_segmentation_finetune(self): + + train_data_cfg = ConfigDict( + name='movie_scene_seg_toydata', + split='train', + cfg=self.cfg.preprocessor, + test_mode=False) + + train_dataset = MsDataset.load( + dataset_name=train_data_cfg.name, + split=train_data_cfg.split, + cfg=train_data_cfg.cfg, + test_mode=train_data_cfg.test_mode) + + test_data_cfg = ConfigDict( + name='movie_scene_seg_toydata', + split='test', + cfg=self.cfg.preprocessor, + test_mode=True) + + test_dataset = MsDataset.load( + dataset_name=test_data_cfg.name, + split=test_data_cfg.split, + cfg=test_data_cfg.cfg, + 
test_mode=test_data_cfg.test_mode) + + kwargs = dict( + model=self.model_id, + train_dataset=train_dataset, + eval_dataset=test_dataset, + work_dir=self.tmp_dir) + + trainer = build_trainer( + name=Trainers.movie_scene_segmentation, default_args=kwargs) + trainer.train() + results_files = os.listdir(trainer.work_dir) + print(results_files) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_movie_scene_segmentation_finetune_with_custom_dataset(self): + + data_cfg = ConfigDict( + dataset_name='movie_scene_seg_toydata', + namespace='modelscope', + train_split='train', + test_split='test', + model_cfg=self.cfg) + + train_dataset = MsDataset.load( + dataset_name=data_cfg.dataset_name, + namespace=data_cfg.namespace, + split=data_cfg.train_split, + custom_cfg=data_cfg.model_cfg, + test_mode=False) + + test_dataset = MsDataset.load( + dataset_name=data_cfg.dataset_name, + namespace=data_cfg.namespace, + split=data_cfg.test_split, + custom_cfg=data_cfg.model_cfg, + test_mode=True) + + kwargs = dict( + model=self.model_id, + train_dataset=train_dataset, + eval_dataset=test_dataset, + work_dir=self.tmp_dir) + + trainer = build_trainer( + name=Trainers.movie_scene_segmentation, default_args=kwargs) + trainer.train() + results_files = os.listdir(trainer.work_dir) + print(results_files) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_movie_scene_segmentation_with_default_task(self): input_location = 'data/test/videos/movie_scene_segmentation_test_video.mp4' diff --git a/tests/run_analysis.py b/tests/run_analysis.py index d6a526ac..ca0a0018 100644 --- a/tests/run_analysis.py +++ b/tests/run_analysis.py @@ -259,7 +259,7 @@ def get_test_suites_to_run(): affected_trainer_cases.extend( model_trainer_map[model_id]) elif (affected_register_module[0] == 'HOOKS' - or affected_register_module[0] == 'TASK_DATASETS'): + or affected_register_module[0] == 'CUSTOM_DATASETS'): # ["HOOKS", "", "CheckpointHook", "CheckpointHook"] # ["HOOKS", "", hook_name, class_name] # HOOKS, DATASETS modify run all trainer cases diff --git a/tests/taskdataset/test_veco_dataset.py b/tests/taskdataset/test_veco_dataset.py index 76da1681..c220c363 100644 --- a/tests/taskdataset/test_veco_dataset.py +++ b/tests/taskdataset/test_veco_dataset.py @@ -2,7 +2,8 @@ import unittest -from modelscope.msdatasets.task_datasets.veco_dataset import VecoDataset +from modelscope.msdatasets.dataset_cls.custom_datasets.veco_dataset import \ + VecoDataset from modelscope.utils.test_utils import test_level diff --git a/tests/trainers/test_action_detection_trainer.py b/tests/trainers/test_action_detection_trainer.py index 7d0b401f..f2461ebb 100644 --- a/tests/trainers/test_action_detection_trainer.py +++ b/tests/trainers/test_action_detection_trainer.py @@ -43,7 +43,7 @@ class TestActionDetectionTrainer(unittest.TestCase): shutil.rmtree(self.tmp_dir) super().tearDown() - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_trainer(self): def cfg_modify_fn(cfg): diff --git a/tests/trainers/test_image_deblur_trainer.py b/tests/trainers/test_image_deblur_trainer.py index 6ae88726..f07db1bb 100644 --- a/tests/trainers/test_image_deblur_trainer.py +++ b/tests/trainers/test_image_deblur_trainer.py @@ -7,7 +7,7 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models.cv.image_deblur import NAFNetForImageDeblur from modelscope.msdatasets 
import MsDataset -from modelscope.msdatasets.task_datasets.gopro_image_deblurring_dataset import \ +from modelscope.msdatasets.dataset_cls.custom_datasets.gopro_image_deblurring_dataset import \ GoproImageDeblurringDataset from modelscope.trainers import build_trainer from modelscope.utils.config import Config diff --git a/tests/trainers/test_image_denoise_trainer.py b/tests/trainers/test_image_denoise_trainer.py index 3b5882bd..e2b65b32 100644 --- a/tests/trainers/test_image_denoise_trainer.py +++ b/tests/trainers/test_image_denoise_trainer.py @@ -7,7 +7,7 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models.cv.image_denoise import NAFNetForImageDenoise from modelscope.msdatasets import MsDataset -from modelscope.msdatasets.task_datasets.sidd_image_denoising import \ +from modelscope.msdatasets.dataset_cls.custom_datasets.sidd_image_denoising import \ SiddImageDenoisingDataset from modelscope.trainers import build_trainer from modelscope.utils.config import Config diff --git a/tests/trainers/test_image_instance_segmentation_trainer.py b/tests/trainers/test_image_instance_segmentation_trainer.py index 03f7eea3..923eca2c 100644 --- a/tests/trainers/test_image_instance_segmentation_trainer.py +++ b/tests/trainers/test_image_instance_segmentation_trainer.py @@ -11,8 +11,6 @@ from modelscope.metainfo import Trainers from modelscope.models.cv.image_instance_segmentation import \ CascadeMaskRCNNSwinModel from modelscope.msdatasets import MsDataset -from modelscope.msdatasets.task_datasets import \ - ImageInstanceSegmentationCocoDataset from modelscope.trainers import build_trainer from modelscope.utils.config import Config, ConfigDict from modelscope.utils.constant import DownloadMode, ModelFile diff --git a/tests/trainers/test_image_portrait_enhancement_trainer.py b/tests/trainers/test_image_portrait_enhancement_trainer.py index a9fc74cb..b556a13b 100644 --- a/tests/trainers/test_image_portrait_enhancement_trainer.py +++ b/tests/trainers/test_image_portrait_enhancement_trainer.py @@ -1,21 +1,15 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
import os -import os.path as osp import shutil import tempfile import unittest -from typing import Callable, List, Optional, Tuple, Union - -import cv2 -import torch -from torch.utils import data as data from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Trainers from modelscope.models.cv.image_portrait_enhancement import \ ImagePortraitEnhancement from modelscope.msdatasets import MsDataset -from modelscope.msdatasets.task_datasets.image_portrait_enhancement import \ +from modelscope.msdatasets.dataset_cls.custom_datasets.image_portrait_enhancement import \ ImagePortraitEnhancementDataset from modelscope.trainers import build_trainer from modelscope.utils.constant import DownloadMode, ModelFile diff --git a/tests/trainers/test_language_guided_video_summarization_trainer.py b/tests/trainers/test_language_guided_video_summarization_trainer.py index 3ff0e102..2673e4b9 100644 --- a/tests/trainers/test_language_guided_video_summarization_trainer.py +++ b/tests/trainers/test_language_guided_video_summarization_trainer.py @@ -7,7 +7,7 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models.cv.language_guided_video_summarization import \ ClipItVideoSummarization -from modelscope.msdatasets.task_datasets import \ +from modelscope.msdatasets.dataset_cls.custom_datasets import \ LanguageGuidedVideoSummarizationDataset from modelscope.trainers import build_trainer from modelscope.utils.config import Config diff --git a/tests/trainers/test_siamese_uie_trainer.py b/tests/trainers/test_siamese_uie_trainer.py index c143c562..bf21ece9 100644 --- a/tests/trainers/test_siamese_uie_trainer.py +++ b/tests/trainers/test_siamese_uie_trainer.py @@ -16,8 +16,7 @@ class TestFinetuneSiameseUIE(unittest.TestCase): def setUp(self): print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) self.tmp_dir = tempfile.TemporaryDirectory().name - if not os.path.exists(self.tmp_dir): - os.makedirs(self.tmp_dir) + os.makedirs(self.tmp_dir, exist_ok=True) def tearDown(self): shutil.rmtree(self.tmp_dir) diff --git a/tests/trainers/test_tinynas_damoyolo_trainer.py b/tests/trainers/test_tinynas_damoyolo_trainer.py index d08980da..5dd9e928 100644 --- a/tests/trainers/test_tinynas_damoyolo_trainer.py +++ b/tests/trainers/test_tinynas_damoyolo_trainer.py @@ -1,18 +1,12 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-import glob -import os -import shutil -import tempfile -import unittest -import torch +import os +import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Trainers from modelscope.trainers import build_trainer -from modelscope.utils.config import Config -from modelscope.utils.constant import ModelFile -from modelscope.utils.test_utils import DistributedTestCase, test_level +from modelscope.utils.test_utils import test_level def _setup(): diff --git a/tests/trainers/test_video_summarization_trainer.py b/tests/trainers/test_video_summarization_trainer.py index 1cea1eea..35eee2bc 100644 --- a/tests/trainers/test_video_summarization_trainer.py +++ b/tests/trainers/test_video_summarization_trainer.py @@ -6,7 +6,8 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models.cv.video_summarization import PGLVideoSummarization -from modelscope.msdatasets.task_datasets import VideoSummarizationDataset +from modelscope.msdatasets.dataset_cls.custom_datasets import \ + VideoSummarizationDataset from modelscope.trainers import build_trainer from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile @@ -17,6 +18,7 @@ logger = get_logger() class VideoSummarizationTrainerTest(unittest.TestCase): + # TODO: To be added to CUSTOM_DATASETS register def setUp(self): print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
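Putting the pieces together, the end-to-end flow after this refactor looks roughly like the sketch below. It is a sketch only: it reuses the movie-scene-segmentation model and toy dataset from the tests above, and the temporary work_dir is illustrative:

    import os
    import tempfile

    from modelscope.hub.snapshot_download import snapshot_download
    from modelscope.metainfo import Trainers
    from modelscope.msdatasets import MsDataset
    from modelscope.trainers import build_trainer
    from modelscope.utils.config import Config
    from modelscope.utils.constant import ModelFile

    model_id = 'damo/cv_resnet50-bert_video-scene-segmentation_movienet'
    cfg = Config.from_file(
        os.path.join(snapshot_download(model_id), ModelFile.CONFIGURATION))

    # MsDataset.load() is now the single entry point for custom datasets:
    # passing custom_cfg converts each split via to_custom_dataset() at load time.
    train_dataset = MsDataset.load(
        'movie_scene_seg_toydata',
        namespace='modelscope',
        split='train',
        custom_cfg=cfg,
        test_mode=False)
    eval_dataset = MsDataset.load(
        'movie_scene_seg_toydata',
        namespace='modelscope',
        split='test',
        custom_cfg=cfg,
        test_mode=True)

    # EpochBasedTrainer.build_dataset() consumes these MsDataset instances in
    # its constructor; no explicit to_task_dataset() call remains.
    trainer = build_trainer(
        name=Trainers.movie_scene_segmentation,
        default_args=dict(
            model=model_id,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            work_dir=tempfile.mkdtemp()))
    trainer.train()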