# Copyright (c) Alibaba, Inc. and its affiliates.
import hashlib
import os
import unittest

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models import Model
from modelscope.msdatasets import MsDataset
from modelscope.msdatasets.dataset_cls.custom_datasets.audio.asr_dataset import \
    ASRDataset
from modelscope.preprocessors import TextClassificationTransformersPreprocessor
from modelscope.preprocessors.base import Preprocessor
from modelscope.utils.config import Config
from modelscope.utils.constant import (DEFAULT_DATASET_NAMESPACE, DownloadMode,
                                       ModelFile)
from modelscope.utils.test_utils import require_tf, require_torch, test_level


class ImgPreprocessor(Preprocessor):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.path_field = kwargs.pop('image_path', 'image_path')
        self.width = kwargs.pop('width', 'width')
        # Fix: the default field name for height was mistakenly 'width'.
        self.height = kwargs.pop('height', 'height')

    def __call__(self, data):
        import cv2
        image_path = data.get(self.path_field)
        if not image_path:
            return None
        img = cv2.imread(image_path)
        # cv2.resize takes dsize as (width, height); both sides fall back
        # to 128 when the sample carries no explicit size fields.
        return {
            'image':
            cv2.resize(img,
                       (data.get(self.width, 128), data.get(self.height, 128)))
        }
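
# ImgPreprocessor above follows the minimal custom-preprocessor contract these
# tests rely on: configure field names in __init__, then map one raw sample
# dict to a model-ready dict in __call__. A hypothetical usage sketch (the
# 'file' field mirrors test_to_tf_dataset_img below; the path is made up):
#
#   preprocessor = ImgPreprocessor(image_path='file')
#   sample = preprocessor({'file': '/path/to/img.png'})  # -> {'image': ndarray}
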
class GenLocalFile:

    @staticmethod
    def gen_mock_data() -> (str, str):
        mock_data_list = [
            'Title,Content,Label', 'mock title1,mock content1,mock label1',
            'mock title2,mock content2,mock label2',
            'mock title3,mock content3,mock label3'
        ]

        mock_file_name = 'mock_file.csv'
        # The md5 of a fixed string yields a stable, collision-unlikely
        # directory name under the current working directory.
        md = hashlib.md5()
        md.update('GenLocalFile.gen_mock_data.out_file_path'.encode('utf-8'))
        mock_dir = os.path.join(os.getcwd(), md.hexdigest())
        os.makedirs(mock_dir, exist_ok=True)
        mock_relative_path = os.path.join(md.hexdigest(), mock_file_name)
        with open(mock_relative_path, 'w') as f:
            for line in mock_data_list:
                f.write(line + '\n')

        return mock_relative_path, md.hexdigest()

    @staticmethod
    def clear_mock_dir(mock_dir) -> None:
        import shutil
        shutil.rmtree(mock_dir)


class MsDatasetTest(unittest.TestCase):
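
    # Cases are gated by test_level(): level-0 cases always run, higher
    # levels are opt-in (by convention via a TEST_LEVEL environment
    # variable — an assumption about modelscope.utils.test_utils, not
    # verified here).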
|
2022-06-21 11:10:28 +08:00
|
|
|
|
2022-08-31 20:54:20 +08:00
|
|
|
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
|
|
|
|
|
def test_movie_scene_seg_toydata(self):
|
|
|
|
|
ms_ds_train = MsDataset.load('movie_scene_seg_toydata', split='train')
|
|
|
|
|
print(ms_ds_train._hf_ds.config_kwargs)
|
|
|
|
|
assert next(iter(ms_ds_train.config_kwargs['split_config'].values()))
|
2023-03-10 09:03:32 +08:00
|
|
|
assert next(iter(ms_ds_train))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_coco(self):
        ms_ds_train = MsDataset.load(
            'pets_small',
            namespace=DEFAULT_DATASET_NAMESPACE,
            download_mode=DownloadMode.FORCE_REDOWNLOAD,
            split='train')
        print(ms_ds_train.config_kwargs)
        assert next(iter(ms_ds_train.config_kwargs['split_config'].values()))

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_ms_csv_basic(self):
        ms_ds_train = MsDataset.load(
            'clue', subset_name='afqmc',
            split='train').to_hf_dataset().select(range(5))
        print(next(iter(ms_ds_train)))

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_load_local_csv(self):
        mock_relative_path, mock_dir_name = GenLocalFile.gen_mock_data()
        # To test dataset_name in the form of `xxx/xxx.csv`
        ds_from_single_file = MsDataset.load(mock_relative_path)
        # To test dataset_name in the form of `xxx/`
        ds_from_dir = MsDataset.load(mock_dir_name + '/')

        GenLocalFile.clear_mock_dir(mock_dir_name)
        ds_from_single_file_sample = next(iter(ds_from_single_file))
        ds_from_dir_sample = next(iter(ds_from_dir))

        print(ds_from_single_file_sample)
        print(ds_from_dir_sample)
        assert ds_from_single_file_sample
        assert ds_from_dir_sample

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_ds_basic(self):
        ms_ds_full = MsDataset.load(
            'xcopa', subset_name='translation-et', namespace='damotest')
        ms_ds = MsDataset.load(
            'xcopa',
            subset_name='translation-et',
            namespace='damotest',
            split='test')
        print(next(iter(ms_ds_full['test'])))
        print(next(iter(ms_ds)))

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    @require_torch
    def test_to_torch_dataset_text(self):
        model_id = 'damo/nlp_structbert_sentence-similarity_chinese-tiny'
        nlp_model = Model.from_pretrained(model_id)
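        # Building the preprocessor from the downloaded model dir lets it pick
        # up the model's own tokenizer and label configuration (assumed
        # behavior of TextClassificationTransformersPreprocessor).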
        preprocessor = TextClassificationTransformersPreprocessor(
            nlp_model.model_dir,
            first_sequence='premise',
            second_sequence=None,
            padding='max_length')
        ms_ds_train = MsDataset.load(
            'xcopa',
            subset_name='translation-et',
            namespace='damotest',
            split='test')
        pt_dataset = ms_ds_train.to_torch_dataset(preprocessors=preprocessor)
        import torch
        dataloader = torch.utils.data.DataLoader(pt_dataset, batch_size=5)
        print(next(iter(dataloader)))

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    @require_tf
    def test_to_tf_dataset_text(self):
        import tensorflow as tf
        tf.compat.v1.enable_eager_execution()
        model_id = 'damo/nlp_structbert_sentence-similarity_chinese-tiny'
        nlp_model = Model.from_pretrained(model_id)
        preprocessor = TextClassificationTransformersPreprocessor(
            nlp_model.model_dir,
            first_sequence='premise',
            second_sequence=None)
        ms_ds_train = MsDataset.load(
            'xcopa',
            subset_name='translation-et',
            namespace='damotest',
            split='test')
        tf_dataset = ms_ds_train.to_tf_dataset(
            batch_size=5,
            shuffle=True,
            preprocessors=preprocessor,
            drop_remainder=True)
        print(next(iter(tf_dataset)))

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_to_dataset_asr(self):
        ms_ds_asr = ASRDataset.load(
            'speech_asr_aishell1_trainsets', namespace='speech_asr')
        print(next(iter(ms_ds_asr['train'])))

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    @require_tf
    def test_to_tf_dataset_img(self):
        import tensorflow as tf
        tf.compat.v1.enable_eager_execution()
        ms_image_train = MsDataset.load(
            'fixtures_image_utils', namespace='damotest', split='test')
        tf_dataset = ms_image_train.to_tf_dataset(
            batch_size=5,
            shuffle=True,
            preprocessors=ImgPreprocessor(image_path='file'),
            drop_remainder=True,
        )
        print(next(iter(tf_dataset)))
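
    # With use_streaming=True, MsDataset.load returns an iterable dataset
    # that yields samples on the fly instead of materializing the full
    # download — which is what makes large-scale corpora such as
    # Uni-Fold-Data below testable at all.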

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_streaming_load_uni_fold(self):
        """Test case for loading large-scale datasets."""
        dataset = MsDataset.load(
            dataset_name='Uni-Fold-Data',
            split='train',
            use_streaming=True,
            namespace='DPTech')
        data_example = next(iter(dataset))
        print(data_example)
        assert data_example.values()

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_streaming_load_afqmc(self):
        """Streaming-load the afqmc dataset, whose train/dev/validation
        splits are described in meta-files."""
        dataset = MsDataset.load('afqmc', split='test', use_streaming=True)
        data_example = next(iter(dataset))
        print(data_example)
        assert data_example.values()

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_streaming_load_from_hf(self):
        """Use streaming mode to load a dataset from the Hugging Face hub."""
        from modelscope.utils.constant import Hubs
        ds_train = MsDataset.load(
            'glue',
            subset_name='sst2',
            split='train',
            hub=Hubs.huggingface,
            use_streaming=True)
        data_example = next(iter(ds_train))
        print(data_example)
        assert data_example.values()

    @unittest.skipUnless(test_level() >= 3, 'skip test in current test level')
    def test_streaming_load_img_object(self):
        """Test case for iterating over samples containing PIL image objects."""
        from PIL.PngImagePlugin import PngImageFile  # noqa: F401
        dataset = MsDataset.load(
            dataset_name='SIDD',
            subset_name='default',
            namespace='huizheng',
            split='train',
            use_streaming=False)
        data_example = next(iter(dataset))
        print(data_example)
        assert data_example.values()
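
    # to_ms_dataset goes the opposite direction of to_hf_dataset (exercised
    # in test_ms_csv_basic above): it wraps an existing Hugging Face dataset,
    # including a streaming one, as an MsDataset.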

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_to_ms_dataset(self):
        """Test case for converting a huggingface dataset to an `MsDataset`
        instance."""
        from datasets.load import load_dataset
        hf_dataset = load_dataset(
            'AI-Lab-Makerere/beans', split='train', streaming=True)
        ms_dataset = MsDataset.to_ms_dataset(hf_dataset)
        data_example = next(iter(ms_dataset))
        print(data_example)
        assert data_example.values()

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_to_custom_dataset_movie_scene_toydata(self):
        from modelscope.msdatasets.dataset_cls.custom_datasets.movie_scene_segmentation import \
            MovieSceneSegmentationDataset
        from modelscope.msdatasets.dataset_cls import ExternalDataset

        model_id = 'damo/cv_resnet50-bert_video-scene-segmentation_movienet'
        cache_path = snapshot_download(model_id)
        config_path = os.path.join(cache_path, ModelFile.CONFIGURATION)
        cfg = Config.from_file(config_path)

        # ds_test_1.ds_instance is a MovieSceneSegmentationDataset when
        # custom_cfg is not None.
        ds_test_1 = MsDataset.load(
            'modelscope/movie_scene_seg_toydata',
            split='test',
            custom_cfg=cfg,
            test_mode=True)
        assert ds_test_1.is_custom
        assert isinstance(ds_test_1.ds_instance, MovieSceneSegmentationDataset)

        # ds_test_2.ds_instance is an ExternalDataset when custom_cfg is None
        # (the default).
        ds_test_2 = MsDataset.load(
            'modelscope/movie_scene_seg_toydata',
            split='test',
            custom_cfg=None)
        assert not ds_test_2.is_custom
        assert isinstance(ds_test_2.ds_instance, ExternalDataset)


if __name__ == '__main__':
    unittest.main()