From 70c9fd322a2008b0b398d6da4be6dd17cbce594c Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Mon, 13 Feb 2023 16:54:02 +0000 Subject: [PATCH] [to #47563396]Fix bug: two ckpt hooks save in the same dir 1. Support two checkpoint hooks saving final checkpoints in two different folders 2. Remove the check of checkpoint hooks 3. Fix an incorrect modification in UT 4. Fix bug: Checkpoint.load_checkpoint has been moved out 5. Add UT for new style configuration Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/11630170 (cherry picked from commit 90af43f7497c43057ab8d6c68fa9b4b789044282) --- modelscope/models/base/base_torch_model.py | 6 +-- modelscope/preprocessors/base.py | 3 +- modelscope/trainers/default_config.py | 8 ---- modelscope/trainers/hooks/checkpoint_hook.py | 13 +++++- modelscope/utils/checkpoint.py | 3 ++ modelscope/utils/config.py | 5 +-- modelscope/utils/constant.py | 1 + tests/models/test_base_torch.py | 25 ++++++++++++ tests/preprocessors/test_nlp.py | 20 +++++++++ tests/trainers/test_trainer_with_nlp.py | 43 +++++++++++++++++++- 10 files changed, 108 insertions(+), 19 deletions(-) diff --git a/modelscope/models/base/base_torch_model.py b/modelscope/models/base/base_torch_model.py index b23b37e4..c3c3d40c 100644 --- a/modelscope/models/base/base_torch_model.py +++ b/modelscope/models/base/base_torch_model.py @@ -121,8 +121,8 @@ class TorchModel(Model, torch.nn.Module): if config is None and hasattr(self, 'cfg'): config = self.cfg - if config is not None: - save_config_function(target_folder, config) - save_pretrained(self, target_folder, save_checkpoint_names, save_function, **kwargs) + + if config is not None: + save_config_function(target_folder, config) diff --git a/modelscope/preprocessors/base.py b/modelscope/preprocessors/base.py index 3367282a..056add79 100644 --- a/modelscope/preprocessors/base.py +++ b/modelscope/preprocessors/base.py @@ -282,7 +282,6 @@ class Preprocessor(ABC): # TODO: for Sequence, need adapt to `mode` and 
`mode_dir` args, # and add mode for Compose or other plans raise NotImplementedError('Not supported yet!') - sub_cfg = deepcopy(sub_cfg) preprocessor = build_preprocessor(sub_cfg, field_name) else: @@ -313,7 +312,7 @@ class Preprocessor(ABC): preprocessor.mode = preprocessor_mode sub_cfg.pop('model_dir', None) if not hasattr(preprocessor, 'cfg'): - preprocessor.cfg = sub_cfg + preprocessor.cfg = cfg return preprocessor def save_pretrained(self, diff --git a/modelscope/trainers/default_config.py b/modelscope/trainers/default_config.py index 6d9b8536..7619633f 100644 --- a/modelscope/trainers/default_config.py +++ b/modelscope/trainers/default_config.py @@ -59,14 +59,6 @@ def merge_cfg(cfg: Config): cfg: The input cfg to be merged into. """ cfg.merge_from_dict(DEFAULT_HOOKS_CONFIG, force=False) - # pop duplicate hook - check_list = [ - 'BestCkptSaverHook' == hook['type'] for hook in cfg.train['hooks'] - ] - if any(check_list): - cfg.train.hooks = list( - filter(lambda hook: hook['type'] != 'CheckpointHook', - cfg.train.hooks)) def merge_hooks(cfg: Config) -> List[Dict]: diff --git a/modelscope/trainers/hooks/checkpoint_hook.py b/modelscope/trainers/hooks/checkpoint_hook.py index 096228e4..27f32556 100644 --- a/modelscope/trainers/hooks/checkpoint_hook.py +++ b/modelscope/trainers/hooks/checkpoint_hook.py @@ -28,6 +28,8 @@ class CheckpointHook(Hook): by_epoch (bool): Saving checkpoints by epoch or by iteration. save_optimizer (bool): Whether to save optimizer state dict. Default: True. save_dir (str): The directory to save checkpoints. If is None, use `trainer.work_dir` + output_sub_dir (str): The sub folder under the `save_dir` to save the output checkpoint for inference. + Default 'output'. save_last (bool): Whether to save the last checkpoint. Default: True. max_checkpoint_num (int): The max number of checkpoint files, default None which means never delete anything. If the number exceeding the limit, earlier checkpoints will be deleted first. 
@@ -40,6 +42,7 @@ class CheckpointHook(Hook): by_epoch=True, save_optimizer=True, save_dir=None, + output_sub_dir=ModelFile.TRAIN_OUTPUT_DIR, save_last=True, max_checkpoint_num=None, **kwargs): @@ -47,6 +50,7 @@ class CheckpointHook(Hook): self.by_epoch = by_epoch self.save_optimizer = save_optimizer self.save_dir = save_dir + self.output_sub_dir = output_sub_dir self.save_last = save_last self.rng_state = None self.max_checkpoint_num = None @@ -136,7 +140,7 @@ class CheckpointHook(Hook): self.history_checkpoints.append(ckpt_file) def _save_pretrained(self, trainer): - output_dir = os.path.join(self.save_dir, ModelFile.TRAIN_OUTPUT_DIR) + output_dir = os.path.join(self.save_dir, self.output_sub_dir) from modelscope.trainers.parallel.utils import is_parallel if is_parallel(trainer.model): @@ -238,6 +242,8 @@ class BestCkptSaverHook(CheckpointHook): by_epoch (bool): Save best checkpoints by epoch or by iteration. save_optimizer (bool): Whether to save optimizer state dict. Default: True. save_dir (str): Output directory to save best checkpoint. + output_sub_dir (str): The sub folder under the `save_dir` to save the output checkpoint for inference. + Default 'output_best'. restore_best (bool): Whether to restore the best checkpoint after training. max_checkpoint_num (int): The max number of checkpoint files, default None which means never delete anything. 
If the number exceeding the limit, checkpoints with worse metric will be deleted, which is judged by the @@ -253,6 +259,7 @@ class BestCkptSaverHook(CheckpointHook): by_epoch=True, save_optimizer=True, save_dir=None, + output_sub_dir=ModelFile.TRAIN_BEST_OUTPUT_DIR, save_file_name=None, restore_best=False, max_checkpoint_num=1, @@ -264,6 +271,7 @@ class BestCkptSaverHook(CheckpointHook): by_epoch=by_epoch, save_optimizer=save_optimizer, save_dir=save_dir, + output_sub_dir=output_sub_dir, max_checkpoint_num=max_checkpoint_num, **kwargs, ) @@ -375,7 +383,8 @@ class BestCkptSaverHook(CheckpointHook): def after_run(self, trainer): if self.restore_best: if is_master(): - self.load_checkpoint(self._best_ckpt_file, trainer) + LoadCheckpointHook.load_checkpoint(self._best_ckpt_file, + trainer) @HOOKS.register_module(module_name=Hooks.LoadCheckpointHook) diff --git a/modelscope/utils/checkpoint.py b/modelscope/utils/checkpoint.py index d207ac10..c0455545 100644 --- a/modelscope/utils/checkpoint.py +++ b/modelscope/utils/checkpoint.py @@ -520,6 +520,9 @@ def load_task_model_checkpoint(model_to_load, def save_configuration(target_folder, config: Dict): + from modelscope.utils.config import Config + if isinstance(config, Config): + config = config.to_dict() if ConfigFields.pipeline not in config: config[ConfigFields.pipeline] = {'type': config[ConfigFields.task]} cfg_str = json.dumps(config, indent=4, cls=JSONIteratorEncoder) diff --git a/modelscope/utils/config.py b/modelscope/utils/config.py index aead318c..85cb8b77 100644 --- a/modelscope/utils/config.py +++ b/modelscope/utils/config.py @@ -364,11 +364,10 @@ class Config: assert type_field is not None, 'Getting object without an index from a list or tuple ' \ 'needs an valid `type_field` param.' 
_sub_cfg_dict = list( - filter(lambda sub: getattr(sub, type_field) == key, - _cfg_dict)) + filter(lambda sub: sub[type_field] == key, _cfg_dict)) _cfg_dict = _sub_cfg_dict[0] else: - _cfg_dict = getattr(_cfg_dict, key) + _cfg_dict = _cfg_dict[key] if val is not None: _cfg_dict = _cfg_dict[int(val)] return _cfg_dict diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 7f294e15..f67fe7bf 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -367,6 +367,7 @@ class ModelFile(object): ONNX_MODEL_FILE = 'model.onnx' LABEL_MAPPING = 'label_mapping.json' TRAIN_OUTPUT_DIR = 'output' + TRAIN_BEST_OUTPUT_DIR = 'output_best' TS_MODEL_FILE = 'model.ts' YAML_FILE = 'model.yaml' TOKENIZER_FOLDER = 'tokenizer' diff --git a/tests/models/test_base_torch.py b/tests/models/test_base_torch.py index c147259b..40d1413a 100644 --- a/tests/models/test_base_torch.py +++ b/tests/models/test_base_torch.py @@ -1,5 +1,8 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +import shutil +import tempfile import unittest import numpy as np @@ -12,6 +15,16 @@ from modelscope.models.base import TorchModel class TorchBaseTest(unittest.TestCase): + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + def test_custom_model(self): class MyTorchModel(TorchModel): @@ -55,6 +68,18 @@ class TorchBaseTest(unittest.TestCase): self.assertEqual((1, 20, 2, 2), out.shape) self.assertTrue(np.all(out.detach().numpy() > (add_bias - 10))) + def test_save_pretrained(self): + model = TorchModel.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-tiny') + save_path = os.path.join(self.tmp_dir, 'test_save_pretrained') + model.save_pretrained( + save_path, save_checkpoint_names='pytorch_model.bin') + self.assertTrue( + os.path.isfile(os.path.join(save_path, 'pytorch_model.bin'))) + self.assertTrue( + os.path.isfile(os.path.join(save_path, 'configuration.json'))) + self.assertTrue(os.path.isfile(os.path.join(save_path, 'vocab.txt'))) + if __name__ == '__main__': unittest.main() diff --git a/tests/preprocessors/test_nlp.py b/tests/preprocessors/test_nlp.py index d63660c8..86e127c1 100644 --- a/tests/preprocessors/test_nlp.py +++ b/tests/preprocessors/test_nlp.py @@ -1,5 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
import os.path +import shutil +import tempfile import unittest from modelscope.preprocessors import Preprocessor, build_preprocessor, nlp @@ -11,6 +13,16 @@ logger = get_logger() class NLPPreprocessorTest(unittest.TestCase): + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + def test_tokenize(self): cfg = dict(type='Tokenize', tokenizer_name='bert-base-cased') preprocessor = build_preprocessor(cfg, Fields.nlp) @@ -32,6 +44,14 @@ class NLPPreprocessorTest(unittest.TestCase): output['attention_mask'], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) + def test_save_pretrained(self): + preprocessor = Preprocessor.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-tiny') + save_path = os.path.join(self.tmp_dir, 'test_save_pretrained') + preprocessor.save_pretrained(save_path) + self.assertTrue( + os.path.isfile(os.path.join(save_path, 'configuration.json'))) + def test_preprocessor_download(self): from modelscope.preprocessors.nlp.token_classification_preprocessor import TokenClassificationPreprocessorBase preprocessor: TokenClassificationPreprocessorBase = \ diff --git a/tests/trainers/test_trainer_with_nlp.py b/tests/trainers/test_trainer_with_nlp.py index be84091a..24672cf4 100644 --- a/tests/trainers/test_trainer_with_nlp.py +++ b/tests/trainers/test_trainer_with_nlp.py @@ -34,7 +34,7 @@ class TestTrainerWithNlp(unittest.TestCase): split='train').to_hf_dataset().select(range(2)) def tearDown(self): - # shutil.rmtree(self.tmp_dir) + shutil.rmtree(self.tmp_dir) super().tearDown() @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') @@ -145,6 +145,7 @@ class TestTrainerWithNlp(unittest.TestCase): 'by_epoch': False, 'metric_key': 'accuracy', 'max_checkpoint_num': 4, + 
'restore_best': True, }, { 'type': 'TextLoggerHook', 'interval': 1 @@ -192,6 +193,7 @@ class TestTrainerWithNlp(unittest.TestCase): trainer.train() results_files = os.listdir(self.tmp_dir) + print(results_files) self.assertIn(f'{trainer.timestamp}.log.json', results_files) for i in [22, 24, 26, 28]: self.assertTrue( @@ -199,6 +201,13 @@ class TestTrainerWithNlp(unittest.TestCase): f'accuracy{i}.pth' in filename for filename in results_files ])) + self.assertTrue( + os.path.isfile( + os.path.join(self.tmp_dir, 'output', 'pytorch_model.bin'))) + self.assertTrue( + os.path.isfile( + os.path.join(self.tmp_dir, 'output_best', + 'pytorch_model.bin'))) @unittest.skip('skip for now before test is re-configured') def test_trainer_with_configured_datasets(self): @@ -306,6 +315,38 @@ class TestTrainerWithNlp(unittest.TestCase): trainer, 'trainer_continue_train', level='strict'): trainer.train(os.path.join(self.tmp_dir, 'iter_3.pth')) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer_with_new_style_configuration(self): + tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir) + + def cfg_modify_fn(cfg): + cfg.train['checkpoint'] = { + # 保存最优metric对应的checkpoint + 'best': { + # 是否按照epoch进行保存,false为按照iter + 'by_epoch': True, + # 保存的间隔 + 'interval': 2, + # 保存checkpoint数量的最大值 + 'max_checkpoint_num': 2, + # 根据指定的指标判断当前checkpoint是否为历史最优 + 'metric_key': 'f1', + } + } + return cfg + + kwargs = dict( + model='damo/nlp_structbert_sentence-similarity_chinese-tiny', + train_dataset=self.dataset, + eval_dataset=self.dataset, + cfg_modify_fn=cfg_modify_fn, + work_dir=self.tmp_dir) + + trainer = build_trainer(default_args=kwargs) + trainer.train() + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_trainer_with_evaluation(self): tmp_dir = tempfile.TemporaryDirectory().name