[to #47563396]Fix bug: two ckpt hooks save in the same dir

1. Support two checkpoint hooks saving final checkpoints in two different folders
2. Remove the check of checkpoint hooks
3. Fix an incorrect modification in UT
4. Fix bug: Checkpoint.load_checkpoint has been moved out
5. Add UT for new style configuration
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/11630170

(cherry picked from commit 90af43f749)
This commit is contained in:
yuze.zyz
2023-02-13 16:54:02 +00:00
committed by Zhicheng Zhang
parent 23b55225e3
commit 70c9fd322a
10 changed files with 108 additions and 19 deletions

View File

@@ -121,8 +121,8 @@ class TorchModel(Model, torch.nn.Module):
if config is None and hasattr(self, 'cfg'):
config = self.cfg
if config is not None:
save_config_function(target_folder, config)
save_pretrained(self, target_folder, save_checkpoint_names,
save_function, **kwargs)
if config is not None:
save_config_function(target_folder, config)

View File

@@ -282,7 +282,6 @@ class Preprocessor(ABC):
# TODO: for Sequence, need adapt to `mode` and `mode_dir` args,
# and add mode for Compose or other plans
raise NotImplementedError('Not supported yet!')
sub_cfg = deepcopy(sub_cfg)
preprocessor = build_preprocessor(sub_cfg, field_name)
else:
@@ -313,7 +312,7 @@ class Preprocessor(ABC):
preprocessor.mode = preprocessor_mode
sub_cfg.pop('model_dir', None)
if not hasattr(preprocessor, 'cfg'):
preprocessor.cfg = sub_cfg
preprocessor.cfg = cfg
return preprocessor
def save_pretrained(self,

View File

@@ -59,14 +59,6 @@ def merge_cfg(cfg: Config):
cfg: The input cfg to be merged into.
"""
cfg.merge_from_dict(DEFAULT_HOOKS_CONFIG, force=False)
# pop duplicate hook
check_list = [
'BestCkptSaverHook' == hook['type'] for hook in cfg.train['hooks']
]
if any(check_list):
cfg.train.hooks = list(
filter(lambda hook: hook['type'] != 'CheckpointHook',
cfg.train.hooks))
def merge_hooks(cfg: Config) -> List[Dict]:

View File

@@ -28,6 +28,8 @@ class CheckpointHook(Hook):
by_epoch (bool): Saving checkpoints by epoch or by iteration.
save_optimizer (bool): Whether to save optimizer state dict. Default: True.
save_dir (str): The directory to save checkpoints. If is None, use `trainer.work_dir`
output_sub_dir (str): The sub folder under the `save_dir` to save the output checkpoint for inference.
Default 'output'.
save_last (bool): Whether to save the last checkpoint. Default: True.
max_checkpoint_num (int): The max number of checkpoint files, default None which means never delete anything.
If the number exceeding the limit, earlier checkpoints will be deleted first.
@@ -40,6 +42,7 @@ class CheckpointHook(Hook):
by_epoch=True,
save_optimizer=True,
save_dir=None,
output_sub_dir=ModelFile.TRAIN_OUTPUT_DIR,
save_last=True,
max_checkpoint_num=None,
**kwargs):
@@ -47,6 +50,7 @@ class CheckpointHook(Hook):
self.by_epoch = by_epoch
self.save_optimizer = save_optimizer
self.save_dir = save_dir
self.output_sub_dir = output_sub_dir
self.save_last = save_last
self.rng_state = None
self.max_checkpoint_num = None
@@ -136,7 +140,7 @@ class CheckpointHook(Hook):
self.history_checkpoints.append(ckpt_file)
def _save_pretrained(self, trainer):
output_dir = os.path.join(self.save_dir, ModelFile.TRAIN_OUTPUT_DIR)
output_dir = os.path.join(self.save_dir, self.output_sub_dir)
from modelscope.trainers.parallel.utils import is_parallel
if is_parallel(trainer.model):
@@ -238,6 +242,8 @@ class BestCkptSaverHook(CheckpointHook):
by_epoch (bool): Save best checkpoints by epoch or by iteration.
save_optimizer (bool): Whether to save optimizer state dict. Default: True.
save_dir (str): Output directory to save best checkpoint.
output_sub_dir (str): The sub folder under the `save_dir` to save the output checkpoint for inference.
Default 'output_best'.
restore_best (bool): Whether to restore the best checkpoint after training.
max_checkpoint_num (int): The max number of checkpoint files, default None which means never delete anything.
If the number exceeding the limit, checkpoints with worse metric will be deleted, which is judged by the
@@ -253,6 +259,7 @@ class BestCkptSaverHook(CheckpointHook):
by_epoch=True,
save_optimizer=True,
save_dir=None,
output_sub_dir=ModelFile.TRAIN_BEST_OUTPUT_DIR,
save_file_name=None,
restore_best=False,
max_checkpoint_num=1,
@@ -264,6 +271,7 @@ class BestCkptSaverHook(CheckpointHook):
by_epoch=by_epoch,
save_optimizer=save_optimizer,
save_dir=save_dir,
output_sub_dir=output_sub_dir,
max_checkpoint_num=max_checkpoint_num,
**kwargs,
)
@@ -375,7 +383,8 @@ class BestCkptSaverHook(CheckpointHook):
def after_run(self, trainer):
if self.restore_best:
if is_master():
self.load_checkpoint(self._best_ckpt_file, trainer)
LoadCheckpointHook.load_checkpoint(self._best_ckpt_file,
trainer)
@HOOKS.register_module(module_name=Hooks.LoadCheckpointHook)

View File

@@ -520,6 +520,9 @@ def load_task_model_checkpoint(model_to_load,
def save_configuration(target_folder, config: Dict):
from modelscope.utils.config import Config
if isinstance(config, Config):
config = config.to_dict()
if ConfigFields.pipeline not in config:
config[ConfigFields.pipeline] = {'type': config[ConfigFields.task]}
cfg_str = json.dumps(config, indent=4, cls=JSONIteratorEncoder)

View File

@@ -364,11 +364,10 @@ class Config:
assert type_field is not None, 'Getting object without an index from a list or tuple ' \
'needs an valid `type_field` param.'
_sub_cfg_dict = list(
filter(lambda sub: getattr(sub, type_field) == key,
_cfg_dict))
filter(lambda sub: sub[type_field] == key, _cfg_dict))
_cfg_dict = _sub_cfg_dict[0]
else:
_cfg_dict = getattr(_cfg_dict, key)
_cfg_dict = _cfg_dict[key]
if val is not None:
_cfg_dict = _cfg_dict[int(val)]
return _cfg_dict

View File

@@ -367,6 +367,7 @@ class ModelFile(object):
ONNX_MODEL_FILE = 'model.onnx'
LABEL_MAPPING = 'label_mapping.json'
TRAIN_OUTPUT_DIR = 'output'
TRAIN_BEST_OUTPUT_DIR = 'output_best'
TS_MODEL_FILE = 'model.ts'
YAML_FILE = 'model.yaml'
TOKENIZER_FOLDER = 'tokenizer'

View File

@@ -1,5 +1,8 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import shutil
import tempfile
import unittest
import numpy as np
@@ -12,6 +15,16 @@ from modelscope.models.base import TorchModel
class TorchBaseTest(unittest.TestCase):
def setUp(self):
print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
self.tmp_dir = tempfile.TemporaryDirectory().name
if not os.path.exists(self.tmp_dir):
os.makedirs(self.tmp_dir)
def tearDown(self):
shutil.rmtree(self.tmp_dir)
super().tearDown()
def test_custom_model(self):
class MyTorchModel(TorchModel):
@@ -55,6 +68,18 @@ class TorchBaseTest(unittest.TestCase):
self.assertEqual((1, 20, 2, 2), out.shape)
self.assertTrue(np.all(out.detach().numpy() > (add_bias - 10)))
def test_save_pretrained(self):
model = TorchModel.from_pretrained(
'damo/nlp_structbert_sentence-similarity_chinese-tiny')
save_path = os.path.join(self.tmp_dir, 'test_save_pretrained')
model.save_pretrained(
save_path, save_checkpoint_names='pytorch_model.bin')
self.assertTrue(
os.path.isfile(os.path.join(save_path, 'pytorch_model.bin')))
self.assertTrue(
os.path.isfile(os.path.join(save_path, 'configuration.json')))
self.assertTrue(os.path.isfile(os.path.join(save_path, 'vocab.txt')))
if __name__ == '__main__':
unittest.main()

View File

@@ -1,5 +1,7 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os.path
import shutil
import tempfile
import unittest
from modelscope.preprocessors import Preprocessor, build_preprocessor, nlp
@@ -11,6 +13,16 @@ logger = get_logger()
class NLPPreprocessorTest(unittest.TestCase):
def setUp(self):
print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
self.tmp_dir = tempfile.TemporaryDirectory().name
if not os.path.exists(self.tmp_dir):
os.makedirs(self.tmp_dir)
def tearDown(self):
shutil.rmtree(self.tmp_dir)
super().tearDown()
def test_tokenize(self):
cfg = dict(type='Tokenize', tokenizer_name='bert-base-cased')
preprocessor = build_preprocessor(cfg, Fields.nlp)
@@ -32,6 +44,14 @@ class NLPPreprocessorTest(unittest.TestCase):
output['attention_mask'],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
def test_save_pretrained(self):
preprocessor = Preprocessor.from_pretrained(
'damo/nlp_structbert_sentence-similarity_chinese-tiny')
save_path = os.path.join(self.tmp_dir, 'test_save_pretrained')
preprocessor.save_pretrained(save_path)
self.assertTrue(
os.path.isfile(os.path.join(save_path, 'configuration.json')))
def test_preprocessor_download(self):
from modelscope.preprocessors.nlp.token_classification_preprocessor import TokenClassificationPreprocessorBase
preprocessor: TokenClassificationPreprocessorBase = \

View File

@@ -34,7 +34,7 @@ class TestTrainerWithNlp(unittest.TestCase):
split='train').to_hf_dataset().select(range(2))
def tearDown(self):
# shutil.rmtree(self.tmp_dir)
shutil.rmtree(self.tmp_dir)
super().tearDown()
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
@@ -145,6 +145,7 @@ class TestTrainerWithNlp(unittest.TestCase):
'by_epoch': False,
'metric_key': 'accuracy',
'max_checkpoint_num': 4,
'restore_best': True,
}, {
'type': 'TextLoggerHook',
'interval': 1
@@ -192,6 +193,7 @@ class TestTrainerWithNlp(unittest.TestCase):
trainer.train()
results_files = os.listdir(self.tmp_dir)
print(results_files)
self.assertIn(f'{trainer.timestamp}.log.json', results_files)
for i in [22, 24, 26, 28]:
self.assertTrue(
@@ -199,6 +201,13 @@ class TestTrainerWithNlp(unittest.TestCase):
f'accuracy{i}.pth' in filename
for filename in results_files
]))
self.assertTrue(
os.path.isfile(
os.path.join(self.tmp_dir, 'output', 'pytorch_model.bin')))
self.assertTrue(
os.path.isfile(
os.path.join(self.tmp_dir, 'output_best',
'pytorch_model.bin')))
@unittest.skip('skip for now before test is re-configured')
def test_trainer_with_configured_datasets(self):
@@ -306,6 +315,38 @@ class TestTrainerWithNlp(unittest.TestCase):
trainer, 'trainer_continue_train', level='strict'):
trainer.train(os.path.join(self.tmp_dir, 'iter_3.pth'))
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_trainer_with_new_style_configuration(self):
tmp_dir = tempfile.TemporaryDirectory().name
if not os.path.exists(tmp_dir):
os.makedirs(tmp_dir)
def cfg_modify_fn(cfg):
cfg.train['checkpoint'] = {
# 保存最优metric对应的checkpoint
'best': {
# 是否按照epoch进行保存false为按照iter
'by_epoch': True,
# 保存的间隔
'interval': 2,
# 保存checkpoint数量的最大值
'max_checkpoint_num': 2,
# 根据指定的指标判断当前checkpoint是否为历史最优
'metric_key': 'f1',
}
}
return cfg
kwargs = dict(
model='damo/nlp_structbert_sentence-similarity_chinese-tiny',
train_dataset=self.dataset,
eval_dataset=self.dataset,
cfg_modify_fn=cfg_modify_fn,
work_dir=self.tmp_dir)
trainer = build_trainer(default_args=kwargs)
trainer.train()
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_trainer_with_evaluation(self):
tmp_dir = tempfile.TemporaryDirectory().name