[to #47563396]Fix bug: two ckpt hooks save in the same dir

1. Support two checkpoint hooks saving final checkpoints in two different folders
2. Remove the check of checkpoint hooks
3. Fix an incorrect modification in UT
4. Fix bug: Checkpoint.load_checkpoint has been moved out
5. Add UT for new style configuration
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/11630170

(cherry picked from commit 90af43f749)
This commit is contained in:
yuze.zyz
2023-02-13 16:54:02 +00:00
committed by Zhicheng Zhang
parent 23b55225e3
commit 70c9fd322a
10 changed files with 108 additions and 19 deletions

View File

@@ -121,8 +121,8 @@ class TorchModel(Model, torch.nn.Module):
if config is None and hasattr(self, 'cfg'):
config = self.cfg
if config is not None:
save_config_function(target_folder, config)
save_pretrained(self, target_folder, save_checkpoint_names,
save_function, **kwargs)
if config is not None:
save_config_function(target_folder, config)

View File

@@ -282,7 +282,6 @@ class Preprocessor(ABC):
# TODO: for Sequence, need adapt to `mode` and `mode_dir` args,
# and add mode for Compose or other plans
raise NotImplementedError('Not supported yet!')
sub_cfg = deepcopy(sub_cfg)
preprocessor = build_preprocessor(sub_cfg, field_name)
else:
@@ -313,7 +312,7 @@ class Preprocessor(ABC):
preprocessor.mode = preprocessor_mode
sub_cfg.pop('model_dir', None)
if not hasattr(preprocessor, 'cfg'):
preprocessor.cfg = sub_cfg
preprocessor.cfg = cfg
return preprocessor
def save_pretrained(self,

View File

@@ -59,14 +59,6 @@ def merge_cfg(cfg: Config):
cfg: The input cfg to be merged into.
"""
cfg.merge_from_dict(DEFAULT_HOOKS_CONFIG, force=False)
# pop duplicate hook
check_list = [
'BestCkptSaverHook' == hook['type'] for hook in cfg.train['hooks']
]
if any(check_list):
cfg.train.hooks = list(
filter(lambda hook: hook['type'] != 'CheckpointHook',
cfg.train.hooks))
def merge_hooks(cfg: Config) -> List[Dict]:

View File

@@ -28,6 +28,8 @@ class CheckpointHook(Hook):
by_epoch (bool): Saving checkpoints by epoch or by iteration.
save_optimizer (bool): Whether to save optimizer state dict. Default: True.
save_dir (str): The directory to save checkpoints. If is None, use `trainer.work_dir`
output_sub_dir (str): The sub folder under the `save_dir` to save the output checkpoint for inference.
Default 'output'.
save_last (bool): Whether to save the last checkpoint. Default: True.
max_checkpoint_num (int): The max number of checkpoint files, default None which means never delete anything.
If the number exceeding the limit, earlier checkpoints will be deleted first.
@@ -40,6 +42,7 @@ class CheckpointHook(Hook):
by_epoch=True,
save_optimizer=True,
save_dir=None,
output_sub_dir=ModelFile.TRAIN_OUTPUT_DIR,
save_last=True,
max_checkpoint_num=None,
**kwargs):
@@ -47,6 +50,7 @@ class CheckpointHook(Hook):
self.by_epoch = by_epoch
self.save_optimizer = save_optimizer
self.save_dir = save_dir
self.output_sub_dir = output_sub_dir
self.save_last = save_last
self.rng_state = None
self.max_checkpoint_num = None
@@ -136,7 +140,7 @@ class CheckpointHook(Hook):
self.history_checkpoints.append(ckpt_file)
def _save_pretrained(self, trainer):
output_dir = os.path.join(self.save_dir, ModelFile.TRAIN_OUTPUT_DIR)
output_dir = os.path.join(self.save_dir, self.output_sub_dir)
from modelscope.trainers.parallel.utils import is_parallel
if is_parallel(trainer.model):
@@ -238,6 +242,8 @@ class BestCkptSaverHook(CheckpointHook):
by_epoch (bool): Save best checkpoints by epoch or by iteration.
save_optimizer (bool): Whether to save optimizer state dict. Default: True.
save_dir (str): Output directory to save best checkpoint.
output_sub_dir (str): The sub folder under the `save_dir` to save the output checkpoint for inference.
Default 'output_best'.
restore_best (bool): Whether to restore the best checkpoint after training.
max_checkpoint_num (int): The max number of checkpoint files, default None which means never delete anything.
If the number exceeding the limit, checkpoints with worse metric will be deleted, which is judged by the
@@ -253,6 +259,7 @@ class BestCkptSaverHook(CheckpointHook):
by_epoch=True,
save_optimizer=True,
save_dir=None,
output_sub_dir=ModelFile.TRAIN_BEST_OUTPUT_DIR,
save_file_name=None,
restore_best=False,
max_checkpoint_num=1,
@@ -264,6 +271,7 @@ class BestCkptSaverHook(CheckpointHook):
by_epoch=by_epoch,
save_optimizer=save_optimizer,
save_dir=save_dir,
output_sub_dir=output_sub_dir,
max_checkpoint_num=max_checkpoint_num,
**kwargs,
)
@@ -375,7 +383,8 @@ class BestCkptSaverHook(CheckpointHook):
def after_run(self, trainer):
if self.restore_best:
if is_master():
self.load_checkpoint(self._best_ckpt_file, trainer)
LoadCheckpointHook.load_checkpoint(self._best_ckpt_file,
trainer)
@HOOKS.register_module(module_name=Hooks.LoadCheckpointHook)

View File

@@ -520,6 +520,9 @@ def load_task_model_checkpoint(model_to_load,
def save_configuration(target_folder, config: Dict):
from modelscope.utils.config import Config
if isinstance(config, Config):
config = config.to_dict()
if ConfigFields.pipeline not in config:
config[ConfigFields.pipeline] = {'type': config[ConfigFields.task]}
cfg_str = json.dumps(config, indent=4, cls=JSONIteratorEncoder)

View File

@@ -364,11 +364,10 @@ class Config:
assert type_field is not None, 'Getting object without an index from a list or tuple ' \
'needs an valid `type_field` param.'
_sub_cfg_dict = list(
filter(lambda sub: getattr(sub, type_field) == key,
_cfg_dict))
filter(lambda sub: sub[type_field] == key, _cfg_dict))
_cfg_dict = _sub_cfg_dict[0]
else:
_cfg_dict = getattr(_cfg_dict, key)
_cfg_dict = _cfg_dict[key]
if val is not None:
_cfg_dict = _cfg_dict[int(val)]
return _cfg_dict

View File

@@ -367,6 +367,7 @@ class ModelFile(object):
ONNX_MODEL_FILE = 'model.onnx'
LABEL_MAPPING = 'label_mapping.json'
TRAIN_OUTPUT_DIR = 'output'
TRAIN_BEST_OUTPUT_DIR = 'output_best'
TS_MODEL_FILE = 'model.ts'
YAML_FILE = 'model.yaml'
TOKENIZER_FOLDER = 'tokenizer'

View File

@@ -1,5 +1,8 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import shutil
import tempfile
import unittest
import numpy as np
@@ -12,6 +15,16 @@ from modelscope.models.base import TorchModel
class TorchBaseTest(unittest.TestCase):
def setUp(self):
print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
self.tmp_dir = tempfile.TemporaryDirectory().name
if not os.path.exists(self.tmp_dir):
os.makedirs(self.tmp_dir)
def tearDown(self):
shutil.rmtree(self.tmp_dir)
super().tearDown()
def test_custom_model(self):
class MyTorchModel(TorchModel):
@@ -55,6 +68,18 @@ class TorchBaseTest(unittest.TestCase):
self.assertEqual((1, 20, 2, 2), out.shape)
self.assertTrue(np.all(out.detach().numpy() > (add_bias - 10)))
def test_save_pretrained(self):
model = TorchModel.from_pretrained(
'damo/nlp_structbert_sentence-similarity_chinese-tiny')
save_path = os.path.join(self.tmp_dir, 'test_save_pretrained')
model.save_pretrained(
save_path, save_checkpoint_names='pytorch_model.bin')
self.assertTrue(
os.path.isfile(os.path.join(save_path, 'pytorch_model.bin')))
self.assertTrue(
os.path.isfile(os.path.join(save_path, 'configuration.json')))
self.assertTrue(os.path.isfile(os.path.join(save_path, 'vocab.txt')))
if __name__ == '__main__':
unittest.main()

View File

@@ -1,5 +1,7 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os.path
import shutil
import tempfile
import unittest
from modelscope.preprocessors import Preprocessor, build_preprocessor, nlp
@@ -11,6 +13,16 @@ logger = get_logger()
class NLPPreprocessorTest(unittest.TestCase):
def setUp(self):
print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
self.tmp_dir = tempfile.TemporaryDirectory().name
if not os.path.exists(self.tmp_dir):
os.makedirs(self.tmp_dir)
def tearDown(self):
shutil.rmtree(self.tmp_dir)
super().tearDown()
def test_tokenize(self):
cfg = dict(type='Tokenize', tokenizer_name='bert-base-cased')
preprocessor = build_preprocessor(cfg, Fields.nlp)
@@ -32,6 +44,14 @@ class NLPPreprocessorTest(unittest.TestCase):
output['attention_mask'],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
def test_save_pretrained(self):
preprocessor = Preprocessor.from_pretrained(
'damo/nlp_structbert_sentence-similarity_chinese-tiny')
save_path = os.path.join(self.tmp_dir, 'test_save_pretrained')
preprocessor.save_pretrained(save_path)
self.assertTrue(
os.path.isfile(os.path.join(save_path, 'configuration.json')))
def test_preprocessor_download(self):
from modelscope.preprocessors.nlp.token_classification_preprocessor import TokenClassificationPreprocessorBase
preprocessor: TokenClassificationPreprocessorBase = \

View File

@@ -34,7 +34,7 @@ class TestTrainerWithNlp(unittest.TestCase):
split='train').to_hf_dataset().select(range(2))
def tearDown(self):
# shutil.rmtree(self.tmp_dir)
shutil.rmtree(self.tmp_dir)
super().tearDown()
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
@@ -145,6 +145,7 @@ class TestTrainerWithNlp(unittest.TestCase):
'by_epoch': False,
'metric_key': 'accuracy',
'max_checkpoint_num': 4,
'restore_best': True,
}, {
'type': 'TextLoggerHook',
'interval': 1
@@ -192,6 +193,7 @@ class TestTrainerWithNlp(unittest.TestCase):
trainer.train()
results_files = os.listdir(self.tmp_dir)
print(results_files)
self.assertIn(f'{trainer.timestamp}.log.json', results_files)
for i in [22, 24, 26, 28]:
self.assertTrue(
@@ -199,6 +201,13 @@ class TestTrainerWithNlp(unittest.TestCase):
f'accuracy{i}.pth' in filename
for filename in results_files
]))
self.assertTrue(
os.path.isfile(
os.path.join(self.tmp_dir, 'output', 'pytorch_model.bin')))
self.assertTrue(
os.path.isfile(
os.path.join(self.tmp_dir, 'output_best',
'pytorch_model.bin')))
@unittest.skip('skip for now before test is re-configured')
def test_trainer_with_configured_datasets(self):
@@ -306,6 +315,38 @@ class TestTrainerWithNlp(unittest.TestCase):
trainer, 'trainer_continue_train', level='strict'):
trainer.train(os.path.join(self.tmp_dir, 'iter_3.pth'))
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_trainer_with_new_style_configuration(self):
tmp_dir = tempfile.TemporaryDirectory().name
if not os.path.exists(tmp_dir):
os.makedirs(tmp_dir)
def cfg_modify_fn(cfg):
cfg.train['checkpoint'] = {
# 保存最优metric对应的checkpoint
'best': {
# 是否按照epoch进行保存false为按照iter
'by_epoch': True,
# 保存的间隔
'interval': 2,
# 保存checkpoint数量的最大值
'max_checkpoint_num': 2,
# 根据指定的指标判断当前checkpoint是否为历史最优
'metric_key': 'f1',
}
}
return cfg
kwargs = dict(
model='damo/nlp_structbert_sentence-similarity_chinese-tiny',
train_dataset=self.dataset,
eval_dataset=self.dataset,
cfg_modify_fn=cfg_modify_fn,
work_dir=self.tmp_dir)
trainer = build_trainer(default_args=kwargs)
trainer.train()
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_trainer_with_evaluation(self):
tmp_dir = tempfile.TemporaryDirectory().name