mirror of
https://github.com/modelscope/modelscope.git
synced 2026-02-24 12:10:09 +01:00
[to #47563396]Fix bug: two ckpt hooks save in the same dir
1. Support two checkpoint hooks saving final checkpoints in two different folders
2. Remove the check of checkpoint hooks
3. Fix an incorrect modification in UT
4. Fix bug: Checkpoint.load_checkpoint has been moved out
5. Add UT for new style configuration
Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/11630170
(cherry picked from commit 90af43f749)
This commit is contained in:
@@ -121,8 +121,8 @@ class TorchModel(Model, torch.nn.Module):
|
||||
if config is None and hasattr(self, 'cfg'):
|
||||
config = self.cfg
|
||||
|
||||
if config is not None:
|
||||
save_config_function(target_folder, config)
|
||||
|
||||
save_pretrained(self, target_folder, save_checkpoint_names,
|
||||
save_function, **kwargs)
|
||||
|
||||
if config is not None:
|
||||
save_config_function(target_folder, config)
|
||||
|
||||
@@ -282,7 +282,6 @@ class Preprocessor(ABC):
|
||||
# TODO: for Sequence, need adapt to `mode` and `mode_dir` args,
|
||||
# and add mode for Compose or other plans
|
||||
raise NotImplementedError('Not supported yet!')
|
||||
sub_cfg = deepcopy(sub_cfg)
|
||||
|
||||
preprocessor = build_preprocessor(sub_cfg, field_name)
|
||||
else:
|
||||
@@ -313,7 +312,7 @@ class Preprocessor(ABC):
|
||||
preprocessor.mode = preprocessor_mode
|
||||
sub_cfg.pop('model_dir', None)
|
||||
if not hasattr(preprocessor, 'cfg'):
|
||||
preprocessor.cfg = sub_cfg
|
||||
preprocessor.cfg = cfg
|
||||
return preprocessor
|
||||
|
||||
def save_pretrained(self,
|
||||
|
||||
@@ -59,14 +59,6 @@ def merge_cfg(cfg: Config):
|
||||
cfg: The input cfg to be merged into.
|
||||
"""
|
||||
cfg.merge_from_dict(DEFAULT_HOOKS_CONFIG, force=False)
|
||||
# pop duplicate hook
|
||||
check_list = [
|
||||
'BestCkptSaverHook' == hook['type'] for hook in cfg.train['hooks']
|
||||
]
|
||||
if any(check_list):
|
||||
cfg.train.hooks = list(
|
||||
filter(lambda hook: hook['type'] != 'CheckpointHook',
|
||||
cfg.train.hooks))
|
||||
|
||||
|
||||
def merge_hooks(cfg: Config) -> List[Dict]:
|
||||
|
||||
@@ -28,6 +28,8 @@ class CheckpointHook(Hook):
|
||||
by_epoch (bool): Saving checkpoints by epoch or by iteration.
|
||||
save_optimizer (bool): Whether to save optimizer state dict. Default: True.
|
||||
save_dir (str): The directory to save checkpoints. If is None, use `trainer.work_dir`
|
||||
output_sub_dir (str): The sub folder under the `save_dir` to save the output checkpoint for inference.
|
||||
Default 'output'.
|
||||
save_last (bool): Whether to save the last checkpoint. Default: True.
|
||||
max_checkpoint_num (int): The max number of checkpoint files, default None which means never delete anything.
|
||||
If the number exceeding the limit, earlier checkpoints will be deleted first.
|
||||
@@ -40,6 +42,7 @@ class CheckpointHook(Hook):
|
||||
by_epoch=True,
|
||||
save_optimizer=True,
|
||||
save_dir=None,
|
||||
output_sub_dir=ModelFile.TRAIN_OUTPUT_DIR,
|
||||
save_last=True,
|
||||
max_checkpoint_num=None,
|
||||
**kwargs):
|
||||
@@ -47,6 +50,7 @@ class CheckpointHook(Hook):
|
||||
self.by_epoch = by_epoch
|
||||
self.save_optimizer = save_optimizer
|
||||
self.save_dir = save_dir
|
||||
self.output_sub_dir = output_sub_dir
|
||||
self.save_last = save_last
|
||||
self.rng_state = None
|
||||
self.max_checkpoint_num = None
|
||||
@@ -136,7 +140,7 @@ class CheckpointHook(Hook):
|
||||
self.history_checkpoints.append(ckpt_file)
|
||||
|
||||
def _save_pretrained(self, trainer):
|
||||
output_dir = os.path.join(self.save_dir, ModelFile.TRAIN_OUTPUT_DIR)
|
||||
output_dir = os.path.join(self.save_dir, self.output_sub_dir)
|
||||
from modelscope.trainers.parallel.utils import is_parallel
|
||||
|
||||
if is_parallel(trainer.model):
|
||||
@@ -238,6 +242,8 @@ class BestCkptSaverHook(CheckpointHook):
|
||||
by_epoch (bool): Save best checkpoints by epoch or by iteration.
|
||||
save_optimizer (bool): Whether to save optimizer state dict. Default: True.
|
||||
save_dir (str): Output directory to save best checkpoint.
|
||||
output_sub_dir (str): The sub folder under the `save_dir` to save the output checkpoint for inference.
|
||||
Default 'output_best'.
|
||||
restore_best (bool): Whether to restore the best checkpoint after training.
|
||||
max_checkpoint_num (int): The max number of checkpoint files, default None which means never delete anything.
|
||||
If the number exceeding the limit, checkpoints with worse metric will be deleted, which is judged by the
|
||||
@@ -253,6 +259,7 @@ class BestCkptSaverHook(CheckpointHook):
|
||||
by_epoch=True,
|
||||
save_optimizer=True,
|
||||
save_dir=None,
|
||||
output_sub_dir=ModelFile.TRAIN_BEST_OUTPUT_DIR,
|
||||
save_file_name=None,
|
||||
restore_best=False,
|
||||
max_checkpoint_num=1,
|
||||
@@ -264,6 +271,7 @@ class BestCkptSaverHook(CheckpointHook):
|
||||
by_epoch=by_epoch,
|
||||
save_optimizer=save_optimizer,
|
||||
save_dir=save_dir,
|
||||
output_sub_dir=output_sub_dir,
|
||||
max_checkpoint_num=max_checkpoint_num,
|
||||
**kwargs,
|
||||
)
|
||||
@@ -375,7 +383,8 @@ class BestCkptSaverHook(CheckpointHook):
|
||||
def after_run(self, trainer):
|
||||
if self.restore_best:
|
||||
if is_master():
|
||||
self.load_checkpoint(self._best_ckpt_file, trainer)
|
||||
LoadCheckpointHook.load_checkpoint(self._best_ckpt_file,
|
||||
trainer)
|
||||
|
||||
|
||||
@HOOKS.register_module(module_name=Hooks.LoadCheckpointHook)
|
||||
|
||||
@@ -520,6 +520,9 @@ def load_task_model_checkpoint(model_to_load,
|
||||
|
||||
|
||||
def save_configuration(target_folder, config: Dict):
|
||||
from modelscope.utils.config import Config
|
||||
if isinstance(config, Config):
|
||||
config = config.to_dict()
|
||||
if ConfigFields.pipeline not in config:
|
||||
config[ConfigFields.pipeline] = {'type': config[ConfigFields.task]}
|
||||
cfg_str = json.dumps(config, indent=4, cls=JSONIteratorEncoder)
|
||||
|
||||
@@ -364,11 +364,10 @@ class Config:
|
||||
assert type_field is not None, 'Getting object without an index from a list or tuple ' \
|
||||
'needs an valid `type_field` param.'
|
||||
_sub_cfg_dict = list(
|
||||
filter(lambda sub: getattr(sub, type_field) == key,
|
||||
_cfg_dict))
|
||||
filter(lambda sub: sub[type_field] == key, _cfg_dict))
|
||||
_cfg_dict = _sub_cfg_dict[0]
|
||||
else:
|
||||
_cfg_dict = getattr(_cfg_dict, key)
|
||||
_cfg_dict = _cfg_dict[key]
|
||||
if val is not None:
|
||||
_cfg_dict = _cfg_dict[int(val)]
|
||||
return _cfg_dict
|
||||
|
||||
@@ -367,6 +367,7 @@ class ModelFile(object):
|
||||
ONNX_MODEL_FILE = 'model.onnx'
|
||||
LABEL_MAPPING = 'label_mapping.json'
|
||||
TRAIN_OUTPUT_DIR = 'output'
|
||||
TRAIN_BEST_OUTPUT_DIR = 'output_best'
|
||||
TS_MODEL_FILE = 'model.ts'
|
||||
YAML_FILE = 'model.yaml'
|
||||
TOKENIZER_FOLDER = 'tokenizer'
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
@@ -12,6 +15,16 @@ from modelscope.models.base import TorchModel
|
||||
|
||||
class TorchBaseTest(unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
|
||||
self.tmp_dir = tempfile.TemporaryDirectory().name
|
||||
if not os.path.exists(self.tmp_dir):
|
||||
os.makedirs(self.tmp_dir)
|
||||
|
||||
def tearDown(self):
|
||||
shutil.rmtree(self.tmp_dir)
|
||||
super().tearDown()
|
||||
|
||||
def test_custom_model(self):
|
||||
|
||||
class MyTorchModel(TorchModel):
|
||||
@@ -55,6 +68,18 @@ class TorchBaseTest(unittest.TestCase):
|
||||
self.assertEqual((1, 20, 2, 2), out.shape)
|
||||
self.assertTrue(np.all(out.detach().numpy() > (add_bias - 10)))
|
||||
|
||||
def test_save_pretrained(self):
|
||||
model = TorchModel.from_pretrained(
|
||||
'damo/nlp_structbert_sentence-similarity_chinese-tiny')
|
||||
save_path = os.path.join(self.tmp_dir, 'test_save_pretrained')
|
||||
model.save_pretrained(
|
||||
save_path, save_checkpoint_names='pytorch_model.bin')
|
||||
self.assertTrue(
|
||||
os.path.isfile(os.path.join(save_path, 'pytorch_model.bin')))
|
||||
self.assertTrue(
|
||||
os.path.isfile(os.path.join(save_path, 'configuration.json')))
|
||||
self.assertTrue(os.path.isfile(os.path.join(save_path, 'vocab.txt')))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
import os.path
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from modelscope.preprocessors import Preprocessor, build_preprocessor, nlp
|
||||
@@ -11,6 +13,16 @@ logger = get_logger()
|
||||
|
||||
class NLPPreprocessorTest(unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
|
||||
self.tmp_dir = tempfile.TemporaryDirectory().name
|
||||
if not os.path.exists(self.tmp_dir):
|
||||
os.makedirs(self.tmp_dir)
|
||||
|
||||
def tearDown(self):
|
||||
shutil.rmtree(self.tmp_dir)
|
||||
super().tearDown()
|
||||
|
||||
def test_tokenize(self):
|
||||
cfg = dict(type='Tokenize', tokenizer_name='bert-base-cased')
|
||||
preprocessor = build_preprocessor(cfg, Fields.nlp)
|
||||
@@ -32,6 +44,14 @@ class NLPPreprocessorTest(unittest.TestCase):
|
||||
output['attention_mask'],
|
||||
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
|
||||
|
||||
def test_save_pretrained(self):
|
||||
preprocessor = Preprocessor.from_pretrained(
|
||||
'damo/nlp_structbert_sentence-similarity_chinese-tiny')
|
||||
save_path = os.path.join(self.tmp_dir, 'test_save_pretrained')
|
||||
preprocessor.save_pretrained(save_path)
|
||||
self.assertTrue(
|
||||
os.path.isfile(os.path.join(save_path, 'configuration.json')))
|
||||
|
||||
def test_preprocessor_download(self):
|
||||
from modelscope.preprocessors.nlp.token_classification_preprocessor import TokenClassificationPreprocessorBase
|
||||
preprocessor: TokenClassificationPreprocessorBase = \
|
||||
|
||||
@@ -34,7 +34,7 @@ class TestTrainerWithNlp(unittest.TestCase):
|
||||
split='train').to_hf_dataset().select(range(2))
|
||||
|
||||
def tearDown(self):
|
||||
# shutil.rmtree(self.tmp_dir)
|
||||
shutil.rmtree(self.tmp_dir)
|
||||
super().tearDown()
|
||||
|
||||
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
|
||||
@@ -145,6 +145,7 @@ class TestTrainerWithNlp(unittest.TestCase):
|
||||
'by_epoch': False,
|
||||
'metric_key': 'accuracy',
|
||||
'max_checkpoint_num': 4,
|
||||
'restore_best': True,
|
||||
}, {
|
||||
'type': 'TextLoggerHook',
|
||||
'interval': 1
|
||||
@@ -192,6 +193,7 @@ class TestTrainerWithNlp(unittest.TestCase):
|
||||
trainer.train()
|
||||
|
||||
results_files = os.listdir(self.tmp_dir)
|
||||
print(results_files)
|
||||
self.assertIn(f'{trainer.timestamp}.log.json', results_files)
|
||||
for i in [22, 24, 26, 28]:
|
||||
self.assertTrue(
|
||||
@@ -199,6 +201,13 @@ class TestTrainerWithNlp(unittest.TestCase):
|
||||
f'accuracy{i}.pth' in filename
|
||||
for filename in results_files
|
||||
]))
|
||||
self.assertTrue(
|
||||
os.path.isfile(
|
||||
os.path.join(self.tmp_dir, 'output', 'pytorch_model.bin')))
|
||||
self.assertTrue(
|
||||
os.path.isfile(
|
||||
os.path.join(self.tmp_dir, 'output_best',
|
||||
'pytorch_model.bin')))
|
||||
|
||||
@unittest.skip('skip for now before test is re-configured')
|
||||
def test_trainer_with_configured_datasets(self):
|
||||
@@ -306,6 +315,38 @@ class TestTrainerWithNlp(unittest.TestCase):
|
||||
trainer, 'trainer_continue_train', level='strict'):
|
||||
trainer.train(os.path.join(self.tmp_dir, 'iter_3.pth'))
|
||||
|
||||
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
|
||||
def test_trainer_with_new_style_configuration(self):
|
||||
tmp_dir = tempfile.TemporaryDirectory().name
|
||||
if not os.path.exists(tmp_dir):
|
||||
os.makedirs(tmp_dir)
|
||||
|
||||
def cfg_modify_fn(cfg):
|
||||
cfg.train['checkpoint'] = {
|
||||
# 保存最优metric对应的checkpoint
|
||||
'best': {
|
||||
# 是否按照epoch进行保存,false为按照iter
|
||||
'by_epoch': True,
|
||||
# 保存的间隔
|
||||
'interval': 2,
|
||||
# 保存checkpoint数量的最大值
|
||||
'max_checkpoint_num': 2,
|
||||
# 根据指定的指标判断当前checkpoint是否为历史最优
|
||||
'metric_key': 'f1',
|
||||
}
|
||||
}
|
||||
return cfg
|
||||
|
||||
kwargs = dict(
|
||||
model='damo/nlp_structbert_sentence-similarity_chinese-tiny',
|
||||
train_dataset=self.dataset,
|
||||
eval_dataset=self.dataset,
|
||||
cfg_modify_fn=cfg_modify_fn,
|
||||
work_dir=self.tmp_dir)
|
||||
|
||||
trainer = build_trainer(default_args=kwargs)
|
||||
trainer.train()
|
||||
|
||||
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
|
||||
def test_trainer_with_evaluation(self):
|
||||
tmp_dir = tempfile.TemporaryDirectory().name
|
||||
|
||||
Reference in New Issue
Block a user