mirror of
https://github.com/modelscope/modelscope.git
synced 2025-12-16 16:27:45 +01:00
[to #42322933] lazy load on trainer
@@ -145,11 +145,20 @@ class Trainers(object):
    For a model specific Trainer, you can use ${ModelName}-${Task}-trainer.
    """

    default = 'Trainer'
    default = 'trainer'

    # multi-modal tasks
    # multi-modal trainers
    clip_multi_modal_embedding = 'clip-multi-modal-embedding'

    # cv trainers
    image_instance_segmentation = 'image-instance-segmentation'
    image_portrait_enhancement = 'image-portrait-enhancement'

    # nlp trainers
    bert_sentiment_analysis = 'bert-sentiment-analysis'
    nlp_base_trainer = 'nlp-base-trainer'
    nlp_veco_trainer = 'nlp-veco-trainer'


class Preprocessors(object):
    """ Names for different preprocessor.
@@ -219,3 +228,52 @@ class Metrics(object):
    image_color_enhance_metric = 'image-color-enhance-metric'
    # metrics for image-portrait-enhancement task
    image_portrait_enhancement_metric = 'image-portrait-enhancement-metric'


class Optimizers(object):
    """ Names for different OPTIMIZER.

    Holds the standard optimizer name to use for identifying different optimizer.
    This should be used to register optimizer.
    """

    default = 'optimizer'

    SGD = 'SGD'


class Hooks(object):
    """ Names for different hooks.

    All kinds of hooks are defined here
    """
    # lr
    LrSchedulerHook = 'LrSchedulerHook'
    PlateauLrSchedulerHook = 'PlateauLrSchedulerHook'
    NoneLrSchedulerHook = 'NoneLrSchedulerHook'

    # optimizer
    OptimizerHook = 'OptimizerHook'
    TorchAMPOptimizerHook = 'TorchAMPOptimizerHook'
    ApexAMPOptimizerHook = 'ApexAMPOptimizerHook'
    NoneOptimizerHook = 'NoneOptimizerHook'

    # checkpoint
    CheckpointHook = 'CheckpointHook'
    BestCkptSaverHook = 'BestCkptSaverHook'

    # logger
    TextLoggerHook = 'TextLoggerHook'
    TensorboardHook = 'TensorboardHook'

    IterTimerHook = 'IterTimerHook'
    EvaluationHook = 'EvaluationHook'


class LR_Schedulers(object):
    """learning rate scheduler is defined here

    """
    LinearWarmup = 'LinearWarmup'
    ConstantWarmup = 'ConstantWarmup'
    ExponentialWarmup = 'ExponentialWarmup'

@@ -1,8 +1,38 @@
from .base import DummyTrainer
from .builder import build_trainer
from .cv import (ImageInstanceSegmentationTrainer,
                 ImagePortraitEnhancementTrainer)
from .multi_modal import CLIPTrainer
from .nlp import SequenceClassificationTrainer
from .nlp_trainer import NlpEpochBasedTrainer, VecoTrainer
from .trainer import EpochBasedTrainer
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .base import DummyTrainer
    from .builder import build_trainer
    from .cv import (ImageInstanceSegmentationTrainer,
                     ImagePortraitEnhancementTrainer)
    from .multi_modal import CLIPTrainer
    from .nlp import SequenceClassificationTrainer
    from .nlp_trainer import NlpEpochBasedTrainer, VecoTrainer
    from .trainer import EpochBasedTrainer

else:
    _import_structure = {
        'base': ['DummyTrainer'],
        'builder': ['build_trainer'],
        'cv': [
            'ImageInstanceSegmentationTrainer',
            'ImagePortraitEnhancementTrainer'
        ],
        'multi_modal': ['CLIPTrainer'],
        'nlp': ['SequenceClassificationTrainer'],
        'nlp_trainer': ['NlpEpochBasedTrainer', 'VecoTrainer'],
        'trainer': ['EpochBasedTrainer']
    }

    import sys

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )

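For illustration only (not part of the commit): with the lazy __init__ above, importing the package stays cheap and the heavyweight trainer modules are only imported when an attribute is first resolved through LazyImportModule. A minimal sketch of the intended effect, assuming LazyImportModule resolves attributes from _import_structure on first access:

import modelscope.trainers as trainers   # fast: trainer submodules are not imported yet

# First attribute access triggers the real import of modelscope.trainers.trainer.
EpochBasedTrainer = trainers.EpochBasedTrainer
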
@@ -1,5 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from modelscope.metainfo import Trainers
from modelscope.utils.config import ConfigDict
from modelscope.utils.constant import Tasks
from modelscope.utils.registry import Registry, build_from_cfg
@@ -8,7 +8,7 @@ TRAINERS = Registry('trainers')
HOOKS = Registry('hooks')


def build_trainer(name: str = 'EpochBasedTrainer', default_args: dict = None):
def build_trainer(name: str = Trainers.default, default_args: dict = None):
    """ build trainer given a trainer name

    Args:

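For illustration only (not part of the diff): the default registry key is now Trainers.default ('trainer'), which EpochBasedTrainer registers under later in this commit, so callers no longer hard-code a class name. A sketch of the call shape, with placeholder arguments mirroring the unit tests near the end of this diff:

from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer

# Placeholder arguments for illustration; the tests below pass a real
# configuration file path and a DummyModel() instance instead.
kwargs = dict(cfg_file='configuration.json', model=None, work_dir='./work_dir')
trainer = build_trainer(name=Trainers.default, default_args=kwargs)
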
@@ -1,3 +1,27 @@
from .image_instance_segmentation_trainer import \
    ImageInstanceSegmentationTrainer
from .image_portrait_enhancement_trainer import ImagePortraitEnhancementTrainer
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .image_instance_segmentation_trainer import \
        ImageInstanceSegmentationTrainer
    from .image_portrait_enhancement_trainer import ImagePortraitEnhancementTrainer

else:
    _import_structure = {
        'image_instance_segmentation_trainer':
        ['ImageInstanceSegmentationTrainer'],
        'image_portrait_enhancement_trainer':
        ['ImagePortraitEnhancementTrainer'],
    }

    import sys

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )

@@ -1,8 +1,9 @@
from modelscope.metainfo import Trainers
from modelscope.trainers.builder import TRAINERS
from modelscope.trainers.trainer import EpochBasedTrainer


@TRAINERS.register_module(module_name='image-instance-segmentation')
@TRAINERS.register_module(module_name=Trainers.image_instance_segmentation)
class ImageInstanceSegmentationTrainer(EpochBasedTrainer):

    def __init__(self, *args, **kwargs):

@@ -4,6 +4,7 @@ from collections.abc import Mapping
import torch
from torch import distributed as dist

from modelscope.metainfo import Trainers
from modelscope.trainers.builder import TRAINERS
from modelscope.trainers.optimizer.builder import build_optimizer
from modelscope.trainers.trainer import EpochBasedTrainer
@@ -11,7 +12,7 @@ from modelscope.utils.constant import ModeKeys
from modelscope.utils.logger import get_logger


@TRAINERS.register_module(module_name='gpen')
@TRAINERS.register_module(module_name=Trainers.image_portrait_enhancement)
class ImagePortraitEnhancementTrainer(EpochBasedTrainer):

    def train_step(self, model, inputs):

@@ -1,18 +1,42 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .builder import HOOKS, build_hook
from .checkpoint_hook import BestCkptSaverHook, CheckpointHook
from .evaluation_hook import EvaluationHook
from .hook import Hook
from .iter_timer_hook import IterTimerHook
from .logger.text_logger_hook import TextLoggerHook
from .lr_scheduler_hook import LrSchedulerHook
from .optimizer_hook import (ApexAMPOptimizerHook, OptimizerHook,
                             TorchAMPOptimizerHook)
from .priority import Priority
from typing import TYPE_CHECKING

__all__ = [
    'Hook', 'HOOKS', 'CheckpointHook', 'EvaluationHook', 'LrSchedulerHook',
    'OptimizerHook', 'Priority', 'build_hook', 'TextLoggerHook',
    'IterTimerHook', 'TorchAMPOptimizerHook', 'ApexAMPOptimizerHook',
    'BestCkptSaverHook', 'NoneOptimizerHook', 'NoneLrSchedulerHook'
]
from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .builder import HOOKS, build_hook
    from .checkpoint_hook import BestCkptSaverHook, CheckpointHook
    from .evaluation_hook import EvaluationHook
    from .hook import Hook
    from .iter_timer_hook import IterTimerHook
    from .logger import TextLoggerHook, TensorboardHook
    from .lr_scheduler_hook import LrSchedulerHook
    from .optimizer import (ApexAMPOptimizerHook, NoneOptimizerHook,
                            OptimizerHook, TorchAMPOptimizerHook)
    from .priority import Priority, get_priority

else:
    _import_structure = {
        'builder': ['HOOKS', 'build_hook'],
        'checkpoint_hook': ['BestCkptSaverHook', 'CheckpointHook'],
        'evaluation_hook': ['EvaluationHook'],
        'hook': ['Hook'],
        'iter_timer_hook': ['IterTimerHook'],
        'logger': ['TensorboardHook', 'TextLoggerHook'],
        'lr_scheduler_hook': ['LrSchedulerHook'],
        'optimizer_hook': [
            'ApexAMPOptimizerHook', 'NoneOptimizerHook', 'OptimizerHook',
            'TorchAMPOptimizerHook'
        ],
        'priority': ['Priority', 'get']
    }

    import sys

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )

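For illustration only (not part of the commit): every hook in the files below follows the same registration pattern, a module_name key from metainfo.Hooks plus a Hook subclass. A minimal sketch of a user-defined hook, using a made-up key that is not defined in modelscope.metainfo.Hooks:

from modelscope.trainers.hooks.builder import HOOKS
from modelscope.trainers.hooks.hook import Hook


@HOOKS.register_module(module_name='loss-printer-hook')  # illustrative key only
class LossPrinterHook(Hook):
    """Toy hook: print the raw loss after every training iteration."""

    def after_train_iter(self, trainer):
        # train_outputs is the same dict the optimizer hooks below scale and backward() through.
        print(trainer.train_outputs.get('loss'))
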
@@ -2,6 +2,7 @@
import os

from modelscope import __version__
from modelscope.metainfo import Hooks
from modelscope.utils.checkpoint import save_checkpoint
from modelscope.utils.constant import LogKeys
from modelscope.utils.logger import get_logger
@@ -11,7 +12,7 @@ from .hook import Hook
from .priority import Priority


@HOOKS.register_module()
@HOOKS.register_module(module_name=Hooks.CheckpointHook)
class CheckpointHook(Hook):
    """Save checkpoints periodically.

@@ -98,7 +99,7 @@ class CheckpointHook(Hook):
        return False


@HOOKS.register_module()
@HOOKS.register_module(module_name=Hooks.BestCkptSaverHook)
class BestCkptSaverHook(CheckpointHook):
    """Save best checkpoints hook.
    Args:

@@ -1,9 +1,10 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from modelscope.metainfo import Hooks
from .builder import HOOKS
from .hook import Hook


@HOOKS.register_module()
@HOOKS.register_module(module_name=Hooks.EvaluationHook)
class EvaluationHook(Hook):
    """Evaluation hook.
    Args:

@@ -1,13 +1,14 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import time

from modelscope.metainfo import Hooks
from modelscope.utils.constant import LogKeys
from .builder import HOOKS
from .hook import Hook
from .priority import Priority


@HOOKS.register_module()
@HOOKS.register_module(module_name=Hooks.IterTimerHook)
class IterTimerHook(Hook):
    PRIORITY = Priority.LOW


@@ -1,7 +1,27 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from modelscope.trainers.utils.log_buffer import LogBuffer
from .base import LoggerHook
from .tensorboard_hook import TensorboardHook
from .text_logger_hook import TextLoggerHook
from typing import TYPE_CHECKING

__all__ = ['TextLoggerHook', 'LoggerHook', 'LogBuffer', 'TensorboardHook']
from modelscope.trainers.utils.log_buffer import LogBuffer
from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .base import LoggerHook
    from .tensorboard_hook import TensorboardHook
    from .text_logger_hook import TextLoggerHook

else:
    _import_structure = {
        'base': ['LoggerHook'],
        'tensorboard_hook': ['TensorboardHook'],
        'text_logger_hook': ['TextLoggerHook']
    }

    import sys

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )

@@ -1,13 +1,14 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os

from modelscope.metainfo import Hooks
from modelscope.trainers.hooks.builder import HOOKS
from modelscope.utils.constant import LogKeys
from modelscope.utils.torch_utils import master_only
from .base import LoggerHook


@HOOKS.register_module()
@HOOKS.register_module(module_name=Hooks.TensorboardHook)
class TensorboardHook(LoggerHook):
    """TensorBoard hook for visualization.
    Args:

@@ -8,13 +8,14 @@ import json
import torch
from torch import distributed as dist

from modelscope.metainfo import Hooks
from modelscope.trainers.hooks.builder import HOOKS
from modelscope.trainers.hooks.logger.base import LoggerHook
from modelscope.utils.constant import LogKeys, ModeKeys
from modelscope.utils.torch_utils import get_dist_info, is_master


@HOOKS.register_module()
@HOOKS.register_module(module_name=Hooks.TextLoggerHook)
class TextLoggerHook(LoggerHook):
    """Logger hook in text, Output log to both console and local json file.


@@ -1,4 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from modelscope.metainfo import Hooks
from modelscope.trainers.lrscheduler.builder import build_lr_scheduler
from modelscope.utils.constant import LogKeys
from modelscope.utils.logger import get_logger
@@ -8,7 +9,7 @@ from .hook import Hook
from .priority import Priority


@HOOKS.register_module()
@HOOKS.register_module(module_name=Hooks.LrSchedulerHook)
class LrSchedulerHook(Hook):
    """Lr scheduler.

@@ -78,7 +79,7 @@ class LrSchedulerHook(Hook):
        return lr


@HOOKS.register_module()
@HOOKS.register_module(module_name=Hooks.PlateauLrSchedulerHook)
class PlateauLrSchedulerHook(LrSchedulerHook):
    """Lr scheduler hook for `ReduceLROnPlateau`.

@@ -119,7 +120,7 @@ class PlateauLrSchedulerHook(LrSchedulerHook):
        trainer.lr_scheduler.step(metrics=metrics)


@HOOKS.register_module()
@HOOKS.register_module(module_name=Hooks.NoneLrSchedulerHook)
class NoneLrSchedulerHook(LrSchedulerHook):

    PRIORITY = Priority.LOW  # should be after EvaluationHook

26  modelscope/trainers/hooks/optimizer/__init__.py  Normal file
@@ -0,0 +1,26 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .apex_optimizer_hook import ApexAMPOptimizerHook
    from .base import OptimizerHook, NoneOptimizerHook
    from .torch_optimizer_hook import TorchAMPOptimizerHook

else:
    _import_structure = {
        'apex_optimizer_hook': ['ApexAMPOptimizerHook'],
        'base': ['OptimizerHook', 'NoneOptimizerHook'],
        'torch_optimizer_hook': ['TorchAMPOptimizerHook']
    }

    import sys

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )
75  modelscope/trainers/hooks/optimizer/apex_optimizer_hook.py  Normal file
@@ -0,0 +1,75 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import logging

from modelscope.metainfo import Hooks
from modelscope.trainers.hooks.builder import HOOKS
from .base import OptimizerHook


@HOOKS.register_module(module_name=Hooks.ApexAMPOptimizerHook)
class ApexAMPOptimizerHook(OptimizerHook):
    """Fp16 optimizer, if torch version is less than 1.6.0,
    you must install apex (https://www.github.com/nvidia/apex) else use torch.cuda.amp by default
    Args:
        cumulative_iters (int): interval of gradients accumulation. Default: 1
        grad_clip (dict): Default None. Containing keys:
            max_norm (float or int): max norm of the gradients
            norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm.
            More details please refer to `torch.nn.utils.clip_grad.clip_grad_norm_`
        loss_keys (str | list): keys list of loss
        opt_level (str): "O0" and "O3" are not true mixed precision,
            but they are useful for establishing accuracy and speed baselines, respectively.
            "O1" and "O2" are different implementations of mixed precision.
            Try both, and see what gives the best speedup and accuracy for your model.
    """

    def __init__(self,
                 cumulative_iters=1,
                 grad_clip=None,
                 loss_keys='loss',
                 opt_level='O1'):

        super(ApexAMPOptimizerHook, self).__init__(
            grad_clip=grad_clip, loss_keys=loss_keys)
        self.cumulative_iters = cumulative_iters
        self.opt_level = opt_level

        try:
            from apex import amp
        except ImportError:
            raise ValueError(
                'apex not installed, please install apex from https://www.github.com/nvidia/apex.'
            )

    def before_run(self, trainer):
        from apex import amp

        logging.info('open fp16')
        # TODO: fix it should initialze amp with model not wrapper by DDP or DP
        if hasattr(trainer.model, 'module'):
            trainer.model, trainer.optimizer = amp.initialize(
                trainer.model.module,
                trainer.optimizer,
                opt_level=self.opt_level)
        else:
            trainer.model, trainer.optimizer = amp.initialize(
                trainer.model, trainer.optimizer, opt_level=self.opt_level)

        trainer.optimizer.zero_grad()

    def after_train_iter(self, trainer):
        for k in self.loss_keys:
            trainer.train_outputs[k] /= self.cumulative_iters

        from apex import amp
        for k in self.loss_keys:
            with amp.scale_loss(trainer.train_outputs[k],
                                trainer.optimizer) as scaled_loss:
                scaled_loss.backward()

        if self.every_n_iters(trainer, self.cumulative_iters):
            if self.grad_clip is not None:
                self.clip_grads(trainer.model.parameters(), **self.grad_clip)

            trainer.optimizer.step()
            trainer.optimizer.zero_grad()
73  modelscope/trainers/hooks/optimizer/base.py  Normal file
@@ -0,0 +1,73 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import logging

from torch.nn.utils import clip_grad

from modelscope.metainfo import Hooks
from modelscope.trainers.hooks.builder import HOOKS
from modelscope.trainers.hooks.hook import Hook
from modelscope.trainers.hooks.priority import Priority


@HOOKS.register_module(module_name=Hooks.OptimizerHook)
class OptimizerHook(Hook):
    """Optimizer hook

    Args:
        cumulative_iters (int): interval of gradients accumulation. Default: 1
        grad_clip (dict): Default None. Containing keys:
            max_norm (float or int): max norm of the gradients
            norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm.
            More details please refer to `torch.nn.utils.clip_grad.clip_grad_norm_`
        loss_keys (str | list): keys list of loss
    """

    PRIORITY = Priority.ABOVE_NORMAL

    def __init__(self,
                 cumulative_iters=1,
                 grad_clip=None,
                 loss_keys='loss') -> None:
        if isinstance(loss_keys, str):
            loss_keys = [loss_keys]
        assert isinstance(loss_keys, (tuple, list))
        self.loss_keys = loss_keys
        self.cumulative_iters = cumulative_iters
        self.grad_clip = grad_clip

    def clip_grads(self, params, **clip_args):
        params = list(
            filter(lambda p: p.requires_grad and p.grad is not None, params))
        if len(params) > 0:
            return clip_grad.clip_grad_norm_(params, **clip_args)

    def before_run(self, trainer):
        trainer.optimizer.zero_grad()

    def after_train_iter(self, trainer):
        for k in self.loss_keys:
            trainer.train_outputs[k] /= self.cumulative_iters
            trainer.train_outputs[k].backward()

        if self.every_n_iters(trainer, self.cumulative_iters):
            if self.grad_clip is not None:
                self.clip_grads(trainer.model.parameters(), **self.grad_clip)

            trainer.optimizer.step()
            trainer.optimizer.zero_grad()


@HOOKS.register_module(module_name=Hooks.NoneOptimizerHook)
class NoneOptimizerHook(OptimizerHook):

    def __init__(self, cumulative_iters=1, grad_clip=None, loss_keys='loss'):

        super(NoneOptimizerHook, self).__init__(
            grad_clip=grad_clip, loss_keys=loss_keys)
        self.cumulative_iters = cumulative_iters

    def before_run(self, trainer):
        return

    def after_train_iter(self, trainer):
        return
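For reference only (not part of the diff): OptimizerHook.after_train_iter above divides each loss by cumulative_iters and only calls optimizer.step() every cumulative_iters iterations, which approximates training with a batch that many times larger. A standalone plain-PyTorch illustration of the same accumulation scheme:

import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
cumulative_iters = 4

optimizer.zero_grad()
for step in range(8):
    x, y = torch.randn(2, 4), torch.randn(2, 1)
    loss = torch.nn.functional.mse_loss(model(x), y) / cumulative_iters
    loss.backward()  # gradients accumulate across iterations
    if (step + 1) % cumulative_iters == 0:
        optimizer.step()  # one update per 4 micro-batches (effective batch size 8)
        optimizer.zero_grad()
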
83  modelscope/trainers/hooks/optimizer/torch_optimizer_hook.py  Normal file
@@ -0,0 +1,83 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import logging

from modelscope.metainfo import Hooks
from modelscope.trainers.hooks.builder import HOOKS
from .base import OptimizerHook


@HOOKS.register_module(module_name=Hooks.TorchAMPOptimizerHook)
class TorchAMPOptimizerHook(OptimizerHook):
    """Fp16 optimizer, if torch version is less than 1.6.0,
    you must install apex (https://www.github.com/nvidia/apex) else use torch.cuda.amp by default
    Args:
        cumulative_iters (int): interval of gradients accumulation. Default: 1
        grad_clip (dict): Default None. Containing keys:
            max_norm (float or int): max norm of the gradients
            norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm.
            More details please refer to `torch.nn.utils.clip_grad.clip_grad_norm_`
        loss_keys (str | list): keys list of loss
        loss_scale (float | dict): grade scale config. If loss_scale is a float,
            static loss scaling will be used with the specified scale.
            It can also be a dict containing arguments of GradScalar. For Pytorch >= 1.6,
            we use official torch.cuda.amp.GradScaler.
            please refer to: https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler for the parameters.
    """

    def __init__(self,
                 cumulative_iters=1,
                 grad_clip=None,
                 loss_keys='loss',
                 loss_scale={}):

        super(TorchAMPOptimizerHook, self).__init__(
            grad_clip=grad_clip, loss_keys=loss_keys)
        self.cumulative_iters = cumulative_iters
        self._scale_update_param = None

        from torch.cuda import amp

        if isinstance(loss_scale, float):
            self._scale_update_param = loss_scale
            self.scaler = amp.GradScaler(init_scale=loss_scale)
        elif isinstance(loss_scale, dict):
            self.scaler = amp.GradScaler(**loss_scale)
        else:
            raise ValueError(
                '`loss_scale` type must be in [float, dict], but got {loss_scale}'
            )

    def before_run(self, trainer):
        logging.info('open fp16')
        trainer.optimizer.zero_grad()

        if hasattr(trainer.model, 'module'):
            self._ori_model_forward = trainer.model.module.forward
            self._model = trainer.model.module
        else:
            self._ori_model_forward = trainer.model.forward
            self._model = trainer.model

        self.ori_model_forward = trainer.model.forward

    def before_train_iter(self, trainer):
        from torch.cuda import amp
        setattr(self._model, 'forward', amp.autocast()(self._model.forward))

    def after_train_iter(self, trainer):
        for k in self.loss_keys:
            trainer.train_outputs[k] /= self.cumulative_iters

        for k in self.loss_keys:
            self.scaler.scale(trainer.train_outputs[k]).backward()

        if self.every_n_iters(trainer, self.cumulative_iters):
            self.scaler.unscale_(trainer.optimizer)
            if self.grad_clip is not None:
                self.clip_grads(trainer.model.parameters(), **self.grad_clip)

            self.scaler.step(trainer.optimizer)
            self.scaler.update(self._scale_update_param)
            trainer.optimizer.zero_grad()

        setattr(self._model, 'forward', self._ori_model_forward)
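For reference only (not part of the diff): the hook above wraps model.forward in torch.cuda.amp.autocast and routes backward/step through a GradScaler, which is the standard PyTorch AMP recipe. A minimal standalone equivalent of what the hook automates (requires a CUDA device):

import torch
from torch.cuda import amp

model = torch.nn.Linear(4, 1).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scaler = amp.GradScaler()

for _ in range(4):
    x, y = torch.randn(2, 4).cuda(), torch.randn(2, 1).cuda()
    optimizer.zero_grad()
    with amp.autocast():               # run the forward pass in mixed precision
        loss = torch.nn.functional.mse_loss(model(x), y)
    scaler.scale(loss).backward()      # scale the loss to avoid fp16 underflow
    scaler.step(optimizer)             # unscales gradients, then optimizer.step()
    scaler.update()
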
@@ -1,218 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import logging

from torch.nn.utils import clip_grad

from .builder import HOOKS
from .hook import Hook
from .priority import Priority


@HOOKS.register_module()
class OptimizerHook(Hook):
    """Optimizer hook

    Args:
        cumulative_iters (int): interval of gradients accumulation. Default: 1
        grad_clip (dict): Default None. Containing keys:
            max_norm (float or int): max norm of the gradients
            norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm.
            More details please refer to `torch.nn.utils.clip_grad.clip_grad_norm_`
        loss_keys (str | list): keys list of loss
    """

    PRIORITY = Priority.ABOVE_NORMAL

    def __init__(self,
                 cumulative_iters=1,
                 grad_clip=None,
                 loss_keys='loss') -> None:
        if isinstance(loss_keys, str):
            loss_keys = [loss_keys]
        assert isinstance(loss_keys, (tuple, list))
        self.loss_keys = loss_keys
        self.cumulative_iters = cumulative_iters
        self.grad_clip = grad_clip

    def clip_grads(self, params, **clip_args):
        params = list(
            filter(lambda p: p.requires_grad and p.grad is not None, params))
        if len(params) > 0:
            return clip_grad.clip_grad_norm_(params, **clip_args)

    def before_run(self, trainer):
        trainer.optimizer.zero_grad()

    def after_train_iter(self, trainer):
        for k in self.loss_keys:
            trainer.train_outputs[k] /= self.cumulative_iters
            trainer.train_outputs[k].backward()

        if self.every_n_iters(trainer, self.cumulative_iters):
            if self.grad_clip is not None:
                self.clip_grads(trainer.model.parameters(), **self.grad_clip)

            trainer.optimizer.step()
            trainer.optimizer.zero_grad()


@HOOKS.register_module()
class TorchAMPOptimizerHook(OptimizerHook):
    """Fp16 optimizer, if torch version is less than 1.6.0,
    you must install apex (https://www.github.com/nvidia/apex) else use torch.cuda.amp by default
    Args:
        cumulative_iters (int): interval of gradients accumulation. Default: 1
        grad_clip (dict): Default None. Containing keys:
            max_norm (float or int): max norm of the gradients
            norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm.
            More details please refer to `torch.nn.utils.clip_grad.clip_grad_norm_`
        loss_keys (str | list): keys list of loss
        loss_scale (float | dict): grade scale config. If loss_scale is a float,
            static loss scaling will be used with the specified scale.
            It can also be a dict containing arguments of GradScalar. For Pytorch >= 1.6,
            we use official torch.cuda.amp.GradScaler.
            please refer to: https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler for the parameters.
    """

    def __init__(self,
                 cumulative_iters=1,
                 grad_clip=None,
                 loss_keys='loss',
                 loss_scale={}):

        super(TorchAMPOptimizerHook, self).__init__(
            grad_clip=grad_clip, loss_keys=loss_keys)
        self.cumulative_iters = cumulative_iters
        self._scale_update_param = None

        from torch.cuda import amp

        if isinstance(loss_scale, float):
            self._scale_update_param = loss_scale
            self.scaler = amp.GradScaler(init_scale=loss_scale)
        elif isinstance(loss_scale, dict):
            self.scaler = amp.GradScaler(**loss_scale)
        else:
            raise ValueError(
                '`loss_scale` type must be in [float, dict], but got {loss_scale}'
            )

    def before_run(self, trainer):
        logging.info('open fp16')
        trainer.optimizer.zero_grad()

        if hasattr(trainer.model, 'module'):
            self._ori_model_forward = trainer.model.module.forward
            self._model = trainer.model.module
        else:
            self._ori_model_forward = trainer.model.forward
            self._model = trainer.model

        self.ori_model_forward = trainer.model.forward

    def before_train_iter(self, trainer):
        from torch.cuda import amp
        setattr(self._model, 'forward', amp.autocast()(self._model.forward))

    def after_train_iter(self, trainer):
        for k in self.loss_keys:
            trainer.train_outputs[k] /= self.cumulative_iters

        for k in self.loss_keys:
            self.scaler.scale(trainer.train_outputs[k]).backward()

        if self.every_n_iters(trainer, self.cumulative_iters):
            self.scaler.unscale_(trainer.optimizer)
            if self.grad_clip is not None:
                self.clip_grads(trainer.model.parameters(), **self.grad_clip)

            self.scaler.step(trainer.optimizer)
            self.scaler.update(self._scale_update_param)
            trainer.optimizer.zero_grad()

        setattr(self._model, 'forward', self._ori_model_forward)


@HOOKS.register_module()
class ApexAMPOptimizerHook(OptimizerHook):
    """Fp16 optimizer, if torch version is less than 1.6.0,
    you must install apex (https://www.github.com/nvidia/apex) else use torch.cuda.amp by default
    Args:
        cumulative_iters (int): interval of gradients accumulation. Default: 1
        grad_clip (dict): Default None. Containing keys:
            max_norm (float or int): max norm of the gradients
            norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm.
            More details please refer to `torch.nn.utils.clip_grad.clip_grad_norm_`
        loss_keys (str | list): keys list of loss
        opt_level (str): "O0" and "O3" are not true mixed precision,
            but they are useful for establishing accuracy and speed baselines, respectively.
            "O1" and "O2" are different implementations of mixed precision.
            Try both, and see what gives the best speedup and accuracy for your model.
    """

    def __init__(self,
                 cumulative_iters=1,
                 grad_clip=None,
                 loss_keys='loss',
                 opt_level='O1'):

        super(ApexAMPOptimizerHook, self).__init__(
            grad_clip=grad_clip, loss_keys=loss_keys)
        self.cumulative_iters = cumulative_iters
        self.opt_level = opt_level

        try:
            from apex import amp
        except ImportError:
            raise ValueError(
                'apex not installed, please install apex from https://www.github.com/nvidia/apex.'
            )

    def before_run(self, trainer):
        from apex import amp

        logging.info('open fp16')
        # TODO: fix it should initialze amp with model not wrapper by DDP or DP
        if hasattr(trainer.model, 'module'):
            trainer.model, trainer.optimizer = amp.initialize(
                trainer.model.module,
                trainer.optimizer,
                opt_level=self.opt_level)
        else:
            trainer.model, trainer.optimizer = amp.initialize(
                trainer.model, trainer.optimizer, opt_level=self.opt_level)

        trainer.optimizer.zero_grad()

    def after_train_iter(self, trainer):
        for k in self.loss_keys:
            trainer.train_outputs[k] /= self.cumulative_iters

        from apex import amp
        for k in self.loss_keys:
            with amp.scale_loss(trainer.train_outputs[k],
                                trainer.optimizer) as scaled_loss:
                scaled_loss.backward()

        if self.every_n_iters(trainer, self.cumulative_iters):
            if self.grad_clip is not None:
                self.clip_grads(trainer.model.parameters(), **self.grad_clip)

            trainer.optimizer.step()
            trainer.optimizer.zero_grad()


@HOOKS.register_module()
class NoneOptimizerHook(OptimizerHook):

    def __init__(self, cumulative_iters=1, grad_clip=None, loss_keys='loss'):

        super(NoneOptimizerHook, self).__init__(
            grad_clip=grad_clip, loss_keys=loss_keys)
        self.cumulative_iters = cumulative_iters

    def before_run(self, trainer):
        return

    def after_train_iter(self, trainer):
        return
@@ -1,8 +1,25 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .builder import LR_SCHEDULER, build_lr_scheduler
from .warmup import BaseWarmup, ConstantWarmup, ExponentialWarmup, LinearWarmup
from typing import TYPE_CHECKING

__all__ = [
    'LR_SCHEDULER', 'build_lr_scheduler', 'BaseWarmup', 'ConstantWarmup',
    'LinearWarmup', 'ExponentialWarmup'
]
from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .builder import LR_SCHEDULER, build_lr_scheduler
    from .warmup import BaseWarmup, ConstantWarmup, ExponentialWarmup, LinearWarmup

else:
    _import_structure = {
        'builder': ['LR_SCHEDULER', 'build_lr_scheduler'],
        'warmup':
        ['BaseWarmup', 'ConstantWarmup', 'ExponentialWarmup', 'LinearWarmup']
    }

    import sys

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )

@@ -4,7 +4,7 @@ import inspect
from modelscope.utils.config import ConfigDict
from modelscope.utils.registry import Registry, build_from_cfg, default_group

LR_SCHEDULER = Registry('lr scheduler')
LR_SCHEDULER = Registry('lr_scheduler')


def build_lr_scheduler(cfg: ConfigDict, default_args: dict = None):

@@ -1,5 +1,25 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .base import BaseWarmup
from .warmup import ConstantWarmup, ExponentialWarmup, LinearWarmup

__all__ = ['BaseWarmup', 'ConstantWarmup', 'LinearWarmup', 'ExponentialWarmup']
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .base import BaseWarmup
    from .warmup import ConstantWarmup, ExponentialWarmup, LinearWarmup

else:
    _import_structure = {
        'base': ['BaseWarmup'],
        'warmup': ['ConstantWarmup', 'ExponentialWarmup', 'LinearWarmup']
    }

    import sys

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )

@@ -1,9 +1,10 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from modelscope.metainfo import LR_Schedulers
from modelscope.trainers.lrscheduler.builder import LR_SCHEDULER
from .base import BaseWarmup


@LR_SCHEDULER.register_module()
@LR_SCHEDULER.register_module(module_name=LR_Schedulers.ConstantWarmup)
class ConstantWarmup(BaseWarmup):
    """Linear warmup scheduler.

@@ -29,7 +30,7 @@ class ConstantWarmup(BaseWarmup):
        return self.warmup_ratio


@LR_SCHEDULER.register_module()
@LR_SCHEDULER.register_module(module_name=LR_Schedulers.LinearWarmup)
class LinearWarmup(BaseWarmup):
    """Linear warmup scheduler.

@@ -54,7 +55,7 @@ class LinearWarmup(BaseWarmup):
        return 1 - k


@LR_SCHEDULER.register_module()
@LR_SCHEDULER.register_module(module_name=LR_Schedulers.ExponentialWarmup)
class ExponentialWarmup(BaseWarmup):
    """Exponential warmup scheduler.


@@ -1 +1,20 @@
from .clip import CLIPTrainer
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .clip import CLIPTrainer

else:
    _import_structure = {'clip': ['CLIPTrainer']}

    import sys

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )

@@ -1 +1,22 @@
from .sequence_classification_trainer import SequenceClassificationTrainer
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .sequence_classification_trainer import SequenceClassificationTrainer

else:
    _import_structure = {
        'sequence_classification_trainer': ['SequenceClassificationTrainer']
    }

    import sys

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )

@@ -3,6 +3,7 @@ from typing import Dict, Optional, Tuple, Union

import numpy as np

from modelscope.metainfo import Trainers
from modelscope.trainers.base import BaseTrainer
from modelscope.trainers.builder import TRAINERS
from modelscope.utils.logger import get_logger
@@ -11,7 +12,7 @@ PATH = None
logger = get_logger(PATH)


@TRAINERS.register_module(module_name=r'bert-sentiment-analysis')
@TRAINERS.register_module(module_name=Trainers.bert_sentiment_analysis)
class SequenceClassificationTrainer(BaseTrainer):

    def __init__(self, cfg_file: str, *args, **kwargs):

@@ -6,6 +6,7 @@ from torch import nn
from torch.utils.data import Dataset

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.metainfo import Trainers
from modelscope.metrics.builder import build_metric
from modelscope.models.base import Model, TorchModel
from modelscope.msdatasets import MsDataset
@@ -17,7 +18,7 @@ from .base import TRAINERS
from .trainer import EpochBasedTrainer


@TRAINERS.register_module(module_name='NlpEpochBasedTrainer')
@TRAINERS.register_module(module_name=Trainers.nlp_base_trainer)
class NlpEpochBasedTrainer(EpochBasedTrainer):

    def __init__(
@@ -142,7 +143,7 @@ class NlpEpochBasedTrainer(EpochBasedTrainer):
        return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task))


@TRAINERS.register_module(module_name='VecoTrainer')
@TRAINERS.register_module(module_name=Trainers.nlp_veco_trainer)
class VecoTrainer(NlpEpochBasedTrainer):

    def evaluate(self, checkpoint_path=None):

@@ -17,6 +17,7 @@ from torch.utils.data import DataLoader, Dataset
from torch.utils.data.distributed import DistributedSampler

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.metainfo import Trainers
from modelscope.metrics import build_metric, task_default_metrics
from modelscope.models.base import Model, TorchModel
from modelscope.msdatasets.ms_dataset import MsDataset
@@ -45,7 +46,7 @@ from .parallel.builder import build_parallel
from .parallel.utils import is_parallel


@TRAINERS.register_module()
@TRAINERS.register_module(module_name=Trainers.default)
class EpochBasedTrainer(BaseTrainer):
    """Epoch based Trainer, a training helper for PyTorch.


@@ -5,6 +5,7 @@ import importlib
import os
import os.path as osp
import time
import traceback
from functools import reduce
from typing import Generator, Union

@@ -13,8 +14,9 @@ import json

from modelscope import __version__
from modelscope.fileio.file import LocalStorage
from modelscope.metainfo import (Heads, Metrics, Models, Pipelines,
                                 Preprocessors, TaskModels, Trainers)
from modelscope.metainfo import (Heads, Hooks, LR_Schedulers, Metrics, Models,
                                 Optimizers, Pipelines, Preprocessors,
                                 TaskModels, Trainers)
from modelscope.utils.constant import Fields, Tasks
from modelscope.utils.file_utils import get_default_cache_dir
from modelscope.utils.logger import get_logger
@@ -28,7 +30,8 @@ MODELSCOPE_PATH = '/'.join(os.path.dirname(__file__).split('/')[:-1])
REGISTER_MODULE = 'register_module'
IGNORED_PACKAGES = ['modelscope', '.']
SCAN_SUB_FOLDERS = [
    'models', 'metrics', 'pipelines', 'preprocessors', 'task_datasets'
    'models', 'metrics', 'pipelines', 'preprocessors', 'task_datasets',
    'trainers'
]
INDEXER_FILE = 'ast_indexer'
DECORATOR_KEY = 'decorators'
@@ -305,9 +308,11 @@ class AstScaning(object):
        output = [functions[0]]

        if len(args_list) == 0 and len(keyword_list) == 0:
            args_list.append(None)
            args_list.append(default_group)
        if len(keyword_list) == 0 and len(args_list) == 1:
            args_list.append(None)
        if len(keyword_list) == 1 and len(args_list) == 0:
            args_list.append(default_group)

        args_list.extend(keyword_list)

@@ -318,6 +323,8 @@ class AstScaning(object):
            # the case (default_group)
            elif item[1] is None:
                output.append(item[0])
            elif isinstance(item, str):
                output.append(item)
            else:
                output.append('.'.join(item))
        return (output[0], self._get_registry_value(output[1]),
@@ -443,9 +450,11 @@ class FilesAstScaning(object):
        try:
            output = self.astScaner.generate_ast(file)
        except Exception as e:
            detail = traceback.extract_tb(e.__traceback__)
            raise Exception(
                'During ast indexing, there are index errors in the '
                f'file {file} : {type(e).__name__}.{e}')
                f'During ast indexing, error is in the file {detail[-1].filename}'
                f' line: {detail[-1].lineno}: "{detail[-1].line}" with error msg: '
                f'"{type(e).__name__}: {e}"')

        import_list = self.parse_import(output)
        return output[DECORATOR_KEY], import_list
@@ -523,14 +532,14 @@ class FilesAstScaning(object):
        return md5.hexdigest()


fileScaner = FilesAstScaning()
file_scanner = FilesAstScaning()


def _save_index(index, file_path):
    # convert tuple key to str key
    index[INDEX_KEY] = {str(k): v for k, v in index[INDEX_KEY].items()}
    index[VERSION_KEY] = __version__
    index[MD5_KEY] = fileScaner.files_mtime_md5()
    index[MD5_KEY] = file_scanner.files_mtime_md5()
    json_index = json.dumps(index)
    storage.write(json_index.encode(), file_path)
    index[INDEX_KEY] = {
@@ -579,7 +588,7 @@ def load_index(force_rebuild=False):
    index = None
    if not force_rebuild and os.path.exists(file_path):
        wrapped_index = _load_index(file_path)
        md5 = fileScaner.files_mtime_md5()
        md5 = file_scanner.files_mtime_md5()
        if (wrapped_index[VERSION_KEY] == __version__
                and wrapped_index[MD5_KEY] == md5):
            index = wrapped_index
@@ -591,7 +600,7 @@ def load_index(force_rebuild=False):
        logger.info(
            f'No valid ast index found from {file_path}, rebuilding ast index!'
        )
    index = fileScaner.get_files_scan_results()
    index = file_scanner.get_files_scan_results()
    _save_index(index, file_path)
    return index


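For context (not part of the diff): because trainer classes are now only registered when their module is actually imported, the AST indexer above must also scan the trainers folder; adding 'trainers' to SCAN_SUB_FOLDERS lets the register_module decorators be discovered without importing the code. If the cached index ever gets stale after local edits, it can be rebuilt explicitly; the import path below assumes this file is modelscope/utils/ast_utils.py:

from modelscope.utils.ast_utils import load_index

# Force a rescan of SCAN_SUB_FOLDERS and rewrite the cached 'ast_indexer' file.
index = load_index(force_rebuild=True)
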
@@ -7,4 +7,6 @@ pycocotools>=2.0.4
# which introduced compatability issues that are being investigated
rouge_score<=0.0.4
timm
tokenizers
torchvision
transformers>=4.12.0

@@ -6,3 +6,5 @@ pai-easynlp
rouge_score<=0.0.4
seqeval
spacy>=2.3.5
tokenizers
transformers>=4.12.0

@@ -13,7 +13,5 @@ requests
scipy
setuptools
tensorboard
tokenizers
tqdm>=4.64.0
transformers>=4.12.0
yapf

@@ -10,6 +10,7 @@ import numpy as np
import torch
from torch import nn

from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer
from modelscope.utils.constant import LogKeys, ModelFile
from modelscope.utils.test_utils import create_dummy_test_dataset
@@ -73,7 +74,7 @@ class TensorboardHookTest(unittest.TestCase):
        with open(config_path, 'w') as f:
            json.dump(json_cfg, f)

        trainer_name = 'EpochBasedTrainer'
        trainer_name = Trainers.default
        kwargs = dict(
            cfg_file=config_path,
            model=DummyModel(),

@@ -9,6 +9,7 @@ import numpy as np
import torch
from torch import nn

from modelscope.metainfo import Trainers
from modelscope.metrics.builder import METRICS, MetricKeys
from modelscope.trainers import build_trainer
from modelscope.utils.constant import LogKeys, ModelFile
@@ -108,7 +109,7 @@ class CheckpointHookTest(unittest.TestCase):
        with open(config_path, 'w') as f:
            json.dump(json_cfg, f)

        trainer_name = 'EpochBasedTrainer'
        trainer_name = Trainers.default
        kwargs = dict(
            cfg_file=config_path,
            model=DummyModel(),
@@ -179,7 +180,7 @@ class BestCkptSaverHookTest(unittest.TestCase):
        with open(config_path, 'w') as f:
            json.dump(json_cfg, f)

        trainer_name = 'EpochBasedTrainer'
        trainer_name = Trainers.default
        kwargs = dict(
            cfg_file=config_path,
            model=DummyModel(),

@@ -9,6 +9,7 @@ import numpy as np
import torch
from torch import nn

from modelscope.metainfo import Trainers
from modelscope.metrics.builder import METRICS, MetricKeys
from modelscope.trainers import build_trainer
from modelscope.utils.constant import LogKeys, ModelFile
@@ -97,7 +98,7 @@ class EvaluationHookTest(unittest.TestCase):
        with open(config_path, 'w') as f:
            json.dump(json_cfg, f)

        trainer_name = 'EpochBasedTrainer'
        trainer_name = Trainers.default
        kwargs = dict(
            cfg_file=config_path,
            model=DummyModel(),

@@ -11,6 +11,7 @@ from torch import nn
from torch.optim import SGD
from torch.optim.lr_scheduler import MultiStepLR, ReduceLROnPlateau

from modelscope.metainfo import Trainers
from modelscope.metrics.builder import METRICS, MetricKeys
from modelscope.trainers import build_trainer
from modelscope.utils.constant import LogKeys, ModelFile, TrainerStages
@@ -89,7 +90,7 @@ class LrSchedulerHookTest(unittest.TestCase):
        model = DummyModel()
        optimizer = SGD(model.parameters(), lr=0.01)
        lr_scheduler = MultiStepLR(optimizer, milestones=[2, 4])
        trainer_name = 'EpochBasedTrainer'
        trainer_name = Trainers.default
        kwargs = dict(
            cfg_file=config_path,
            model=model,
@@ -161,7 +162,7 @@ class LrSchedulerHookTest(unittest.TestCase):
        model = DummyModel()
        # optimmizer = SGD(model.parameters(), lr=0.01)
        # lr_scheduler = MultiStepLR(optimmizer, milestones=[2, 4])
        trainer_name = 'EpochBasedTrainer'
        trainer_name = Trainers.default
        kwargs = dict(
            cfg_file=config_path,
            model=model,
@@ -258,7 +259,7 @@ class PlateauLrSchedulerHookTest(unittest.TestCase):

        model = DummyModel()
        optimizer = SGD(model.parameters(), lr=0.01)
        trainer_name = 'EpochBasedTrainer'
        trainer_name = Trainers.default
        kwargs = dict(
            cfg_file=config_path,
            model=model,

@@ -11,6 +11,7 @@ from torch import nn
from torch.optim import SGD
from torch.optim.lr_scheduler import MultiStepLR

from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer
from modelscope.utils.constant import ModelFile, TrainerStages
from modelscope.utils.test_utils import create_dummy_test_dataset
@@ -64,7 +65,7 @@ class OptimizerHookTest(unittest.TestCase):
        model = DummyModel()
        optimizer = SGD(model.parameters(), lr=0.01)
        lr_scheduler = MultiStepLR(optimizer, milestones=[1, 2])
        trainer_name = 'EpochBasedTrainer'
        trainer_name = Trainers.default
        kwargs = dict(
            cfg_file=config_path,
            model=model,
@@ -130,7 +131,7 @@ class TorchAMPOptimizerHookTest(unittest.TestCase):
        model = DummyModel().cuda()
        optimizer = SGD(model.parameters(), lr=0.01)
        lr_scheduler = MultiStepLR(optimizer, milestones=[1, 2])
        trainer_name = 'EpochBasedTrainer'
        trainer_name = Trainers.default
        kwargs = dict(
            cfg_file=config_path,
            model=model,

@@ -11,6 +11,7 @@ from torch import nn
from torch.optim import SGD
from torch.optim.lr_scheduler import MultiStepLR

from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer
from modelscope.utils.constant import LogKeys, ModelFile, TrainerStages
from modelscope.utils.test_utils import create_dummy_test_dataset
@@ -68,7 +69,7 @@ class IterTimerHookTest(unittest.TestCase):
        model = DummyModel()
        optimizer = SGD(model.parameters(), lr=0.01)
        lr_scheduler = MultiStepLR(optimizer, milestones=[2, 4])
        trainer_name = 'EpochBasedTrainer'
        trainer_name = Trainers.default
        kwargs = dict(
            cfg_file=config_path,
            model=model,

@@ -4,6 +4,7 @@ import shutil
import tempfile
import unittest

from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer


@@ -23,7 +24,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
                 model_id,
                 train_dataset,
                 eval_dataset,
                 name='NlpEpochBasedTrainer',
                 name=Trainers.nlp_base_trainer,
                 cfg_modify_fn=None,
                 **kwargs):
        kwargs = dict(
@@ -236,7 +237,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase):
            'damo/nlp_veco_fill-mask-large',
            train_datasets,
            eval_datasets,
            name='VecoTrainer',
            name=Trainers.nlp_veco_trainer,
            cfg_modify_fn=cfg_modify_fn)


@@ -5,6 +5,7 @@ import tempfile
import unittest
from functools import reduce

from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer
from modelscope.utils.test_utils import test_level

@@ -25,7 +26,7 @@ class TestFinetuneTokenClassification(unittest.TestCase):
                 model_id,
                 train_dataset,
                 eval_dataset,
                 name='NlpEpochBasedTrainer',
                 name=Trainers.nlp_base_trainer,
                 cfg_modify_fn=None,
                 **kwargs):
        kwargs = dict(

@@ -7,6 +7,7 @@ import zipfile
from functools import partial

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.metainfo import Trainers
from modelscope.models.cv.image_instance_segmentation import (
    CascadeMaskRCNNSwinModel, ImageInstanceSegmentationCocoDataset)
from modelscope.trainers import build_trainer
@@ -79,7 +80,7 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase):
            work_dir=self.tmp_dir)

        trainer = build_trainer(
            name='image-instance-segmentation', default_args=kwargs)
            name=Trainers.image_instance_segmentation, default_args=kwargs)
        trainer.train()
        results_files = os.listdir(self.tmp_dir)
        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
@@ -103,7 +104,7 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase):
            work_dir=self.tmp_dir)

        trainer = build_trainer(
            name='image-instance-segmentation', default_args=kwargs)
            name=Trainers.image_instance_segmentation, default_args=kwargs)
        trainer.train()
        results_files = os.listdir(self.tmp_dir)
        self.assertIn(f'{trainer.timestamp}.log.json', results_files)

@@ -11,6 +11,7 @@ import torch
from torch.utils import data as data

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.metainfo import Trainers
from modelscope.models.cv.image_portrait_enhancement import \
    ImagePortraitEnhancement
from modelscope.trainers import build_trainer
@@ -91,7 +92,8 @@ class TestImagePortraitEnhancementTrainer(unittest.TestCase):
            device='gpu',
            work_dir=self.tmp_dir)

        trainer = build_trainer(name='gpen', default_args=kwargs)
        trainer = build_trainer(
            name=Trainers.image_portrait_enhancement, default_args=kwargs)
        trainer.train()

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
@@ -111,7 +113,8 @@ class TestImagePortraitEnhancementTrainer(unittest.TestCase):
            max_epochs=2,
            work_dir=self.tmp_dir)

        trainer = build_trainer(name='gpen', default_args=kwargs)
        trainer = build_trainer(
            name=Trainers.image_portrait_enhancement, default_args=kwargs)
        trainer.train()


@@ -5,6 +5,7 @@ import tempfile
import unittest

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.metainfo import Trainers
from modelscope.models.nlp.palm_v2 import PalmForTextGeneration
from modelscope.msdatasets import MsDataset
from modelscope.trainers import build_trainer
@@ -57,7 +58,7 @@ class TestTextGenerationTrainer(unittest.TestCase):
            work_dir=self.tmp_dir)

        trainer = build_trainer(
            name='NlpEpochBasedTrainer', default_args=kwargs)
            name=Trainers.nlp_base_trainer, default_args=kwargs)
        trainer.train()
        results_files = os.listdir(self.tmp_dir)
        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
@@ -122,7 +123,7 @@ class TestTextGenerationTrainer(unittest.TestCase):
            cfg_modify_fn=cfg_modify_fn,
            model_revision='beta')
        trainer = build_trainer(
            name='NlpEpochBasedTrainer', default_args=kwargs)
            name=Trainers.nlp_base_trainer, default_args=kwargs)
        trainer.train()


@@ -13,6 +13,7 @@ from torch import nn
from torch.optim import SGD
from torch.optim.lr_scheduler import StepLR

from modelscope.metainfo import Trainers
from modelscope.metrics.builder import MetricKeys
from modelscope.msdatasets import MsDataset
from modelscope.trainers import build_trainer
@@ -101,14 +102,14 @@ class TrainerTest(unittest.TestCase):
                    'workers_per_gpu': 1,
                    'shuffle': False
                },
                'metrics': ['seq_cls_metric']
                'metrics': ['seq-cls-metric']
            }
        }
        config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION)
        with open(config_path, 'w') as f:
            json.dump(json_cfg, f)

        trainer_name = 'EpochBasedTrainer'
        trainer_name = Trainers.default
        kwargs = dict(
            cfg_file=config_path,
            model=DummyModel(),
@@ -155,7 +156,7 @@ class TrainerTest(unittest.TestCase):
                    'workers_per_gpu': 1,
                    'shuffle': False
                },
                'metrics': ['seq_cls_metric']
                'metrics': ['seq-cls-metric']
            }
        }

@@ -166,7 +167,7 @@ class TrainerTest(unittest.TestCase):
        model = DummyModel()
        optimmizer = SGD(model.parameters(), lr=0.01)
        lr_scheduler = StepLR(optimmizer, 2)
        trainer_name = 'EpochBasedTrainer'
        trainer_name = Trainers.default
        kwargs = dict(
            cfg_file=config_path,
            model=model,
@@ -205,7 +206,7 @@ class TrainerTest(unittest.TestCase):
                    'workers_per_gpu': 1,
                    'shuffle': False
                },
                'metrics': ['seq_cls_metric']
                'metrics': ['seq-cls-metric']
            }
        }

@@ -216,7 +217,7 @@ class TrainerTest(unittest.TestCase):
        model = DummyModel()
        optimmizer = SGD(model.parameters(), lr=0.01)
        lr_scheduler = StepLR(optimmizer, 2)
        trainer_name = 'EpochBasedTrainer'
        trainer_name = Trainers.default
        kwargs = dict(
            cfg_file=config_path,
            model=model,

@@ -12,8 +12,9 @@ from torch import nn
from torch.optim import SGD
from torch.optim.lr_scheduler import StepLR

from modelscope.metainfo import Trainers
from modelscope.metrics.builder import MetricKeys
from modelscope.trainers import build_trainer
from modelscope.trainers import EpochBasedTrainer, build_trainer
from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile
from modelscope.utils.test_utils import (DistributedTestCase,
                                         create_dummy_test_dataset, test_level)
@@ -70,7 +71,7 @@ def train_func(work_dir, dist=False):
    model = DummyModel()
    optimmizer = SGD(model.parameters(), lr=0.01)
    lr_scheduler = StepLR(optimmizer, 2)
    trainer_name = 'EpochBasedTrainer'
    trainer_name = Trainers.default
    kwargs = dict(
        cfg_file=config_path,
        model=model,
