Mirror of https://github.com/modelscope/modelscope.git
[to #43850241] fix processor and collate_fn
Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9644184
* fix distributed training and eval
@@ -36,20 +36,8 @@ class NAFNetForImageDenoise(TorchModel):
        model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
        self.model = NAFNet(**self.config.model.network_g)
        self.loss = PSNRLoss()

        if torch.cuda.is_available():
            self._device = torch.device('cuda')
        else:
            self._device = torch.device('cpu')

        self.model = self.model.to(self._device)
        self.model = self._load_pretrained(self.model, model_path)

        if self.training:
            self.model.train()
        else:
            self.model.eval()

    def _load_pretrained(self,
                         net,
                         load_path,
@@ -109,8 +97,6 @@ class NAFNetForImageDenoise(TorchModel):
        Returns:
            Dict[str, Tensor]: results
        """
        for key, value in inputs.items():
            inputs[key] = inputs[key].to(self._device)
        if self.training:
            return self._train_forward(**inputs)
        elif 'target' in inputs:
@@ -6,7 +6,7 @@ from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING:
    from .base import Preprocessor
    from .builder import PREPROCESSORS, build_preprocessor
    from .common import Compose
    from .common import Compose, ToTensor, Filter
    from .asr import WavToScp
    from .audio import LinearAECAndFbank
    from .image import (LoadImage, load_image,
@@ -33,7 +33,7 @@ else:
    _import_structure = {
        'base': ['Preprocessor'],
        'builder': ['PREPROCESSORS', 'build_preprocessor'],
        'common': ['Compose'],
        'common': ['Compose', 'ToTensor', 'Filter'],
        'audio': ['LinearAECAndFbank'],
        'asr': ['WavToScp'],
        'video': ['ReadVideoData'],
@@ -2,6 +2,10 @@

import time
from collections.abc import Sequence
from typing import Mapping

import numpy as np
import torch

from .builder import PREPROCESSORS, build_preprocessor
@@ -25,12 +29,18 @@ class Compose(object):
            if isinstance(transform, dict):
                if self.field_name is None:
                    transform = build_preprocessor(transform, field_name)
                    self.transforms.append(transform)
                else:
                    # if not found key in field_name, try field_name=None(default_group)
                    try:
                        transform = build_preprocessor(transform, field_name)
                    except KeyError:
                        transform = build_preprocessor(transform, None)
            elif callable(transform):
                self.transforms.append(transform)
                pass
            else:
                raise TypeError('transform must be callable or a dict, but got'
                                f' {type(transform)}')
            self.transforms.append(transform)

    def __call__(self, data):
        for t in self.transforms:
@@ -52,3 +62,82 @@ class Compose(object):
            format_string += f'\n {t}'
        format_string += '\n)'
        return format_string


def to_tensor(data):
    """Convert objects of various python types to :obj:`torch.Tensor`.

    Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
    :class:`Sequence`, :class:`int` and :class:`float`.

    Args:
        data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
            be converted.
    """

    if isinstance(data, torch.Tensor):
        return data
    elif isinstance(data, np.ndarray):
        return torch.from_numpy(data)
    elif isinstance(data, Sequence) and not isinstance(data, str):
        return torch.tensor(data)
    elif isinstance(data, int):
        return torch.LongTensor([data])
    elif isinstance(data, float):
        return torch.FloatTensor([data])
    else:
        raise TypeError(f'type {type(data)} cannot be converted to tensor.')


@PREPROCESSORS.register_module()
class ToTensor(object):
    """Convert target objects to tensor.

    Args:
        keys (Sequence[str]): Keys of the data to be converted to Tensor.
            Only valid when the data is a `Mapping`. If `keys` is None,
            all values will be converted to tensor by default.
    """

    def __init__(self, keys=None):
        self.keys = keys

    def __call__(self, data):
        if isinstance(data, Mapping):
            if self.keys is None:
                self.keys = list(data.keys())

            for key in self.keys:
                data[key] = to_tensor(data[key])
        else:
            data = to_tensor(data)

        return data

    def __repr__(self):
        return self.__class__.__name__ + f'(keys={self.keys})'


@PREPROCESSORS.register_module()
class Filter(object):
    """This is usually the last stage of the dataloader transforms.
    Only data under the reserved keys is kept and passed directly to the model; everything else is removed.

    Args:
        reserved_keys (Sequence[str]): Keys of the data to be reserved; others will be removed.
    """

    def __init__(self, reserved_keys):
        self.reserved_keys = reserved_keys

    def __call__(self, data):
        assert isinstance(data, Mapping)

        reserved_data = {}
        for key in self.reserved_keys:
            reserved_data[key] = data[key]

        return reserved_data

    def __repr__(self):
        return self.__class__.__name__ + f'(keys={self.reserved_keys})'
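ToTensor and Filter are the two preprocessor transforms this commit adds and exports from modelscope.preprocessors (see the _import_structure hunk above); the unit tests added later in this commit exercise them exactly as in this short sketch.

import torch
from modelscope.preprocessors import Filter, ToTensor

sample = {'img': [1, 2, 3], 'label': 1, 'path': 'test.jpg'}
sample = ToTensor(keys=['img'])(sample)                   # only 'img' becomes a torch.Tensor
sample = Filter(reserved_keys=['img', 'label'])(sample)   # 'path' is dropped
assert isinstance(sample['img'], torch.Tensor) and 'path' not in sample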
@@ -151,6 +151,11 @@ class ImageDenoisePreprocessor(Preprocessor):
        super().__init__(*args, **kwargs)
        self.model_dir: str = model_dir

        from .common import Filter

        # TODO: `Filter` should be moved to the configuration file of each model
        self._transforms = [Filter(reserved_keys=['input', 'target'])]

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """process the raw input data
@@ -160,6 +165,9 @@ class ImageDenoisePreprocessor(Preprocessor):
        Returns:
            Dict[str, Any]: the preprocessed data
        """
        for t in self._transforms:
            data = t(data)

        return data
@@ -4,6 +4,7 @@ import os.path as osp
import uuid
from typing import Any, Dict, Iterable, Optional, Tuple, Union

import numpy as np
from transformers import AutoTokenizer

from modelscope.metainfo import Models, Preprocessors
@@ -191,6 +192,10 @@ class NLPTokenizerPreprocessorBase(Preprocessor):
            text_b,
            return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None,
            **self.tokenize_kwargs)
        output = {
            k: np.array(v) if isinstance(v, list) else v
            for k, v in output.items()
        }
        self.labels_to_id(labels, output)
        return output

@@ -240,13 +245,13 @@ class NLPTokenizerPreprocessorBase(Preprocessor):
        if labels is not None:
            if isinstance(labels, Iterable) and all([label_can_be_mapped(label) for label in labels]) \
                    and self.label2id is not None:
                output[OutputKeys.LABEL] = [
                output[OutputKeys.LABELS] = [
                    self.label2id[str(label)] for label in labels
                ]
            elif label_can_be_mapped(labels) and self.label2id is not None:
                output[OutputKeys.LABEL] = self.label2id[str(labels)]
                output[OutputKeys.LABELS] = self.label2id[str(labels)]
            else:
                output[OutputKeys.LABEL] = labels
                output[OutputKeys.LABELS] = labels


@PREPROCESSORS.register_module(
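The labels_to_id hunk above switches the output key from OutputKeys.LABEL to OutputKeys.LABELS, matching the 'labels' key used by default_collate-based batching and the dummy test datasets elsewhere in this commit. A small illustrative sketch, assuming OutputKeys.LABELS resolves to the string 'labels' and using a made-up label2id map:

# illustrative only: a real output dict also carries input_ids, attention_mask, ...
label2id = {'negative': 0, 'positive': 1}   # hypothetical mapping
output = {'input_ids': [101, 2023, 102]}
labels = 'positive'
output['labels'] = label2id[str(labels)]    # was written under 'label' before this change
assert output['labels'] == 1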
@@ -40,7 +40,6 @@ class ImagePortraitEnhancementTrainer(EpochBasedTrainer):

        train_outputs = dict()
        self._mode = ModeKeys.TRAIN
        inputs = self.collate_fn(inputs)
        # call model forward but not __call__ to skip postprocess
        if isinstance(inputs, Mapping):
            d_loss = model._train_forward_d(**inputs)
@@ -110,9 +110,11 @@ class NlpEpochBasedTrainer(EpochBasedTrainer):
        self.train_keys = build_dataset_keys(
            self.cfg.dataset.train if hasattr(self.cfg, 'dataset')
            and hasattr(self.cfg.dataset, 'train') else None)
        # TODO eval may have special keys, which is not supported for now,
        # because there is only one preprocessor in the trainer, and it only supports one group of keys.
        self.eval_keys = self.train_keys
        self.eval_keys = build_dataset_keys(
            self.cfg.dataset.val if hasattr(self.cfg, 'dataset')
            and hasattr(self.cfg.dataset, 'val') else None)
        if len(self.eval_keys) == 0:
            self.eval_keys = self.train_keys

        super().__init__(
            model=model_dir,
@@ -148,7 +150,7 @@ class NlpEpochBasedTrainer(EpochBasedTrainer):
        elif isinstance(model, nn.Module):
            return model

    def build_preprocessor(self) -> Preprocessor:
    def build_preprocessor(self) -> Tuple[Preprocessor, Preprocessor]:
        """Build the preprocessor.

        User can override this method to implement custom logic.
@@ -159,16 +161,38 @@ class NlpEpochBasedTrainer(EpochBasedTrainer):
        model_args = {} if self.label2id is None else {
            'label2id': self.label2id
        }
        cfg = ConfigDict({
            **getattr(self.cfg, 'preprocessor'),
            'model_dir':
            self.model_dir,
            **model_args,
            'mode':
            ModeKeys.TRAIN,
            **self.train_keys,
        })
        return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task))

        field_name = Tasks.find_field_by_task(self.cfg.task)
        train_preprocessor, eval_preprocessor = None, None
        _train_cfg, _eval_cfg = {}, {}

        if 'type' not in self.cfg.preprocessor and (
                'train' in self.cfg.preprocessor
                or 'val' in self.cfg.preprocessor):
            if 'train' in self.cfg.preprocessor:
                _train_cfg = self.cfg.preprocessor.train
            if 'val' in self.cfg.preprocessor:
                _eval_cfg = self.cfg.preprocessor.val
        else:
            _train_cfg = self.cfg.preprocessor
            _eval_cfg = self.cfg.preprocessor

        if len(_train_cfg):
            _train_cfg.update({
                'model_dir': self.model_dir,
                **model_args,
                **self.train_keys, 'mode': ModeKeys.TRAIN
            })
            train_preprocessor = build_preprocessor(_train_cfg, field_name)
        if len(_eval_cfg):
            _eval_cfg.update({
                'model_dir': self.model_dir,
                **model_args,
                **self.eval_keys, 'mode': ModeKeys.EVAL
            })
            eval_preprocessor = build_preprocessor(_eval_cfg, field_name)

        return train_preprocessor, eval_preprocessor


@TRAINERS.register_module(module_name=Trainers.nlp_veco_trainer)
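NlpEpochBasedTrainer.build_preprocessor now returns a (train, eval) pair and honors an optional train/val split in the cfg.preprocessor section: if the section has no top-level 'type' but contains 'train' and/or 'val' sub-sections, each side is built separately, while a flat section is reused for both. A hedged sketch of such a section, written as a plain dict (the preprocessor type name and fields are illustrative, not taken from a real model configuration):

preprocessor_cfg = {
    'train': {'type': 'some-tokenizer-preprocessor', 'first_sequence': 'sentence'},
    'val': {'type': 'some-tokenizer-preprocessor', 'first_sequence': 'sentence'},
}
# the trainer then injects model_dir, label2id, the dataset keys and the
# ModeKeys.TRAIN / ModeKeys.EVAL mode into each side before calling build_preprocessor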
@@ -5,15 +5,15 @@ import time
from collections.abc import Mapping
from distutils.version import LooseVersion
from functools import partial
from typing import Callable, List, Optional, Tuple, Union
from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union

import json
import numpy as np
import torch
from addict import Dict
from torch import distributed as dist
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.dataloader import default_collate
from torch.utils.data.distributed import DistributedSampler

from modelscope.hub.snapshot_download import snapshot_download
@@ -21,8 +21,9 @@ from modelscope.metainfo import Trainers
from modelscope.metrics import build_metric, task_default_metrics
from modelscope.models.base import Model, TorchModel
from modelscope.msdatasets.ms_dataset import MsDataset
from modelscope.preprocessors import build_preprocessor
from modelscope.preprocessors.base import Preprocessor
from modelscope.preprocessors.builder import build_preprocessor
from modelscope.preprocessors.common import Compose
from modelscope.task_datasets.builder import build_task_dataset
from modelscope.task_datasets.torch_base_dataset import TorchTaskDataset
from modelscope.trainers.hooks.builder import HOOKS
@@ -30,14 +31,15 @@ from modelscope.trainers.hooks.priority import Priority, get_priority
from modelscope.trainers.lrscheduler.builder import build_lr_scheduler
from modelscope.trainers.optimizer.builder import build_optimizer
from modelscope.utils.config import Config, ConfigDict
from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, Hubs, ModeKeys,
                                       ModelFile, Tasks, TrainerStages)
from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigFields,
                                       ConfigKeys, Hubs, ModeKeys, ModelFile,
                                       Tasks, TrainerStages)
from modelscope.utils.data_utils import to_device
from modelscope.utils.file_utils import func_receive_dict_inputs
from modelscope.utils.logger import get_logger
from modelscope.utils.registry import build_from_cfg
from modelscope.utils.tensor_utils import torch_default_data_collator
from modelscope.utils.torch_utils import (broadcast, create_device,
                                          get_dist_info, init_dist)
from modelscope.utils.torch_utils import (create_device, get_dist_info,
                                          init_dist)
from .base import BaseTrainer
from .builder import TRAINERS
from .default_config import DEFAULT_CONFIG
@@ -83,7 +85,8 @@ class EpochBasedTrainer(BaseTrainer):
                 data_collator: Optional[Callable] = None,
                 train_dataset: Optional[Union[MsDataset, Dataset]] = None,
                 eval_dataset: Optional[Union[MsDataset, Dataset]] = None,
                 preprocessor: Optional[Preprocessor] = None,
                 preprocessor: Optional[Union[Preprocessor,
                                              Dict[str, Preprocessor]]] = None,
                 optimizers: Tuple[torch.optim.Optimizer,
                                   torch.optim.lr_scheduler._LRScheduler] = (None,
                                                                             None),
@@ -120,24 +123,46 @@ class EpochBasedTrainer(BaseTrainer):
        else:
            self.work_dir = self.cfg.train.get('work_dir', './work_dir')

        self.preprocessor = None
        self.train_preprocessor, self.eval_preprocessor = None, None
        if isinstance(preprocessor, Preprocessor):
            self.preprocessor = preprocessor
        elif hasattr(self.cfg, 'preprocessor'):
            self.preprocessor = self.build_preprocessor()
        if self.preprocessor is not None:
            self.preprocessor.mode = ModeKeys.TRAIN
            self.train_preprocessor = preprocessor
            self.eval_preprocessor = preprocessor
        elif isinstance(preprocessor, Mapping):
            if not (ConfigKeys.train in preprocessor
                    or ConfigKeys.val in preprocessor):
                raise ValueError(
                    f'Preprocessor must split with `{ConfigKeys.train}` and `{ConfigKeys.val}` keys!'
                )
            if ConfigKeys.train in preprocessor:
                assert isinstance(preprocessor[ConfigKeys.train], Preprocessor)
                self.train_preprocessor = preprocessor[ConfigKeys.train]
            if ConfigKeys.val in preprocessor:
                assert isinstance(preprocessor[ConfigKeys.val], Preprocessor)
                self.eval_preprocessor = preprocessor[ConfigKeys.val]
        elif hasattr(self.cfg, ConfigFields.preprocessor):
            self.train_preprocessor, self.eval_preprocessor = self.build_preprocessor(
            )

        if self.train_preprocessor is not None:
            self.train_preprocessor.mode = ModeKeys.TRAIN
        if self.eval_preprocessor is not None:
            self.eval_preprocessor.mode = ModeKeys.EVAL

        device_name = kwargs.get('device', 'gpu')
        assert device_name in ['gpu',
                               'cpu'], 'device should be either cpu or gpu.'
        self.device = create_device(device_name == 'cpu')

        self.train_dataset = self.to_task_dataset(
            train_dataset, mode=ModeKeys.TRAIN, preprocessor=self.preprocessor)
            train_dataset,
            mode=ModeKeys.TRAIN,
            preprocessor=self.train_preprocessor)
        self.eval_dataset = self.to_task_dataset(
            eval_dataset, mode=ModeKeys.EVAL, preprocessor=self.preprocessor)
            eval_dataset,
            mode=ModeKeys.EVAL,
            preprocessor=self.eval_preprocessor)

        self.data_collator = data_collator if data_collator is not None else torch_default_data_collator
        self.data_collator = data_collator if data_collator is not None else default_collate
        self.metrics = self.get_metrics()
        self._metric_values = None
        self.optimizers = optimizers
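Besides a single Preprocessor instance, the constructor now also accepts a mapping keyed by ConfigKeys.train / ConfigKeys.val, so training and evaluation can use different preprocessors. A hedged fragment of the call (the model id, datasets and the two preprocessor instances are placeholders assumed to exist):

from modelscope.trainers import build_trainer
from modelscope.utils.constant import ConfigKeys

kwargs = dict(
    model='damo/some-model-id',                 # illustrative model id
    train_dataset=train_ds,                     # assumed MsDataset / Dataset objects
    eval_dataset=eval_ds,
    preprocessor={ConfigKeys.train: train_pre,  # assumed Preprocessor instances
                  ConfigKeys.val: eval_pre},
)
trainer = build_trainer(default_args=kwargs)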
@@ -229,12 +254,12 @@ class EpochBasedTrainer(BaseTrainer):
            return datasets
        elif isinstance(datasets, MsDataset):
            datasets = datasets.to_torch_dataset(
                preprocessors=self.preprocessor)
                preprocessors=preprocessor)
            return datasets
        elif isinstance(datasets, List) and isinstance(
                datasets[0], MsDataset):
            datasets = [
                d.to_torch_dataset(preprocessor=self.preprocessor)
                d.to_torch_dataset(preprocessor=preprocessor)
                for d in datasets
            ]
            cfg = ConfigDict(
@@ -258,24 +283,44 @@ class EpochBasedTrainer(BaseTrainer):
        else:
            return datasets

    def build_preprocessor(self) -> Preprocessor:
        """Build the preprocessor.
    def build_preprocessor(self) -> Tuple[Preprocessor, Preprocessor]:
        """Build train and eval preprocessor.

        User can override this method to implement custom logic.

        Returns: The preprocessor instance.
        Returns: The train preprocessor and eval preprocessor instance.

        """
        # TODO @wenmeng.zwm @jiangnana.jnn add support for different preprocessor
        # when they are different ones in training and evaluation
        cfg = ConfigDict({
            **getattr(self.cfg, 'preprocessor'),
            'model_dir':
            self.model_dir,
            'mode':
            ModeKeys.TRAIN,
        })
        return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task))
        field_name = Tasks.find_field_by_task(self.cfg.task)
        train_preprocessor, eval_preprocessor = None, None
        _train_cfg, _eval_cfg = {}, {}
        _dafault_args = {'model_dir': self.model_dir}

        if 'type' not in self.cfg.preprocessor and (
                'train' in self.cfg.preprocessor
                or 'val' in self.cfg.preprocessor):
            if 'train' in self.cfg.preprocessor:
                _train_cfg = self.cfg.preprocessor.train
            if 'val' in self.cfg.preprocessor:
                _eval_cfg = self.cfg.preprocessor.val
        else:
            _train_cfg = self.cfg.preprocessor
            _eval_cfg = self.cfg.preprocessor

        if len(_train_cfg):
            if isinstance(_train_cfg, Sequence):
                # TODO: for Sequence, need to adapt to `mode` and `mode_dir` args,
                # and add mode for Compose or other plans
                raise NotImplementedError('Not supported yet!')
            _train_cfg.update(_dafault_args)
            train_preprocessor = build_preprocessor(_train_cfg, field_name)
        if len(_eval_cfg):
            if isinstance(_eval_cfg, Sequence):
                raise NotImplementedError('Not supported yet!')
            _eval_cfg.update(_dafault_args)
            eval_preprocessor = build_preprocessor(_eval_cfg, field_name)

        return train_preprocessor, eval_preprocessor

    def get_metrics(self) -> List[str]:
        """Get the metric class types.
@@ -373,34 +418,6 @@ class EpochBasedTrainer(BaseTrainer):

        return build_parallel(dp_cfg)

    def collate_fn(self, data):
        """Prepare the input just before the forward function.
        This method will move the tensors to the right device.
        Usually this method does not need to be overridden.

        Args:
            data: The data out of the dataloader.

        Returns: The processed data.

        """
        from torch.utils.data.dataloader import default_collate
        if isinstance(data, dict) or isinstance(data, Mapping):
            return type(data)({k: self.collate_fn(v) for k, v in data.items()})
        elif isinstance(data, (tuple, list)):
            if isinstance(data[0], (int, float)):
                return default_collate(data).to(self.device)
            else:
                return type(data)(self.collate_fn(v) for v in data)
        elif isinstance(data, np.ndarray):
            return self.collate_fn(torch.from_numpy(data))
        elif isinstance(data, torch.Tensor):
            return data.to(self.device)
        elif isinstance(data, (str, int, float, bool)):
            return data
        else:
            raise ValueError(f'Unsupported data type {type(data)}')

    def train_step(self, model, inputs):
        """ Perform a training step on a batch of inputs.
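The collate_fn shown above is removed by this hunk: batching is now left to torch's default_collate (wired in as the trainer's data_collator earlier in this diff) and the device move is done separately with the new to_device helper inside the train and eval loops. A minimal runnable sketch of that split:

import torch
from torch.utils.data.dataloader import default_collate

from modelscope.utils.data_utils import to_device  # helper added by this commit

samples = [{'feat': torch.rand(4), 'labels': torch.tensor([1])} for _ in range(2)]
batch = default_collate(samples)        # collation only: feat -> (2, 4), labels -> (2, 1)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch = to_device(batch, device)        # device placement handled separately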
@@ -421,7 +438,6 @@ class EpochBasedTrainer(BaseTrainer):
        # TODO: find more pretty way to change mode
        model.train()
        self._mode = ModeKeys.TRAIN
        inputs = self.collate_fn(inputs)
        # call model forward but not __call__ to skip postprocess
        if isinstance(inputs,
                      Mapping) and not func_receive_dict_inputs(model.forward):
@@ -486,7 +502,9 @@ class EpochBasedTrainer(BaseTrainer):
        if self.train_dataset is None:
            train_data = self.cfg.dataset.train
            self.train_dataset = self.build_dataset(
                train_data, mode=ModeKeys.TRAIN)
                train_data,
                mode=ModeKeys.TRAIN,
                preprocessor=self.train_preprocessor)

        data_loader = self._build_dataloader_with_dataset(
            self.train_dataset,
@@ -505,7 +523,9 @@ class EpochBasedTrainer(BaseTrainer):
        if self.eval_dataset is None:
            val_data = self.cfg.dataset.val
            self.eval_dataset = self.build_dataset(
                val_data, mode=ModeKeys.EVAL)
                val_data,
                mode=ModeKeys.EVAL,
                preprocessor=self.eval_preprocessor)

        batch_size = self.cfg.evaluation.batch_size
        workers = self.cfg.evaluation.workers
@@ -521,7 +541,7 @@ class EpochBasedTrainer(BaseTrainer):
        )
        return data_loader

    def build_dataset(self, data_cfg, mode):
    def build_dataset(self, data_cfg, mode, preprocessor=None):
        """ Build torch dataset object using data config
        """
        dataset = MsDataset.load(
@@ -531,8 +551,7 @@ class EpochBasedTrainer(BaseTrainer):
            data_cfg, 'subset_name') else None,
            hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope,
        )
        torch_dataset = dataset.to_torch_dataset(
            preprocessors=self.preprocessor, )
        torch_dataset = dataset.to_torch_dataset(preprocessors=preprocessor)
        dataset = self.to_task_dataset(torch_dataset, mode)
        return dataset

@@ -698,6 +717,7 @@ class EpochBasedTrainer(BaseTrainer):
        self.invoke_hook(TrainerStages.before_train_epoch)
        time.sleep(2)  # Prevent possible deadlock during epoch transition
        for i, data_batch in enumerate(data_loader):
            data_batch = to_device(data_batch, self.device)
            self.data_batch = data_batch
            self._inner_iter = i
            self.invoke_hook(TrainerStages.before_train_iter)
@@ -721,16 +741,16 @@ class EpochBasedTrainer(BaseTrainer):
                metric_values = multi_gpu_test(
                    self.model,
                    data_loader,
                    device=self.device,
                    tmpdir=None,
                    gpu_collect=False,
                    data_collate_fn=self.collate_fn,
                    metric_classes=metric_classes)
            else:
                from modelscope.trainers.utils.inference import single_gpu_test
                metric_values = single_gpu_test(
                    self.model,
                    data_loader,
                    data_collate_fn=self.collate_fn,
                    device=self.device,
                    metric_classes=metric_classes)

        return metric_values
@@ -10,21 +10,19 @@ import torch
from torch import distributed as dist
from tqdm import tqdm

from modelscope.utils.data_utils import to_device
from modelscope.utils.file_utils import func_receive_dict_inputs
from modelscope.utils.torch_utils import (broadcast, get_dist_info, is_master,
                                          make_tmp_dir)


def single_gpu_test(model,
                    data_loader,
                    data_collate_fn=None,
                    metric_classes=None):
def single_gpu_test(model, data_loader, device, metric_classes=None):
    """Test model with a single gpu.

    Args:
        model (nn.Module): Model to be tested.
        data_loader (nn.Dataloader): Pytorch data loader.
        data_collate_fn: An optional collate function applied before data is fed into the model
        device (str | torch.device): The target device for the data.
        metric_classes (List): List of Metric classes used to collect metrics

    Returns:
@@ -34,8 +32,7 @@ def single_gpu_test(model,
    dataset = data_loader.dataset
    with tqdm(total=len(dataset), desc='test samples') as pbar:
        for data in data_loader:
            if data_collate_fn is not None:
                data = data_collate_fn(data)
            data = to_device(data, device)
            with torch.no_grad():
                if isinstance(data, Mapping) and not func_receive_dict_inputs(
                        model.forward):
@@ -62,9 +59,9 @@

def multi_gpu_test(model,
                   data_loader,
                   device,
                   tmpdir=None,
                   gpu_collect=False,
                   data_collate_fn=None,
                   metric_classes=None):
    """Test model with multiple gpus.

@@ -77,10 +74,10 @@
    Args:
        model (nn.Module): Model to be tested.
        data_loader (nn.Dataloader): Pytorch data loader.
        device (str | torch.device): The target device for the data.
        tmpdir (str): Path of directory to save the temporary results from
            different gpus under cpu mode.
        gpu_collect (bool): Option to use either gpu or cpu to collect results.
        data_collate_fn: An optional collate function applied before data is fed into the model
        metric_classes (List): List of Metric classes used to collect metrics

    Returns:
@@ -98,8 +95,7 @@ def multi_gpu_test(model,
    count = 0
    with tqdm(total=len(dataset), desc='test samples with multi gpus') as pbar:
        for _, data in enumerate(data_loader):
            if data_collate_fn is not None:
                data = data_collate_fn(data)
            data = to_device(data, device)
            data_list.append(data)
            with torch.no_grad():
                if isinstance(data, Mapping) and not func_receive_dict_inputs(
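single_gpu_test and multi_gpu_test now receive the target device explicitly and move each batch with to_device instead of taking a data_collate_fn. A condensed, single-process sketch of the new call, modeled on tests/trainers/utils/test_inference.py added at the end of this commit (the dummy model and data are illustrative):

import torch
from torch import nn
from torch.utils.data import DataLoader

from modelscope.metrics.sequence_classification_metric import \
    SequenceClassificationMetric
from modelscope.trainers.utils.inference import single_gpu_test


class DummyModel(nn.Module):
    # tiny stand-in returning logits and a loss, as in the new test

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(5, 4)

    def forward(self, feat, labels):
        x = self.linear(feat)
        return dict(logits=x, loss=torch.sum(x))


samples = [dict(feat=torch.rand(5), labels=torch.randint(0, 4, (1, ))) for _ in range(20)]
loader = DataLoader(samples, batch_size=2)   # default_collate batches the dict samples

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
metrics = single_gpu_test(
    DummyModel().to(device),
    loader,
    device=device,
    metric_classes=[SequenceClassificationMetric()])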
@@ -219,6 +219,12 @@ class ConfigFields(object):
    evaluation = 'evaluation'


class ConfigKeys(object):
    """Fixed keywords in configuration file"""
    train = 'train'
    val = 'val'


class Requirements(object):
    """Requirement names for each module
    """
modelscope/utils/data_utils.py (new file, +23 lines)
@@ -0,0 +1,23 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from collections.abc import Mapping

import torch


def to_device(batch, device, non_blocking=False):
    """Put the data to the target cuda device just before the forward function.
    Args:
        batch: The batch data out of the dataloader.
        device (str | torch.device): The target device for the data.

    Returns: The data to the target device.

    """
    if isinstance(batch, dict) or isinstance(batch, Mapping):
        return type(batch)({k: to_device(v, device) for k, v in batch.items()})
    elif isinstance(batch, (tuple, list)):
        return type(batch)(to_device(v, device) for v in batch)
    elif isinstance(batch, torch.Tensor):
        return batch.to(device, non_blocking=non_blocking)
    else:
        return batch
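to_device recurses through mappings, tuples and lists, moves tensors to the target device, and passes any other leaf (strings, ints, None) through unchanged, so a whole collated batch can be shipped in one call:

import torch
from modelscope.utils.data_utils import to_device

batch = {
    'input': torch.rand(2, 3),
    'meta': {'paths': ['a.jpg', 'b.jpg']},              # non-tensor leaves pass through
    'targets': (torch.tensor([0, 1]), 'extra-info'),
}
batch = to_device(batch, torch.device('cuda' if torch.cuda.is_available() else 'cpu'))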
@@ -24,65 +24,3 @@ def torch_nested_detach(tensors):
    if isinstance(tensors, torch.Tensor):
        return tensors.detach()
    return tensors


def torch_default_data_collator(features):
    # TODO @jiangnana.jnn refine this default data collator
    import torch
    first = features[0]

    if isinstance(first, Mapping):
        batch = {}
        # Special handling for labels.
        # Ensure that tensor is created with the correct type
        # (it should be automatically the case, but let's make sure of it.)
        if 'label' in first and first['label'] is not None:
            label = first['label'].item() if isinstance(
                first['label'], torch.Tensor) else first['label']
            # the msdataset return a 0-dimension np.array with a single value, the following part handle this.
            if isinstance(label, np.ndarray):
                src_dtype = label[()].dtype
                dtype = torch.long if label[(
                )].dtype == np.int64 else torch.float
            else:
                src_dtype = type(label)
                dtype = torch.long if isinstance(label, int) else torch.float
            # add dtype to np.array to fix "TypeError: can't convert np.ndarray of type numpy.object_"
            batch['labels'] = torch.tensor(
                np.array([f['label'] for f in features], dtype=src_dtype),
                dtype=dtype)
        elif 'label_ids' in first and first['label_ids'] is not None:
            if isinstance(first['label_ids'], torch.Tensor):
                batch['labels'] = torch.stack(
                    [f['label_ids'] for f in features])
            else:
                dtype = torch.long if type(
                    first['label_ids'][0]) is int else torch.float
                batch['labels'] = torch.tensor(
                    [f['label_ids'] for f in features], dtype=dtype)

        # Handling of all other possible keys.
        # Again, we will use the first element to figure out which key/values are not None for this model.
        for k, v in first.items():
            if k not in ('label', 'label_ids'
                         ) and v is not None and not isinstance(v, str):
                if isinstance(v, torch.Tensor):
                    batch[k] = torch.stack([f[k] for f in features])
                elif isinstance(v, list) and isinstance(v[0], torch.Tensor):
                    batch[k] = torch.stack([d for f in features for d in f[k]])
                else:
                    batch[k] = torch.tensor(np.array([f[k] for f in features]))
    elif isinstance(first, tuple):
        batch = []
        for idx in range(len(first)):
            if isinstance(first[idx], torch.Tensor):
                batch.append(torch.stack([f[idx] for f in features]))
            else:
                batch.append(torch.tensor([f[idx] for f in features]))
    else:
        if isinstance(first, torch.Tensor):
            batch = torch.stack(features)
        else:
            batch = torch.tensor(features)

    return batch
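The collator deleted above used to rename a 'label' field to 'labels' while batching; plain default_collate keeps keys exactly as they are, which is why the dummy test datasets below switch to a 'labels' key and the NLP preprocessor now writes OutputKeys.LABELS. A quick comparison sketch:

import torch
from torch.utils.data.dataloader import default_collate

features = [{'feat': torch.rand(2), 'labels': torch.tensor([1])},
            {'feat': torch.rand(2), 'labels': torch.tensor([0])}]
batch = default_collate(features)
# keys are preserved verbatim: {'feat': (2, 2) tensor, 'labels': (2, 1) tensor}
assert set(batch) == {'feat', 'labels'}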
@@ -50,7 +50,7 @@ def set_test_level(level: int):

def create_dummy_test_dataset(feat, label, num):
    return MsDataset.from_hf_dataset(
        Dataset.from_dict(dict(feat=[feat] * num, label=[label] * num)))
        Dataset.from_dict(dict(feat=[feat] * num, labels=[label] * num)))


def download_and_untar(fpath, furl, dst) -> str:
@@ -2,7 +2,10 @@

import unittest

from modelscope.preprocessors import PREPROCESSORS, Compose, Preprocessor
import torch

from modelscope.preprocessors import (PREPROCESSORS, Compose, Filter,
                                      Preprocessor, ToTensor)


class ComposeTest(unittest.TestCase):
@@ -35,5 +38,27 @@ class ComposeTest(unittest.TestCase):
        self.assertEqual(output['tmp2'], 'tmp2')


class ToTensorTest(unittest.TestCase):

    def test_totensor(self):
        to_tensor_op = ToTensor(keys=['img'])
        inputs = {'img': [1, 2, 3], 'label': 1, 'path': 'test.jpg'}
        inputs = to_tensor_op(inputs)
        self.assertIsInstance(inputs['img'], torch.Tensor)
        self.assertEqual(inputs['label'], 1)
        self.assertEqual(inputs['path'], 'test.jpg')


class FilterTest(unittest.TestCase):

    def test_filter(self):
        filter_op = Filter(reserved_keys=['img', 'label'])
        inputs = {'img': [1, 2, 3], 'label': 1, 'path': 'test.jpg'}
        inputs = filter_op(inputs)
        self.assertIn('img', inputs)
        self.assertIn('label', inputs)
        self.assertNotIn('path', inputs)


if __name__ == '__main__':
    unittest.main()
@@ -12,7 +12,7 @@ from torch import nn
from modelscope.metainfo import Trainers
from modelscope.metrics.builder import METRICS, MetricKeys
from modelscope.trainers import build_trainer
from modelscope.utils.constant import LogKeys, ModelFile
from modelscope.utils.constant import ModelFile
from modelscope.utils.registry import default_group
from modelscope.utils.test_utils import create_dummy_test_dataset


@@ -9,7 +9,7 @@ import numpy as np
import torch
from torch import nn
from torch.optim import SGD
from torch.optim.lr_scheduler import MultiStepLR, ReduceLROnPlateau
from torch.optim.lr_scheduler import MultiStepLR

from modelscope.metainfo import Trainers
from modelscope.metrics.builder import METRICS, MetricKeys
@@ -96,7 +96,8 @@ class LrSchedulerHookTest(unittest.TestCase):
            model=model,
            train_dataset=dummy_dataset,
            optimizers=(optimizer, lr_scheduler),
            max_epochs=5)
            max_epochs=5,
            device='cpu')

        trainer = build_trainer(trainer_name, kwargs)
        train_dataloader = trainer._build_dataloader_with_dataset(
@@ -160,15 +161,13 @@ class LrSchedulerHookTest(unittest.TestCase):
            json.dump(json_cfg, f)

        model = DummyModel()
        # optimmizer = SGD(model.parameters(), lr=0.01)
        # lr_scheduler = MultiStepLR(optimmizer, milestones=[2, 4])
        trainer_name = Trainers.default
        kwargs = dict(
            cfg_file=config_path,
            model=model,
            train_dataset=dummy_dataset,
            # optimizers=(optimmizer, lr_scheduler),
            max_epochs=7)
            max_epochs=7,
            device='cpu')

        trainer = build_trainer(trainer_name, kwargs)
        train_dataloader = trainer._build_dataloader_with_dataset(
@@ -266,7 +265,8 @@ class PlateauLrSchedulerHookTest(unittest.TestCase):
            train_dataset=dummy_dataset,
            eval_dataset=dummy_dataset,
            optimizers=(optimizer, None),
            max_epochs=5)
            max_epochs=5,
            device='cpu')

        trainer = build_trainer(trainer_name, kwargs)
        train_dataloader = trainer._build_dataloader_with_dataset(

@@ -17,7 +17,7 @@ from modelscope.utils.constant import ModelFile, TrainerStages
from modelscope.utils.test_utils import create_dummy_test_dataset

dummy_dataset = create_dummy_test_dataset(
    np.random.random(size=(2, 2)), np.random.randint(0, 2, (1, )), 10)
    np.random.random(size=(2, )), np.random.randint(0, 2, (1, )), 10)


class DummyModel(nn.Module):
@@ -71,7 +71,8 @@ class OptimizerHookTest(unittest.TestCase):
            model=model,
            train_dataset=dummy_dataset,
            optimizers=(optimizer, lr_scheduler),
            max_epochs=2)
            max_epochs=2,
            device='cpu')

        trainer = build_trainer(trainer_name, kwargs)
        train_dataloader = trainer._build_dataloader_with_dataset(

@@ -75,7 +75,8 @@ class IterTimerHookTest(unittest.TestCase):
            model=model,
            train_dataset=dummy_dataset,
            optimizers=(optimizer, lr_scheduler),
            max_epochs=5)
            max_epochs=5,
            device='cpu')

        trainer = build_trainer(trainer_name, kwargs)
        train_dataloader = trainer._build_dataloader_with_dataset(
@@ -3,19 +3,16 @@ import os
import shutil
import tempfile
import unittest
from abc import ABCMeta

import json
import numpy as np
import torch
from datasets import Dataset
from torch import nn
from torch.optim import SGD
from torch.optim.lr_scheduler import StepLR

from modelscope.metainfo import Metrics, Trainers
from modelscope.metrics.builder import MetricKeys
from modelscope.msdatasets import MsDataset
from modelscope.trainers import build_trainer
from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile
from modelscope.utils.test_utils import create_dummy_test_dataset, test_level
@@ -116,7 +113,8 @@ class TrainerTest(unittest.TestCase):
            data_collator=None,
            train_dataset=dummy_dataset_small,
            eval_dataset=dummy_dataset_small,
            max_epochs=3)
            max_epochs=3,
            device='cpu')

        trainer = build_trainer(trainer_name, kwargs)
        trainer.train()
@@ -175,7 +173,8 @@ class TrainerTest(unittest.TestCase):
            train_dataset=dummy_dataset_small,
            eval_dataset=dummy_dataset_small,
            optimizers=(optimmizer, lr_scheduler),
            max_epochs=3)
            max_epochs=3,
            device='cpu')

        trainer = build_trainer(trainer_name, kwargs)
        trainer.train()
@@ -225,7 +224,8 @@ class TrainerTest(unittest.TestCase):
            train_dataset=dummy_dataset_big,
            eval_dataset=dummy_dataset_small,
            optimizers=(optimmizer, lr_scheduler),
            max_epochs=3)
            max_epochs=3,
            device='cpu')

        trainer = build_trainer(trainer_name, kwargs)
        trainer.train()

@@ -37,7 +37,8 @@ class TestTrainerWithNlp(unittest.TestCase):
            model=model_id,
            train_dataset=self.dataset,
            eval_dataset=self.dataset,
            work_dir=self.tmp_dir)
            work_dir=self.tmp_dir,
            model_revision='beta')

        trainer = build_trainer(default_args=kwargs)
        trainer.train()
@@ -53,7 +54,8 @@ class TestTrainerWithNlp(unittest.TestCase):
            model=model_id,
            train_dataset=self.dataset,
            eval_dataset=self.dataset,
            work_dir=self.tmp_dir)
            work_dir=self.tmp_dir,
            model_revision='beta')

        trainer = build_trainer(default_args=kwargs)
        trainer.train()
@@ -69,7 +71,7 @@ class TestTrainerWithNlp(unittest.TestCase):
    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_trainer_with_user_defined_config(self):
        model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base'
        cfg = read_config(model_id)
        cfg = read_config(model_id, revision='beta')
        cfg.train.max_epochs = 20
        cfg.train.work_dir = self.tmp_dir
        cfg_file = os.path.join(self.tmp_dir, 'config.json')
@@ -78,7 +80,8 @@ class TestTrainerWithNlp(unittest.TestCase):
            model=model_id,
            train_dataset=self.dataset,
            eval_dataset=self.dataset,
            cfg_file=cfg_file)
            cfg_file=cfg_file,
            model_revision='beta')

        trainer = build_trainer(default_args=kwargs)
        trainer.train()
@@ -98,7 +101,7 @@ class TestTrainerWithNlp(unittest.TestCase):
        os.makedirs(tmp_dir)

        model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
        cache_path = snapshot_download(model_id)
        cache_path = snapshot_download(model_id, revision='beta')
        model = SbertForSequenceClassification.from_pretrained(cache_path)
        kwargs = dict(
            cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
tests/trainers/utils/__init__.py (new file, +0 lines)
tests/trainers/utils/test_inference.py (new file, +116 lines)
@@ -0,0 +1,116 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import shutil
import tempfile
import unittest

import torch
from torch import nn
from torch.utils.data import DataLoader

from modelscope.metrics.builder import MetricKeys
from modelscope.metrics.sequence_classification_metric import \
    SequenceClassificationMetric
from modelscope.trainers.utils.inference import multi_gpu_test, single_gpu_test
from modelscope.utils.test_utils import (DistributedTestCase,
                                         create_dummy_test_dataset, test_level)
from modelscope.utils.torch_utils import get_dist_info, init_dist

dummy_dataset = create_dummy_test_dataset(
    torch.rand((5, )), torch.randint(0, 4, (1, )), 20)


class DummyModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(5, 4)
        self.bn = nn.BatchNorm1d(4)

    def forward(self, feat, labels):
        x = self.linear(feat)

        x = self.bn(x)
        loss = torch.sum(x)
        return dict(logits=x, loss=loss)


def test_func(dist=False):
    dummy_model = DummyModel()
    dataset = dummy_dataset.to_torch_dataset()

    dummy_loader = DataLoader(
        dataset,
        batch_size=2,
    )

    metric_class = SequenceClassificationMetric()

    if dist:
        init_dist(launcher='pytorch')

    rank, world_size = get_dist_info()
    device = torch.device(f'cuda:{rank}')
    dummy_model.cuda()

    if world_size > 1:
        from torch.nn.parallel.distributed import DistributedDataParallel
        dummy_model = DistributedDataParallel(
            dummy_model, device_ids=[torch.cuda.current_device()])
        test_func = multi_gpu_test
    else:
        test_func = single_gpu_test

    metric_results = test_func(
        dummy_model,
        dummy_loader,
        device=device,
        metric_classes=[metric_class])

    return metric_results


@unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest')
class SingleGpuTestTest(unittest.TestCase):

    def setUp(self):
        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
        self.tmp_dir = tempfile.TemporaryDirectory().name
        if not os.path.exists(self.tmp_dir):
            os.makedirs(self.tmp_dir)

    def tearDown(self):
        super().tearDown()
        shutil.rmtree(self.tmp_dir)

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_single_gpu_test(self):
        metric_results = test_func()
        self.assertIn(MetricKeys.ACCURACY, metric_results)


@unittest.skipIf(not torch.cuda.is_available()
                 or torch.cuda.device_count() <= 1, 'distributed unittest')
class MultiGpuTestTest(DistributedTestCase):

    def setUp(self):
        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
        self.tmp_dir = tempfile.TemporaryDirectory().name
        if not os.path.exists(self.tmp_dir):
            os.makedirs(self.tmp_dir)

    def tearDown(self):
        super().tearDown()
        shutil.rmtree(self.tmp_dir)

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_multi_gpu_test(self):
        self.start(
            test_func,
            num_gpus=2,
            assert_callback=lambda x: self.assertIn(MetricKeys.ACCURACY, x),
            dist=True)


if __name__ == '__main__':
    unittest.main()