From bf3a2b6c090ab1a65800cc534a567106f012f1a7 Mon Sep 17 00:00:00 2001 From: "zeyinzi.jzyz" Date: Wed, 8 Mar 2023 16:42:23 +0800 Subject: [PATCH] support vision efficient tuning finetune MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## 查看改动点 ↓↓↓ ### vision efficient tuning finetune - Model模块改造成适配训练的 - Model模块在支持训练同时向下兼容之前发布的modecard - Pipline兼容modelcard加载的preprocessor或直接定义的 - 添加 ImageClassificationPreprocessor (非mmcv版本) - 添加 VisionEfficientTuningTrainer - ~~添加 opencv_transforms==0.0.6~~ (以源代码引入必要) ### Modelcard - test pipeline和trainer合并到一起 - 新增3个模型的test - 新增demo service ### 公共组件 - ms_dataset.py: fix warning, [UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or xxx] - preprocessor添加common:ToNumpy、Rename、Identity - preprocessor common对于dict进行key判断再取值。 - ~~修复learning rate在iter级别变化的逻辑。~~ (本次不做了) - ~~修复非dist状态下train data没有进行shuffle的bug。~~ (Master已有人改了) - 修复训练时调用util中非cv包的异常 zhconv。 ### 其他 - 为防止新引入的preprocessor模块在config中被原代码加载,导致在其他人做CI时会报错;所以暂时没有添加新的tag,等CR完成后,会进行打tag再rerun CI。 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/11762108 * support vision efficient tuning finetune * update test case * update shuffle on IterableDataset * update bitfit & sidetuning * compatible with base trainer --- .../vision_efficient_tuning_test_apple.jpg | 3 + ...vision_efficient_tuning_test_sunflower.jpg | 3 + modelscope/metainfo.py | 2 + .../cv/vision_efficient_tuning/__init__.py | 11 +- .../cv/vision_efficient_tuning/backbone.py | 153 +++-- .../cv/vision_efficient_tuning/model.py | 49 ++ .../models/cv/vision_efficient_tuning/petl.py | 100 ++++ .../vision_efficient_tuning.py | 159 +++-- modelscope/msdatasets/ms_dataset.py | 2 +- .../cv/vision_efficient_tuning_pipeline.py | 60 +- modelscope/preprocessors/common.py | 95 ++- modelscope/preprocessors/cv/__init__.py | 3 + modelscope/preprocessors/cv/cv2_transforms.py | 559 ++++++++++++++++++ 
.../cv/image_classification_preprocessor.py | 340 +++++++++++ modelscope/preprocessors/image.py | 39 +- modelscope/trainers/cv/__init__.py | 2 + .../cv/vision_efficient_tuning_trainer.py | 114 ++++ modelscope/utils/chinese_utils.py | 4 +- modelscope/utils/demo_utils.py | 1 + .../pipelines/test_vision_efficient_tuning.py | 154 +++++ .../test_vision_efficient_tuning_adapter.py | 37 -- .../test_vision_efficient_tuning_lora.py | 36 -- .../test_vision_efficient_tuning_prefix.py | 37 -- .../test_vision_efficient_tuning_prompt.py | 37 -- .../test_finetune_vision_efficient_tuning.py | 355 +++++++++++ 25 files changed, 2096 insertions(+), 259 deletions(-) create mode 100644 data/test/images/vision_efficient_tuning_test_apple.jpg create mode 100644 data/test/images/vision_efficient_tuning_test_sunflower.jpg create mode 100644 modelscope/models/cv/vision_efficient_tuning/model.py create mode 100644 modelscope/preprocessors/cv/cv2_transforms.py create mode 100644 modelscope/preprocessors/cv/image_classification_preprocessor.py create mode 100644 modelscope/trainers/cv/vision_efficient_tuning_trainer.py create mode 100644 tests/pipelines/test_vision_efficient_tuning.py delete mode 100644 tests/pipelines/test_vision_efficient_tuning_adapter.py delete mode 100644 tests/pipelines/test_vision_efficient_tuning_lora.py delete mode 100644 tests/pipelines/test_vision_efficient_tuning_prefix.py delete mode 100644 tests/pipelines/test_vision_efficient_tuning_prompt.py create mode 100644 tests/trainers/test_finetune_vision_efficient_tuning.py diff --git a/data/test/images/vision_efficient_tuning_test_apple.jpg b/data/test/images/vision_efficient_tuning_test_apple.jpg new file mode 100644 index 00000000..7da7fcab --- /dev/null +++ b/data/test/images/vision_efficient_tuning_test_apple.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:407d70db9f01bc7a6f34377e36c3f2f5eefdfca8bd3c578226bf5b31b73325dc +size 127213 diff --git 
a/data/test/images/vision_efficient_tuning_test_sunflower.jpg b/data/test/images/vision_efficient_tuning_test_sunflower.jpg new file mode 100644 index 00000000..7ebf088a --- /dev/null +++ b/data/test/images/vision_efficient_tuning_test_sunflower.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c67733db75dc7fd773561a5091329fd5ee919b2268a3a65718261722607698f +size 226882 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index ba01b2e8..e5a2c4c1 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -808,6 +808,7 @@ class CVTrainers(object): image_classification = 'image-classification' image_fewshot_detection = 'image-fewshot-detection' nerf_recon_acc = 'nerf-recon-acc' + vision_efficient_tuning = 'vision-efficient-tuning' class NLPTrainers(object): @@ -919,6 +920,7 @@ class Preprocessors(object): bad_image_detecting_preprocessor = 'bad-image-detecting-preprocessor' nerf_recon_acc_preprocessor = 'nerf-recon-acc-preprocessor' controllable_image_generation_preprocessor = 'controllable-image-generation-preprocessor' + image_classification_preprocessor = 'image-classification-preprocessor' # nlp preprocessor sen_sim_tokenizer = 'sen-sim-tokenizer' diff --git a/modelscope/models/cv/vision_efficient_tuning/__init__.py b/modelscope/models/cv/vision_efficient_tuning/__init__.py index 05243554..80128f62 100644 --- a/modelscope/models/cv/vision_efficient_tuning/__init__.py +++ b/modelscope/models/cv/vision_efficient_tuning/__init__.py @@ -5,18 +5,11 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .vision_efficient_tuning_adapter import VisionEfficientTuningAdapterModel - from .vision_efficient_tuning_prompt import VisionEfficientTuningPromptModel - from .vision_efficient_tuning_prefix import VisionEfficientTuningPrefixModel - from .vision_efficient_tuning_lora import VisionEfficientTuningLoRAModel + from .model import VisionEfficientTuningModel else: _import_structure = { - 
'vision_efficient_tuning_adapter': - ['VisionEfficientTuningAdapterModel'], - 'vision_efficient_tuning_prompt': ['VisionEfficientTuningPromptModel'], - 'vision_efficient_tuning_prefix': ['VisionEfficientTuningPrefixModel'], - 'vision_efficient_tuning_lora': ['VisionEfficientTuningLoRAModel'], + 'model': ['VisionEfficientTuningModel'], } import sys diff --git a/modelscope/models/cv/vision_efficient_tuning/backbone.py b/modelscope/models/cv/vision_efficient_tuning/backbone.py index e7556ea1..691e4440 100644 --- a/modelscope/models/cv/vision_efficient_tuning/backbone.py +++ b/modelscope/models/cv/vision_efficient_tuning/backbone.py @@ -7,9 +7,10 @@ import torch import torch.nn as nn import torch.nn.functional as F -from .petl import Adapter, LoRA, Prefix, Prompt +from .petl import Adapter, LoRA, Prefix, Prompt, SideTune from .timm_vision_transformer import (Attention, Block, DropPath, LayerScale, - Mlp, PatchEmbed, VisionTransformer) + Mlp, PatchEmbed, VisionTransformer, + checkpoint_seq) class AttentionPETL(nn.Module): @@ -212,40 +213,74 @@ class VisionTransformerPETL(VisionTransformer): The implementation of several tuning methods (prompt, prefix, adapter, and LoRA) based on ViT. 
""" - def __init__( - self, - img_size=224, - patch_size=16, - in_chans=3, - num_classes=1000, - global_pool='token', - embed_dim=768, - depth=12, - num_heads=12, - mlp_ratio=4., - qkv_bias=True, - init_values=None, - class_token=True, - no_embed_class=False, - pre_norm=False, - fc_norm=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., - weight_init='', - embed_layer=PatchEmbed, - norm_layer=None, - act_layer=None, - block_fn=Block, - prompt_length=None, - prompt_type=None, - prefix_length=None, - prefix_type=None, - adapter_length=None, - adapter_type=None, - lora_length=None, - lora_type=None, - ): + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + num_classes=1000, + global_pool='token', + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4., + qkv_bias=True, + init_values=None, + class_token=True, + no_embed_class=False, + pre_norm=False, + fc_norm=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + weight_init='', + embed_layer=PatchEmbed, + norm_layer=None, + act_layer=None, + block_fn=Block, + prompt_length=None, + prompt_type=None, + prefix_length=None, + prefix_type=None, + adapter_length=None, + adapter_type=None, + lora_length=None, + lora_type=None, + sidetune_length=None, + sidetune_type=None): + """ Initialize a Parameter-efficient Transfer Learning Method based on Vision Transformer. 
+ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + num_classes (int): number of classes for classification head + global_pool (str): type of global pooling for final sequence (default: 'token') + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + init_values: (float): layer-scale init values + class_token (bool): use class token + fc_norm (Optional[bool]): pre-fc norm after pool, set if global_pool == 'avg' if None (default: None) + drop_rate (float): dropout rate + attn_drop_rate (float): attention dropout rate + drop_path_rate (float): stochastic depth rate + weight_init (str): weight init scheme + embed_layer (nn.Module): patch embedding layer + norm_layer: (nn.Module): normalization layer + act_layer: (nn.Module): MLP activation layer + prompt_length: An integer indicating the length of prompt tuning. + prompt_type: A string indicating the type of prompt tuning. + prefix_length: An integer indicating the length of prefix tuning. + prefix_type: A string indicating the type of prefix tuning. + adapter_length: An integer indicating the length of adapter tuning. + adapter_type: A string indicating the type of adapter tuning. + lora_length: An integer indicating the length of LoRA tuning. + lora_type: A string indicating the type of LoRA tuning. + sidetune_length: An integer indicating the linear dimension. + sidetune_type: A string indicating the type of side network. 
+ """ super().__init__() assert global_pool in ('', 'avg', 'token') @@ -349,3 +384,49 @@ class VisionTransformerPETL(VisionTransformer): if weight_init != 'skip': self.init_weights(weight_init) + + if sidetune_type is not None: + self.sidetune = SideTune(sidetune_length, sidetune_type) + else: + self.sidetune = None + + def forward_features(self, x): + """ feature forward function of VisionTransformer. + + Args: + x (Tensor): the input data. + Returns: + res (Dict): the output data, contains: + - inputs: the original input. + - x: the intermediate feature. + """ + res = dict(inputs=x) + x = self.patch_embed(x) + x = self._pos_embed(x) + x = self.norm_pre(x) + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.blocks, x) + else: + x = self.blocks(x) + x = self.norm(x) + res['x'] = x + return res + + def forward_head(self, res, pre_logits: bool = False): + """ head forward function of VisionTransformer. + + Args: + res (Dict): the input data, contains: + - inputs: the original input. + - x: the intermediate feature. + Returns: + x (Tensor): the output data. + """ + x = res['x'] + if self.global_pool: + x = x[:, self.num_prefix_tokens:].mean( + dim=1) if self.global_pool == 'avg' else x[:, 0] + if self.sidetune and 'inputs' in res: + x = self.sidetune(res['inputs'], x) + x = self.fc_norm(x) + return x if pre_logits else self.head(x) diff --git a/modelscope/models/cv/vision_efficient_tuning/model.py b/modelscope/models/cv/vision_efficient_tuning/model.py new file mode 100644 index 00000000..49b50272 --- /dev/null +++ b/modelscope/models/cv/vision_efficient_tuning/model.py @@ -0,0 +1,49 @@ +# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
+from typing import Any, Dict + +import torch + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.constant import Tasks +from .vision_efficient_tuning import VisionEfficientTuning + + +@MODELS.register_module( + Tasks.vision_efficient_tuning, module_name=Models.vision_efficient_tuning) +class VisionEfficientTuningModel(TorchModel): + """ The implementation of vision efficient tuning model based on TorchModel. + + This model is constructed with the following parts: + - 'backbone': pre-trained backbone model with parameters. + - 'head': classification head with fine-tuning. + """ + + def __init__(self, model_dir: str, **kwargs): + """ Initialize a vision efficient tuning model. + + Args: + model_dir: model id or path, where model_dir/pytorch_model.pt contains: + - 'backbone_weight': parameters of backbone. + - 'head_weight': parameters of head. + """ + super().__init__(model_dir) + + self.model = VisionEfficientTuning(model_dir=model_dir, **kwargs) + self.CLASSES = self.model.CLASSES + + self.device = torch.device( + 'cuda' if torch.cuda.is_available() else 'cpu') + self.model.to(self.device) + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + """ Dynamic forward function of vision efficient tuning model. + + Args: + input: the input data dict contanis: + - imgs: (B, 3, H, W). + - labels: (B), when training stage. + """ + output = self.model(**input) + return output diff --git a/modelscope/models/cv/vision_efficient_tuning/petl.py b/modelscope/models/cv/vision_efficient_tuning/petl.py index f43ba10b..b92112b6 100644 --- a/modelscope/models/cv/vision_efficient_tuning/petl.py +++ b/modelscope/models/cv/vision_efficient_tuning/petl.py @@ -1,8 +1,10 @@ # Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
import math +from collections import OrderedDict import torch import torch.nn as nn +import torchvision class Prompt(nn.Module): @@ -172,3 +174,101 @@ class Prefix(nn.Module): k, v = torch.cat((k, prefix_key), dim=2), torch.cat((v, prefix_value), dim=2) return q, k, v + + +class SideTune(nn.Module): + """The implementation of vision side-tuning method. + + Side-Tuning only needs to train one side network and + weights the output of pre-trained model and side network. + 'Side-Tuning: A Baseline for Network Adaptation via Additive Side Networks' + by Zhang et al.(2019) + See https://arxiv.org/abs/1912.13503 + + Attributes: + sidetune_length: An integer indicating the linear dimension. + sidetune_type: A string indicating the type of side network. + """ + + def __init__(self, sidetune_length=None, sidetune_type=None): + super(SideTune, self).__init__() + self.sidetune_length = sidetune_length + self.sidetune_type = sidetune_type + if sidetune_type.lower() == 'fcn4': + self.side = FCN4(out_dims=self.sidetune_length) + if sidetune_type.lower() == 'alexnet': + mm = torchvision.models.alexnet(pretrained=True) + self.side = nn.Sequential( + OrderedDict([ + ('features', mm.features), ('avgpool', mm.avgpool), + ('flatten', nn.Flatten()), + ('fc', nn.Linear(9216, self.sidetune_length, bias=False)) + ])) + self.alpha = nn.Parameter(torch.tensor(0.0)) + + def forward(self, x, x_base): + alpha_squashed = torch.sigmoid(self.alpha) + x_side = self.side(x) + x_out = alpha_squashed * x_base + (1 - alpha_squashed) * x_side + return x_out + + +class FCN4(nn.Module): + """The implementation of simple FCN4 network for side network. 
+ """ + + def __init__(self, out_dims=-1, **kwargs): + super(FCN4, self).__init__(**kwargs) + + self.conv1 = nn.Sequential( + nn.Conv2d( + 3, + 16, + kernel_size=3, + stride=1, + padding=1, + bias=False, + dilation=1), nn.GroupNorm(2, 16), nn.ReLU()) + self.conv2 = nn.Sequential( + nn.Conv2d( + 16, + 16, + kernel_size=3, + stride=2, + padding=0, + bias=False, + dilation=1), nn.GroupNorm(2, 16), nn.ReLU()) + self.conv3 = nn.Sequential( + nn.Conv2d( + 16, + 32, + kernel_size=3, + stride=2, + padding=0, + bias=False, + dilation=1), nn.GroupNorm(2, 32), nn.ReLU()) + self.conv4 = nn.Sequential( + nn.Conv2d( + 32, + 64, + kernel_size=3, + stride=1, + padding=0, + bias=False, + dilation=1), nn.GroupNorm(2, 64), nn.ReLU()) + self.pool = nn.AdaptiveAvgPool2d((1, 1)) + if out_dims > 0: + self.fc = nn.Linear(64, out_dims) + else: + self.fc = None + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + x = self.conv3(x) + x = self.conv4(x) + x = self.pool(x) + x = x.view(x.size(0), -1) + if self.fc is not None: + x = self.fc(x) + return x diff --git a/modelscope/models/cv/vision_efficient_tuning/vision_efficient_tuning.py b/modelscope/models/cv/vision_efficient_tuning/vision_efficient_tuning.py index 629e7fac..03d1ae14 100644 --- a/modelscope/models/cv/vision_efficient_tuning/vision_efficient_tuning.py +++ b/modelscope/models/cv/vision_efficient_tuning/vision_efficient_tuning.py @@ -1,65 +1,154 @@ # Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
import os +from collections import OrderedDict import torch +import torch.nn as nn +import torch.nn.functional as F -from modelscope.metainfo import Models -from modelscope.models.base.base_torch_model import TorchModel -from modelscope.models.builder import MODELS -from modelscope.utils.constant import ModelFile, Tasks +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import ModelFile -@MODELS.register_module( - Tasks.vision_efficient_tuning, module_name=Models.vision_efficient_tuning) -class VisionEfficientTuningModel(TorchModel): +class VisionEfficientTuning(nn.Module): """ The implementation of vision efficient tuning. This model is constructed with the following parts: - 'backbone': pre-trained backbone model with parameters. - 'head': classification head with fine-tuning. + - 'loss': loss function for training. """ - def __init__(self, model_dir: str, **kwargs): + def __init__(self, + backbone=None, + head=None, + loss=None, + pretrained=True, + finetune=False, + **kwargs): """ Initialize a vision efficient tuning model. Args: - model_dir: model id or path, where model_dir/pytorch_model.pt contains: - - 'backbone_cfg': config of backbone. - - 'backbone_weight': parameters of backbone. - - 'head_cfg': config of head. - - 'head_weight': parameters of head. - - 'CLASSES': list of label name. + backbone: config of backbone. + head: config of head. + loss: config of loss. + pretrained: whether to load the pretrained model. + finetune: whether to finetune the model. 
""" - from .backbone import VisionTransformerPETL from .head import ClassifierHead - super().__init__(model_dir) - model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) - model_dict = torch.load(model_path) + super(VisionEfficientTuning, self).__init__() - backbone_cfg = model_dict['backbone_cfg'] - if 'type' in backbone_cfg: - backbone_cfg.pop('type') - self.backbone_model = VisionTransformerPETL(**backbone_cfg) - self.backbone_model.load_state_dict( - model_dict['backbone_weight'], strict=True) + if backbone and 'type' in backbone: + backbone.pop('type') + self.backbone = VisionTransformerPETL(**backbone) + else: + self.backbone = None - head_cfg = model_dict['head_cfg'] - if 'type' in head_cfg: - head_cfg.pop('type') - self.head_model = ClassifierHead(**head_cfg) - self.head_model.load_state_dict(model_dict['head_weight'], strict=True) + # TODO Use a more elegant method to build the model. + if head and 'type' in head: + head.pop('type') + self.head = ClassifierHead(**head) + else: + self.head = None - self.CLASSES = model_dict['CLASSES'] + if loss and 'type' in loss: + self.loss = getattr(torch.nn, loss['type'])() + else: + self.loss = torch.nn.CrossEntropyLoss() - def forward(self, inputs): + self.CLASSES = kwargs.pop('CLASSES', None) + self.pretrained_cfg = kwargs.pop('pretrained_cfg', None) + + if pretrained: + assert 'model_dir' in kwargs, 'pretrained model dir is missing.' 
+ model_path = os.path.join(kwargs['model_dir'], + ModelFile.TORCH_MODEL_FILE) + model_dict = torch.load(model_path, map_location='cpu') + + if self.backbone is None and 'backbone_cfg' in model_dict: + model_dict['backbone_cfg'].pop('type') + self.backbone = VisionTransformerPETL( + **model_dict['backbone_cfg']) + if self.head is None and 'head_cfg' in model_dict: + model_dict['head_cfg'].pop('type') + self.head = ClassifierHead(**model_dict['head_cfg']) + + if 'backbone_weight' in model_dict: + backbone_weight = model_dict['backbone_weight'] + if finetune and self.pretrained_cfg and 'unload_part' in self.pretrained_cfg \ + and 'backbone' in self.pretrained_cfg['unload_part']: + backbone_weight = self.filter_weight( + backbone_weight, + self.pretrained_cfg['unload_part']['backbone']) + self.backbone.load_state_dict(backbone_weight, strict=False) + + if 'head_weight' in model_dict: + head_weight = model_dict['head_weight'] + if finetune and self.pretrained_cfg and 'unload_part' in self.pretrained_cfg \ + and 'head' in self.pretrained_cfg['unload_part']: + head_weight = self.filter_weight( + head_weight, + self.pretrained_cfg['unload_part']['head']) + self.head.load_state_dict(head_weight, strict=False) + + self.CLASSES = model_dict[ + 'CLASSES'] if 'CLASSES' in model_dict else self.CLASSES + + def filter_weight(self, weights, unload_part=[]): + """ Filter parameters that the model does not need to load. + + Args: + weights: the parameters of the model. + unload_part: the config of unloading parameters. + """ + ret_dict = {} + for key, value in weights.items(): + flag = sum([p in key for p in unload_part]) > 0 + if not flag: + ret_dict[key] = value + return ret_dict + + def forward(self, imgs, labels=None, **kwargs): """ Dynamic forward function of vision efficient tuning. Args: - inputs: the input images (B, 3, H, W). + imgs: (B, 3, H, W). + labels: (B), when training stage. 
""" + return self.forward_train(imgs, labels, **kwargs) \ + if self.training else self.forward_test(imgs, labels, **kwargs) - backbone_output = self.backbone_model(inputs) - head_output = self.head_model(backbone_output) - return head_output + def forward_train(self, imgs, labels=None): + """ Dynamic forward function of training stage. + + Args: + imgs: (B, 3, H, W). + labels: (B), when training stage. + """ + output = OrderedDict() + + backbone_output = self.backbone(imgs) + head_output = self.head(backbone_output) + loss = self.loss(head_output, labels) + + output = {OutputKeys.LOSS: loss} + return output + + def forward_test(self, imgs, labels=None): + """ Dynamic forward function of testing stage. + + Args: + imgs: (B, 3, H, W). + labels: (B), when training stage. + """ + output = OrderedDict() + backbone_output = self.backbone(imgs) + head_output = self.head(backbone_output) + + scores = F.softmax(head_output, dim=1) + preds = scores.topk(1, 1, True, True)[-1].squeeze(-1) + + output = {OutputKeys.SCORES: scores, OutputKeys.LABELS: preds} + return output diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index e4948310..f1c40e12 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -314,7 +314,7 @@ class MsDataset: def type_converter(self, x): import torch - if self.to_tensor: + if self.to_tensor and not isinstance(x, torch.Tensor): return torch.tensor(x) else: return x diff --git a/modelscope/pipelines/cv/vision_efficient_tuning_pipeline.py b/modelscope/pipelines/cv/vision_efficient_tuning_pipeline.py index 2e3c45cc..50289168 100644 --- a/modelscope/pipelines/cv/vision_efficient_tuning_pipeline.py +++ b/modelscope/pipelines/cv/vision_efficient_tuning_pipeline.py @@ -10,7 +10,7 @@ from modelscope.metainfo import Pipelines from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Pipeline from modelscope.pipelines.builder import PIPELINES -from 
modelscope.preprocessors import LoadImage +from modelscope.preprocessors import LoadImage, Preprocessor from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger @@ -40,25 +40,55 @@ class VisionEfficientTuningPipeline(Pipeline): self.device = 'cuda' if torch.cuda.is_available() else 'cpu' self.model = self.model.to(self.device) self.model.eval() - self.transform = transforms.Compose([ - transforms.Resize(224), - transforms.ToTensor(), - transforms.Normalize( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - ]) - def preprocess(self, input: Input) -> Dict[str, Any]: - img = LoadImage.convert_to_img(input) - data = self.transform(img).unsqueeze(0).to(self.device) - return data + self.preprocessor = Preprocessor.from_pretrained( + self.model.model_dir, **kwargs) - def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + if self.preprocessor is None: + self.preprocessor = transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop((224, 224)), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + ]) + + def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]: + """ Preprocess method build from transforms or Preprocessor """ + in_key = 'img_path:FILE' + other_in_keys = ['image'] + out_key = 'imgs' + if isinstance(self.preprocessor, Preprocessor): + if not isinstance(inputs, dict): + inputs = {in_key: inputs} + elif in_key not in inputs: + for ik in other_in_keys: + if ik in inputs and isinstance(inputs[ik], str): + inputs = {in_key: inputs[ik]} + break + data = self.preprocessor(inputs) + result = {out_key: data[out_key].unsqueeze(0).to(self.device)} + else: + if isinstance(inputs, dict): + for ik in [in_key] + other_in_keys: + if ik in inputs: + inputs = inputs[ik] + break + img = LoadImage.convert_to_img(inputs) + data = self.preprocessor(img) + result = {out_key: data.unsqueeze(0).to(self.device)} + return result + + def forward(self, 
inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: with torch.no_grad(): - results = self.model(input) + results = self.model(inputs) return results - def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - scores = F.softmax(inputs, dim=1).cpu().numpy() + def postprocess(self, inputs: Dict[str, Any], + **post_params) -> Dict[str, Any]: + """ Postprocess for classification """ + scores = inputs[OutputKeys.SCORES].cpu().numpy() pred_scores = np.sort(scores, axis=1)[0][::-1][:5] pred_labels = np.argsort(scores, axis=1)[0][::-1][:5] diff --git a/modelscope/preprocessors/common.py b/modelscope/preprocessors/common.py index aa1db84c..68aaae36 100644 --- a/modelscope/preprocessors/common.py +++ b/modelscope/preprocessors/common.py @@ -7,6 +7,7 @@ from typing import Mapping import numpy as np import torch +from modelscope.utils.registry import default_group from .builder import PREPROCESSORS, build_preprocessor @@ -28,13 +29,14 @@ class Compose(object): for transform in transforms: if isinstance(transform, dict): if self.field_name is None: - transform = build_preprocessor(transform, field_name) + transform = build_preprocessor(transform, default_group) else: # if not found key in field_name, try field_name=None(default_group) try: transform = build_preprocessor(transform, field_name) except KeyError: - transform = build_preprocessor(transform, None) + transform = build_preprocessor(transform, + default_group) elif callable(transform): pass else: @@ -108,7 +110,8 @@ class ToTensor(object): self.keys = list(data.keys()) for key in self.keys: - data[key] = to_tensor(data[key]) + if key in data: + data[key] = to_tensor(data[key]) else: data = to_tensor(data) @@ -135,9 +138,93 @@ class Filter(object): reserved_data = {} for key in self.reserved_keys: - reserved_data[key] = data[key] + if key in data: + reserved_data[key] = data[key] return reserved_data def __repr__(self): return self.__class__.__name__ + f'(keys={self.reserved_keys})' + + +def 
to_numpy(data): + """Convert objects of various python types to `numpy.ndarray`. + + Args: + data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to + be converted. + """ + + if isinstance(data, torch.Tensor): + return data.numpy() + elif isinstance(data, np.ndarray): + return data + elif isinstance(data, Sequence) and not isinstance(data, str): + return np.asarray(data) + elif isinstance(data, int): + return np.asarray(data, dtype=np.int64) + elif isinstance(data, float): + return np.asarray(data, dtype=np.float64) + else: + raise TypeError(f'type {type(data)} cannot be converted to tensor.') + + +@PREPROCESSORS.register_module() +class ToNumpy(object): + """Convert target object to numpy.ndarray. + + Args: + keys (Sequence[str]): Key of data to be converted to numpy.ndarray. + Only valid when data is type of `Mapping`. If `keys` is None, + all values of keys ​​will be converted to numpy.ndarray by default. + """ + + def __init__(self, keys=None): + self.keys = keys + + def __call__(self, data): + if isinstance(data, Mapping): + if self.keys is None: + self.keys = list(data.keys()) + + for key in self.keys: + if key in data: + data[key] = to_numpy(data[key]) + else: + data = to_numpy(data) + + return data + + def __repr__(self): + return self.__class__.__name__ + f'(keys={self.keys})' + + +@PREPROCESSORS.register_module() +class Rename(object): + """Change the name of the input keys to output keys, respectively. 
+ """ + + def __init__(self, input_keys=[], output_keys=[]): + self.input_keys = input_keys + self.output_keys = output_keys + + def __call__(self, data): + if isinstance(data, Mapping): + for in_key, out_key in zip(self.input_keys, self.output_keys): + if in_key in data and out_key not in data: + data[out_key] = data[in_key] + data.pop(in_key) + return data + + def __repr__(self): + return self.__class__.__name__ + f'(keys={self.keys})' + + +@PREPROCESSORS.register_module() +class Identity(object): + + def __init__(self): + pass + + def __call__(self, item): + return item diff --git a/modelscope/preprocessors/cv/__init__.py b/modelscope/preprocessors/cv/__init__.py index b9165a9d..439ae822 100644 --- a/modelscope/preprocessors/cv/__init__.py +++ b/modelscope/preprocessors/cv/__init__.py @@ -12,6 +12,7 @@ if TYPE_CHECKING: from .image_restoration_preprocessor import ImageRestorationPreprocessor from .bad_image_detecting_preprocessor import BadImageDetectingPreprocessor from .controllable_image_generation import ControllableImageGenerationPreprocessor + from .image_classification_preprocessor import ImageClassificationPreprocessor else: _import_structure = { @@ -24,6 +25,8 @@ else: 'bad_image_detecting_preprocessor': ['BadImageDetectingPreprocessor'], 'controllable_image_generation': ['ControllableImageGenerationPreprocessor'], + 'image_classification_preprocessor': + ['ImageClassificationPreprocessor'] } import sys diff --git a/modelscope/preprocessors/cv/cv2_transforms.py b/modelscope/preprocessors/cv/cv2_transforms.py new file mode 100644 index 00000000..cb8b8b1f --- /dev/null +++ b/modelscope/preprocessors/cv/cv2_transforms.py @@ -0,0 +1,559 @@ +# The implementation is adopted from opencv_transforms, +# made publicly available under the MIT license at +# https://github.com/jbohnslav/opencv_transforms/blob/master/opencv_transforms/functional.py +# https://github.com/jbohnslav/opencv_transforms/blob/master/opencv_transforms/transforms.py + +import collections 
+import math +import numbers +import random + +import cv2 +import numpy as np +import torch + +_cv2_pad_to_str = { + 'constant': cv2.BORDER_CONSTANT, + 'edge': cv2.BORDER_REPLICATE, + 'reflect': cv2.BORDER_REFLECT_101, + 'symmetric': cv2.BORDER_REFLECT +} +_cv2_interpolation_to_str = { + 'nearest': cv2.INTER_NEAREST, + 'bilinear': cv2.INTER_LINEAR, + 'area': cv2.INTER_AREA, + 'bicubic': cv2.INTER_CUBIC, + 'lanczos': cv2.INTER_LANCZOS4 +} +_cv2_interpolation_from_str = { + v: k + for k, v in _cv2_interpolation_to_str.items() +} + + +def _is_tensor_image(img): + return torch.is_tensor(img) and img.ndimension() == 3 + + +def _is_numpy_image(img): + return isinstance(img, np.ndarray) and (img.ndim in {2, 3}) + + +def to_tensor(pic): + """Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor. + See ``ToTensor`` for more details. + Args: + pic (PIL Image or numpy.ndarray): Image to be converted to tensor. + Returns: + Tensor: Converted image. + """ + if not (_is_numpy_image(pic)): + raise TypeError('pic should be ndarray. Got {}'.format(type(pic))) + + # handle numpy array + img = torch.from_numpy(pic.transpose((2, 0, 1))) + # backward compatibility + if isinstance(img, torch.ByteTensor) or img.dtype == torch.uint8: + return img.float().div(255) + else: + return img + + +def normalize(tensor, mean, std): + """Normalize a tensor image with mean and standard deviation. + .. note:: + This transform acts in-place, i.e., it mutates the input tensor. + See :class:`~torchvision.transforms.Normalize` for more details. + Args: + tensor (Tensor): Tensor image of size (C, H, W) to be normalized. + mean (sequence): Sequence of means for each channel. + std (sequence): Sequence of standard deviations for each channely. + Returns: + Tensor: Normalized Tensor image. 
+ """ + if not _is_tensor_image(tensor): + raise TypeError('tensor is not a torch image.') + + # This is faster than using broadcasting, don't change without benchmarking + for t, m, s in zip(tensor, mean, std): + t.sub_(m).div_(s) + return tensor + + +def resize(img, size, interpolation=cv2.INTER_LINEAR): + r"""Resize the input numpy ndarray to the given size. + Args: + img (numpy ndarray): Image to be resized. + size (sequence or int): Desired output size. If size is a sequence like + (h, w), the output size will be matched to this. If size is an int, + the smaller edge of the image will be matched to this number maintaing + the aspect ratio. i.e, if height > width, then image will be rescaled to + :math:`\left(\text{size} \times \frac{\text{height}}{\text{width}}, \text{size}\right)` + interpolation (int, optional): Desired interpolation. Default is + ``cv2.INTER_LINEAR`` + Returns: + PIL Image: Resized image. + """ + if not _is_numpy_image(img): + raise TypeError('img should be numpy image. Got {}'.format(type(img))) + if not (isinstance(size, int) or # noqa: W504 + (isinstance(size, collections.abc.Iterable) and len(size) == 2)): + raise TypeError('Got inappropriate size arg: {}'.format(size)) + h, w = img.shape[0], img.shape[1] + + if isinstance(size, int): + if (w <= h and w == size) or (h <= w and h == size): + return img + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + else: + ow, oh = size[1], size[0] + output = cv2.resize(img, dsize=(ow, oh), interpolation=interpolation) + if img.shape[2] == 1: + return output[:, :, np.newaxis] + else: + return output + + +def pad(img, padding, fill=0, padding_mode='constant'): + r"""Pad the given numpy ndarray on all sides with specified padding mode and fill value. + Args: + img (numpy ndarray): image to be padded. + padding (int or tuple): Padding on each border. If a single int is provided this + is used to pad all borders. 
def pad(img, padding, fill=0, padding_mode='constant'):
    r"""Pad the given numpy ndarray on all sides with the given padding mode
    and fill value.

    Args:
        img (numpy ndarray): image to be padded.
        padding (int or tuple): Padding on each border. A single int pads all
            borders; a 2-tuple pads left/right and top/bottom respectively; a
            4-tuple pads the left, top, right and bottom borders respectively.
        fill: Pixel fill value for constant fill. Default is 0. If a tuple of
            length 3, it is used to fill R, G, B channels respectively.
            This value is only used when the padding_mode is constant.
        padding_mode: Type of padding. Should be: constant, edge, reflect or
            symmetric. Default is constant.

    Returns:
        Numpy image: padded image.
    """
    if not _is_numpy_image(img):
        raise TypeError('img should be numpy ndarray. Got {}'.format(
            type(img)))
    if not isinstance(padding, (numbers.Number, tuple, list)):
        raise TypeError('Got inappropriate padding arg')
    if not isinstance(fill, (numbers.Number, str, tuple)):
        raise TypeError('Got inappropriate fill arg')
    if not isinstance(padding_mode, str):
        raise TypeError('Got inappropriate padding_mode arg')
    # Fix: ``collections.Sequence`` was removed in Python 3.10; use the
    # ``collections.abc`` alias (the rest of this module already does so
    # for Iterable).
    if isinstance(padding,
                  collections.abc.Sequence) and len(padding) not in [2, 4]:
        raise ValueError(
            'Padding must be an int or a 2, or 4 element tuple, not a '
            + '{} element tuple'.format(len(padding)))

    assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'], \
        'Padding mode should be either constant, edge, reflect or symmetric'

    if isinstance(padding, int):
        pad_left = pad_right = pad_top = pad_bottom = padding
    if isinstance(padding, collections.abc.Sequence) and len(padding) == 2:
        pad_left = pad_right = padding[0]
        pad_top = pad_bottom = padding[1]
    if isinstance(padding, collections.abc.Sequence) and len(padding) == 4:
        pad_left = padding[0]
        pad_top = padding[1]
        pad_right = padding[2]
        pad_bottom = padding[3]
    # Fix: check ndim before indexing the channel axis -- 2-dim grayscale
    # images are valid input but have no shape[2].
    if img.ndim == 3 and img.shape[2] == 1:
        # cv2.copyMakeBorder drops a singleton channel; restore it
        return cv2.copyMakeBorder(
            img,
            top=pad_top,
            bottom=pad_bottom,
            left=pad_left,
            right=pad_right,
            borderType=_cv2_pad_to_str[padding_mode],
            value=fill)[:, :, np.newaxis]
    else:
        return cv2.copyMakeBorder(
            img,
            top=pad_top,
            bottom=pad_bottom,
            left=pad_left,
            right=pad_right,
            borderType=_cv2_pad_to_str[padding_mode],
            value=fill)
def center_crop(img, output_size):
    """Crop the given numpy ndarray at its center.

    Args:
        img (numpy ndarray): image to be cropped.
        output_size (int or sequence): desired (h, w) of the crop; an int
            yields a square crop.

    Returns:
        numpy ndarray: the centrally cropped image.
    """
    if isinstance(output_size, numbers.Number):
        output_size = (int(output_size), int(output_size))
    th, tw = output_size
    h, w = img.shape[0:2]
    top = int(round((h - th) / 2.))
    left = int(round((w - tw) / 2.))
    # Same slicing as ``crop(img, top, left, th, tw)``, inlined.
    return img[top:top + th, left:left + tw, :]
+ """ + return to_tensor(pic) + + def __repr__(self): + return self.__class__.__name__ + '()' + + +class Normalize(object): + """Normalize a tensor image with mean and standard deviation. + Given mean: ``(M1,...,Mn)`` and std: ``(S1,..,Sn)`` for ``n`` channels, this transform + will normalize each channel of the input ``torch.*Tensor`` i.e. + ``input[channel] = (input[channel] - mean[channel]) / std[channel]`` + .. note:: + This transform acts in-place, i.e., it mutates the input tensor. + Args: + mean (sequence): Sequence of means for each channel. + std (sequence): Sequence of standard deviations for each channel. + """ + + def __init__(self, mean, std): + self.mean = mean + self.std = std + + def __call__(self, tensor): + """ + Args: + tensor (Tensor): Tensor image of size (C, H, W) to be normalized. + Returns: + Tensor: Normalized Tensor image. + """ + return normalize(tensor, self.mean, self.std) + + def __repr__(self): + return self.__class__.__name__ + '(mean={0}, std={1})'.format( + self.mean, self.std) + + +class Resize(object): + """Resize the input numpy ndarray to the given size. + Args: + size (sequence or int): Desired output size. If size is a sequence like + (h, w), output size will be matched to this. If size is an int, + smaller edge of the image will be matched to this number. + i.e, if height > width, then image will be rescaled to + (size * height / width, size) + interpolation (int, optional): Desired interpolation. 
class CenterCrop(object):
    """Crop the given numpy ndarray at the center.

    Args:
        size (sequence or int): desired output size of the crop; an int
            yields a square (size, size) crop.
    """

    def __init__(self, size):
        self.size = (int(size), int(size)) if isinstance(
            size, numbers.Number) else size

    def __call__(self, img):
        """Return the central crop of ``img`` (numpy ndarray)."""
        return center_crop(img, self.size)

    def __repr__(self):
        return '{}(size={})'.format(self.__class__.__name__, self.size)
class RandomCrop(object):
    """Crop the given numpy ndarray at a random location.

    Args:
        size (sequence or int): desired output size of the crop; an int
            yields a square (size, size) crop.
        padding (int or sequence, optional): optional padding on each border
            of the image; ``None`` (default) means no padding. A length-4
            sequence pads left, top, right, bottom; a length-2 sequence pads
            left/right and top/bottom respectively.
        pad_if_needed (boolean): pad images smaller than the desired size
            instead of raising an exception.
        fill: pixel fill value for constant fill (a length-3 tuple fills
            R, G, B channels separately); only used with 'constant' mode.
        padding_mode: 'constant', 'edge', 'reflect' or 'symmetric'.
    """

    def __init__(self,
                 size,
                 padding=None,
                 pad_if_needed=False,
                 fill=0,
                 padding_mode='constant'):
        self.size = (int(size), int(size)) if isinstance(
            size, numbers.Number) else size
        self.padding = padding
        self.pad_if_needed = pad_if_needed
        self.fill = fill
        self.padding_mode = padding_mode

    @staticmethod
    def get_params(img, output_size):
        """Sample a random crop window.

        Args:
            img (numpy ndarray): image to be cropped.
            output_size (tuple): expected (th, tw) of the crop.

        Returns:
            tuple: (i, j, h, w) to be passed to ``crop``.
        """
        h, w = img.shape[0:2]
        th, tw = output_size
        if w == tw and h == th:
            # exact fit: nothing to randomize
            return 0, 0, h, w
        top = random.randint(0, h - th)
        left = random.randint(0, w - tw)
        return top, left, th, tw

    def __call__(self, img):
        """Randomly crop ``img``, applying configured padding first."""
        if self.padding is not None:
            img = pad(img, self.padding, self.fill, self.padding_mode)

        if self.pad_if_needed:
            # widen first, then heighten, when the image is too small
            if img.shape[1] < self.size[1]:
                img = pad(img, (self.size[1] - img.shape[1], 0), self.fill,
                          self.padding_mode)
            if img.shape[0] < self.size[0]:
                img = pad(img, (0, self.size[0] - img.shape[0]), self.fill,
                          self.padding_mode)

        i, j, h, w = self.get_params(img, self.size)
        return crop(img, i, j, h, w)

    def __repr__(self):
        return '{}(size={}, padding={})'.format(self.__class__.__name__,
                                                self.size, self.padding)
class RandomResizedCrop(object):
    """Crop a random-area, random-aspect-ratio patch and resize it.

    A crop of random size (default: 0.08 to 1.0 of the original area) and a
    random aspect ratio (default: 3/4 to 4/3 of the original aspect ratio)
    is taken and resized to the given size. Popularly used to train the
    Inception networks.

    Args:
        size: expected output size of each edge.
        scale: range of the area fraction of the crop.
        ratio: range of the aspect ratio of the crop.
        interpolation: cv2 interpolation flag; default ``cv2.INTER_LINEAR``.
    """

    def __init__(self,
                 size,
                 scale=(0.08, 1.0),
                 ratio=(3. / 4., 4. / 3.),
                 interpolation=cv2.INTER_LINEAR):
        self.size = (size, size)
        self.interpolation = interpolation
        self.scale = scale
        self.ratio = ratio

    @staticmethod
    def get_params(img, scale, ratio):
        """Sample (i, j, h, w) for a random sized crop of ``img``.

        Args:
            img (numpy ndarray): image to be cropped.
            scale (tuple): range of the area fraction of the crop.
            ratio (tuple): range of the aspect ratio of the crop.

        Returns:
            tuple: (i, j, h, w) to be passed to ``crop``.
        """
        # image area is loop-invariant; hoisted out of the sampling loop
        area = img.shape[0] * img.shape[1]
        for _ in range(10):
            target_area = random.uniform(*scale) * area
            aspect_ratio = random.uniform(*ratio)

            w = int(round(math.sqrt(target_area * aspect_ratio)))
            h = int(round(math.sqrt(target_area / aspect_ratio)))

            if random.random() < 0.5:
                w, h = h, w

            if w <= img.shape[1] and h <= img.shape[0]:
                i = random.randint(0, img.shape[0] - h)
                j = random.randint(0, img.shape[1] - w)
                return i, j, h, w

        # Fallback: central square crop of the shorter side
        side = min(img.shape[0], img.shape[1])
        i = (img.shape[0] - side) // 2
        j = (img.shape[1] - side) // 2
        return i, j, side, side

    def __call__(self, img):
        """Randomly crop ``img`` and resize it to ``self.size``."""
        i, j, h, w = self.get_params(img, self.scale, self.ratio)
        return resized_crop(img, i, j, h, w, self.size, self.interpolation)

    def __repr__(self):
        interp = _cv2_interpolation_from_str[self.interpolation]
        return '{}(size={}, scale={}, ratio={}, interpolation={})'.format(
            self.__class__.__name__, self.size,
            tuple(round(s, 4) for s in self.scale),
            tuple(round(r, 4) for r in self.ratio), interp)
+ """ + if random.random() < self.p: + return hflip(img) + return img + + def __repr__(self): + return self.__class__.__name__ + '(p={})'.format(self.p) diff --git a/modelscope/preprocessors/cv/image_classification_preprocessor.py b/modelscope/preprocessors/cv/image_classification_preprocessor.py new file mode 100644 index 00000000..fa98315b --- /dev/null +++ b/modelscope/preprocessors/cv/image_classification_preprocessor.py @@ -0,0 +1,340 @@ +# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved. +# The part implementation is also open-sourced by the authors, +# and available at https://github.com/alibaba/EssentialMC2 +import os +from typing import Any, Dict + +import cv2 +import numpy as np +import torch +import torchvision.transforms as transforms +from PIL import Image +from torchvision.transforms.functional import InterpolationMode + +import modelscope.preprocessors.cv.cv2_transforms as cv2_transforms +from modelscope.fileio import File +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.base import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS, build_preprocessor +from modelscope.utils.constant import Fields, ModeKeys +from modelscope.utils.registry import default_group + +BACKEND_TORCHVISION = 'torchvision' +BACKEND_PILLOW = 'pillow' +BACKEND_CV2 = 'cv2' +BACKENDS = (BACKEND_PILLOW, BACKEND_CV2, BACKEND_TORCHVISION) + +INTERPOLATION_STYLE = { + 'bilinear': InterpolationMode('bilinear'), + 'nearest': InterpolationMode('nearest'), + 'bicubic': InterpolationMode('bicubic'), +} +INTERPOLATION_STYLE_CV2 = { + 'bilinear': cv2.INTER_LINEAR, + 'nearest': cv2.INTER_NEAREST, + 'bicubic': cv2.INTER_CUBIC, +} + + +def is_pil_image(img): + return isinstance(img, Image.Image) + + +def is_cv2_image(img): + return isinstance(img, np.ndarray) and img.dtype == np.uint8 + + +def is_tensor(t): + return isinstance(t, torch.Tensor) + + +class ImageTransform(object): + + def __init__(self, + 
class ImageTransform(object):
    """Base class for dict-based image transforms.

    A transform reads its image from ``item[input_key]`` and writes the
    result to ``item[output_key]`` (both default to 'img'), and processes it
    with either a pillow/torchvision or a cv2 backend.
    """

    def __init__(self,
                 backend=BACKEND_PILLOW,
                 input_key=None,
                 output_key=None):
        # Falsy keys fall back to the conventional 'img' slot.
        self.input_key = input_key or 'img'
        self.output_key = output_key or 'img'
        self.backend = backend

    def check_image_type(self, input_img):
        """Assert that ``input_img`` matches the configured backend."""
        if self.backend == BACKEND_CV2:
            assert is_cv2_image(
                input_img), 'input should be cv2 image(uint8 np.ndarray)'
        elif self.backend == BACKEND_PILLOW:
            assert is_pil_image(input_img), 'input should be PIL Image'
+ """ + + def __init__(self, + size, + padding=None, + pad_if_needed=False, + fill=0, + padding_mode='constant', + **kwargs): + + super(RandomCrop, self).__init__(**kwargs) + assert self.backend in BACKENDS + if self.backend in (BACKEND_PILLOW, BACKEND_TORCHVISION): + self.callable = transforms.RandomCrop( + size, + padding=padding, + pad_if_needed=pad_if_needed, + fill=fill, + padding_mode=padding_mode) + else: + self.callable = cv2_transforms.RandomCrop( + size, + padding=padding, + pad_if_needed=pad_if_needed, + fill=fill, + padding_mode=padding_mode) + + def __call__(self, item): + self.check_image_type(item[self.input_key]) + item[self.output_key] = self.callable(item[self.input_key]) + return item + + +@PREPROCESSORS.register_module(Fields.cv) +class RandomResizedCrop(ImageTransform): + """Crop a random portion of image and resize it to a given size. + + If the image is torch Tensor, it is expected to have [..., H, W] shape. + + Args: + size (int or sequence): Desired output size. + If size is a sequence like (h, w), the output size will be matched to this. + If size is an int, the output size will be matched to (size, size). + scale (tuple of float): Specifies the lower and upper bounds for the random area of the crop, + before resizing. The scale is defined with respect to the area of the original image. + ratio (tuple of float): lower and upper bounds for the random aspect ratio of the crop, before + resizing. + interpolation (str): Desired interpolation string, 'bilinear', 'nearest', 'bicubic' are supported. + """ + + def __init__(self, + size, + scale=(0.08, 1.0), + ratio=(3. / 4., 4. 
@PREPROCESSORS.register_module(Fields.cv)
class RandomResizedCrop(ImageTransform):
    """Crop a random portion of the sample's image and resize it.

    If the image is torch Tensor, it is expected to have [..., H, W] shape.

    Args:
        size (int or sequence): desired output size; a sequence (h, w) is
            used as-is, an int yields a square (size, size) output.
        scale (tuple of float): lower/upper bounds of the random crop area,
            relative to the area of the original image.
        ratio (tuple of float): lower/upper bounds of the random aspect
            ratio of the crop, before resizing.
        interpolation (str): one of 'bilinear', 'nearest', 'bicubic'.
    """

    def __init__(self,
                 size,
                 scale=(0.08, 1.0),
                 ratio=(3. / 4., 4. / 3.),
                 interpolation='bilinear',
                 **kwargs):
        super(RandomResizedCrop, self).__init__(**kwargs)
        assert self.backend in BACKENDS
        self.interpolation = interpolation
        if self.backend == BACKEND_CV2:
            assert interpolation in INTERPOLATION_STYLE_CV2
            self.callable = cv2_transforms.RandomResizedCrop(
                size, scale, ratio, INTERPOLATION_STYLE_CV2[interpolation])
        else:
            assert interpolation in INTERPOLATION_STYLE
            self.callable = transforms.RandomResizedCrop(
                size, scale, ratio, INTERPOLATION_STYLE[interpolation])

    def __call__(self, item):
        """Crop-and-resize ``item[input_key]`` into ``item[output_key]``."""
        self.check_image_type(item[self.input_key])
        item[self.output_key] = self.callable(item[self.input_key])
        return item
+ """ + + def __init__(self, size, interpolation='bilinear', **kwargs): + super(Resize, self).__init__(**kwargs) + assert self.backend in BACKENDS + self.size = size + self.interpolation = interpolation + if self.backend in (BACKEND_PILLOW, BACKEND_TORCHVISION): + assert interpolation in INTERPOLATION_STYLE + else: + assert interpolation in INTERPOLATION_STYLE_CV2 + self.callable = transforms.Resize(size, INTERPOLATION_STYLE[interpolation]) \ + if self.backend in (BACKEND_PILLOW, BACKEND_TORCHVISION) \ + else cv2_transforms.Resize(size, INTERPOLATION_STYLE_CV2[interpolation]) + + def __call__(self, item): + self.check_image_type(item[self.input_key]) + item[self.output_key] = self.callable(item[self.input_key]) + return item + + +@PREPROCESSORS.register_module(Fields.cv) +class CenterCrop(ImageTransform): + """ Crops the given image at the center. + + If the image is torch Tensor, it is expected to have [..., H, W] shape. + + Args: + size (sequence or int): Desired output size. + If size is a sequence like (h, w), the output size will be matched to this. + If size is an int, the output size will be matched to (size, size). + """ + + def __init__(self, size, **kwargs): + super(CenterCrop, self).__init__(**kwargs) + assert self.backend in BACKENDS + self.size = size + self.callable = transforms.CenterCrop(size) \ + if self.backend in (BACKEND_PILLOW, BACKEND_TORCHVISION) else cv2_transforms.CenterCrop(size) + + def __call__(self, item): + self.check_image_type(item[self.input_key]) + item[self.output_key] = self.callable(item[self.input_key]) + return item + + +@PREPROCESSORS.register_module(Fields.cv) +class RandomHorizontalFlip(ImageTransform): + """ Horizontally flip the given image randomly with a given probability. + + If the image is torch Tensor, it is expected to have [..., H, W] shape. + + Args: + p (float): probability of the image being flipped. 
@PREPROCESSORS.register_module(Fields.cv)
class Normalize(ImageTransform):
    """Normalize a tensor image with mean and standard deviation.
    This transform only support tensor image.

    Args:
        mean (sequence): Sequence of means for each channel.
        std (sequence): Sequence of standard deviations for each channel.
    """

    def __init__(self, mean, std, **kwargs):
        super(Normalize, self).__init__(**kwargs)
        assert self.backend in BACKENDS
        self.mean = np.array(mean, dtype=np.float32)
        self.std = np.array(std, dtype=np.float32)
        if self.backend == BACKEND_CV2:
            self.callable = cv2_transforms.Normalize(self.mean, self.std)
        else:
            self.callable = transforms.Normalize(self.mean, self.std)

    def __call__(self, item):
        # No check_image_type here: input must already be a tensor.
        item[self.output_key] = self.callable(item[self.input_key])
        return item
+ """ + + def __init__(self, **kwargs): + super(ImageToTensor, self).__init__(**kwargs) + assert self.backend in BACKENDS + + if self.backend == BACKEND_PILLOW: + self.callable = transforms.ToTensor() + elif self.backend == BACKEND_CV2: + self.callable = cv2_transforms.ToTensor() + else: + self.callable = transforms.ConvertImageDtype(torch.float) + + def __call__(self, item): + item[self.output_key] = self.callable(item[self.input_key]) + return item + + +def build_preprocess_pipeline(pipeline, group_name=Fields.cv): + if isinstance(pipeline, list): + if len(pipeline) == 0: + return build_preprocessor( + dict(type='Identity'), field_name=default_group) + elif len(pipeline) == 1: + return build_preprocess_pipeline(pipeline[0]) + else: + return build_preprocessor( + dict( + type='Compose', transforms=pipeline, + field_name=group_name), + field_name=default_group) + elif isinstance(pipeline, dict): + return build_preprocessor(pipeline, field_name=group_name) + elif pipeline is None: + return build_preprocessor( + dict(type='Identity'), field_name=default_group) + else: + raise TypeError( + f'Expect pipeline_cfg to be dict or list or None, got {type(pipeline)}' + ) + + +@PREPROCESSORS.register_module( + Fields.cv, module_name=Preprocessors.image_classification_preprocessor) +class ImageClassificationPreprocessor(Preprocessor): + + def __init__(self, *args, **kwargs): + """image classification preprocessor in the fine-tune scenario + """ + super().__init__(*args, **kwargs) + + self.training = kwargs.pop('training', True) + self.preprocessor_train_cfg = kwargs.pop('train', None) + self.preprocessor_test_cfg = kwargs.pop('val', None) + + if self.preprocessor_train_cfg is not None: + self.train_preprocess_pipeline = build_preprocess_pipeline( + self.preprocessor_train_cfg) + + if self.preprocessor_test_cfg is not None: + self.test_preprocess_pipeline = build_preprocess_pipeline( + self.preprocessor_test_cfg) + + def __call__(self, results: Dict[str, Any]): + """process the 
raw input data + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + Dict[str, Any] | None: the preprocessed data + """ + if self.mode == ModeKeys.TRAIN: + pipline = self.train_preprocess_pipeline + else: + pipline = self.test_preprocess_pipeline + + return pipline(results) diff --git a/modelscope/preprocessors/image.py b/modelscope/preprocessors/image.py index 666d2b29..36ab2f2f 100644 --- a/modelscope/preprocessors/image.py +++ b/modelscope/preprocessors/image.py @@ -24,10 +24,12 @@ class LoadImage: "scale_factor" (1.0) and "img_norm_cfg" (means=0 and stds=1). Args: mode (str): See :ref:`PIL.Mode`. + backend (str): Type of loading image. Should be: cv2 or pillow. Default is pillow. """ - def __init__(self, mode='rgb'): + def __init__(self, mode='rgb', backend='pillow'): self.mode = mode.upper() + self.backend = backend def __call__(self, input: Union[str, Dict[str, str]]): """Call functions to load image and get image meta information. @@ -42,21 +44,38 @@ class LoadImage: else: image_path_or_url = input - bytes = File.read(image_path_or_url) - # TODO @wenmeng.zwm add opencv decode as optional - # we should also look at the input format which is the most commonly - # used in Mind' image related models - with io.BytesIO(bytes) as infile: - img = Image.open(infile) - img = ImageOps.exif_transpose(img) - img = img.convert(self.mode) + if self.backend == 'cv2': + storage = File._get_storage(image_path_or_url) + with storage.as_local_path(image_path_or_url) as img_path: + img = cv2.imread(img_path, cv2.IMREAD_COLOR) + if self.mode == 'RGB': + cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) + img_h, img_w, img_c = img.shape[0], img.shape[1], img.shape[2] + img_shape = (img_h, img_w, img_c) + elif self.backend == 'pillow': + bytes = File.read(image_path_or_url) + # TODO @wenmeng.zwm add opencv decode as optional + # we should also look at the input format which is the most commonly + # used in Mind' image related models + with io.BytesIO(bytes) as 
infile: + img = Image.open(infile) + img = ImageOps.exif_transpose(img) + img = img.convert(self.mode) + img_shape = (img.size[1], img.size[0], 3) + else: + raise TypeError(f'backend should be either cv2 or pillow,' + f'but got {self.backend}') results = { 'filename': image_path_or_url, 'img': img, - 'img_shape': (img.size[1], img.size[0], 3), + 'img_shape': img_shape, 'img_field': 'img', } + if isinstance(input, dict): + input_ret = input.copy() + input_ret.update(results) + results = input_ret return results def __repr__(self): diff --git a/modelscope/trainers/cv/__init__.py b/modelscope/trainers/cv/__init__.py index c31342ae..07a9440b 100644 --- a/modelscope/trainers/cv/__init__.py +++ b/modelscope/trainers/cv/__init__.py @@ -13,6 +13,7 @@ if TYPE_CHECKING: from .image_defrcn_fewshot_detection_trainer import ImageDefrcnFewshotTrainer from .cartoon_translation_trainer import CartoonTranslationTrainer from .nerf_recon_acc_trainer import NeRFReconAccTrainer + from .vision_efficient_tuning_trainer import VisionEfficientTuningTrainer else: _import_structure = { @@ -28,6 +29,7 @@ else: ['ImageDefrcnFewshotTrainer'], 'cartoon_translation_trainer': ['CartoonTranslationTrainer'], 'nerf_recon_acc_trainer': ['NeRFReconAccTrainer'], + 'vision_efficient_tuning_trainer': ['VisionEfficientTuningTrainer'], } import sys diff --git a/modelscope/trainers/cv/vision_efficient_tuning_trainer.py b/modelscope/trainers/cv/vision_efficient_tuning_trainer.py new file mode 100644 index 00000000..4c7dca73 --- /dev/null +++ b/modelscope/trainers/cv/vision_efficient_tuning_trainer.py @@ -0,0 +1,114 @@ +# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
@TRAINERS.register_module(module_name=Trainers.vision_efficient_tuning)
class VisionEfficientTuningTrainer(EpochBasedTrainer):
    """ Vision Efficient Tuning Trainer based on EpochBasedTrainer

    The trainer freezes the parameters of the pre-trained model and
    tunes the extra parameters of the different parameter-efficient
    transfer learning (PETL) method.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def build_model(self) -> Union[nn.Module, TorchModel]:
        """ Instantiate a pytorch model and return.

        By default, we will create a model using config from configuration
        file. You can override this method in a subclass. The optional
        ``freeze_cfg`` section of the model config is applied before the
        model is returned.
        """
        model = Model.from_pretrained(self.model_dir, cfg_dict=self.cfg)
        if 'freeze_cfg' in self.cfg['model']:
            model = self.freeze(model, **self.cfg['model']['freeze_cfg'])
        if not isinstance(model, nn.Module) and hasattr(model, 'model'):
            # unwrap the inner torch module of a wrapper Model
            return model.model
        # Fix: the original fell through and implicitly returned None when
        # the model was neither an nn.Module nor had a ``.model`` attribute.
        return model

    def train(self, *args, **kwargs):
        # Log the trainable-parameter summary before training starts.
        self.print_model_params_status()
        super().train(*args, **kwargs)

    def evaluate(self, *args, **kwargs):
        metric_values = super().evaluate(*args, **kwargs)
        return metric_values

    def freeze(self, model, freeze_part=None, train_part=None):
        """ Freeze or unfreeze model parameters based on the config.

        Args:
            model: the current model (optionally wrapped in ``.module``).
            freeze_part (dict): maps 'backbone' / 'head' to lists of
                parameter-name fragments whose parameters are frozen.
            train_part (dict): maps 'backbone' / 'head' to lists of
                parameter-name fragments whose parameters stay trainable.
        """
        # Fix: avoid mutable default arguments ([]).
        freeze_part = freeze_part or {}
        train_part = train_part or {}
        freeze_model = model.module if hasattr(model, 'module') else model

        def _set_requires_grad(module, fragments, requires_grad):
            # Toggle every parameter whose name contains any fragment.
            for name, param in module.named_parameters():
                if any(p in name for p in fragments):
                    param.requires_grad = requires_grad

        # Fix: the original used if/elif, so the 'head' section was silently
        # ignored whenever 'backbone' was also configured; both sections are
        # now applied independently (for freeze_part and train_part alike).
        if freeze_part:
            if 'backbone' in freeze_part:
                _set_requires_grad(freeze_model.model.backbone,
                                   freeze_part['backbone'], False)
            if 'head' in freeze_part:
                _set_requires_grad(freeze_model.model.head,
                                   freeze_part['head'], False)

        if train_part:
            if 'backbone' in train_part:
                _set_requires_grad(freeze_model.model.backbone,
                                   train_part['backbone'], True)
            if 'head' in train_part:
                _set_requires_grad(freeze_model.model.head,
                                   train_part['head'], True)
        return model

    def print_model_params_status(self, model=None, logger=None):
        """Print the status and parameters of the model"""
        if model is None:
            model = self.model
        if logger is None:
            logger = self.logger
        train_param_dict = {}
        all_param_numel = 0
        for key, val in model.named_parameters():
            if val.requires_grad:
                # group trainable params by the first two name components
                # below the top-level module
                sub_key = '.'.join(key.split('.', 1)[-1].split('.', 2)[:2])
                train_param_dict[sub_key] = (
                    train_param_dict.get(sub_key, 0) + val.numel())
            all_param_numel += val.numel()
        train_param_numel = sum(train_param_dict.values())
        logger.info(
            f'Load trainable params {train_param_numel} / {all_param_numel} = '
            f'{train_param_numel/all_param_numel:.2%}, '
            f'train part: {train_param_dict}.')
a/modelscope/utils/chinese_utils.py +++ b/modelscope/utils/chinese_utils.py @@ -3,8 +3,6 @@ import re import string -from zhconv import convert - CHINESE_PUNCTUATION = '"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、\u3000、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·!?。。' ENGLISH_PUNCTUATION = string.punctuation @@ -58,6 +56,8 @@ def _is_chinese_char(cp: str) -> bool: def normalize_chinese_number(text): + from zhconv import convert + chinese_number = ['零', '一', '二', '三', '四', '五', '六', '七', '八', '九'] new_text = '' for x in text: diff --git a/modelscope/utils/demo_utils.py b/modelscope/utils/demo_utils.py index 82bf1ada..99e61d45 100644 --- a/modelscope/utils/demo_utils.py +++ b/modelscope/utils/demo_utils.py @@ -30,6 +30,7 @@ TASKS_INPUT_TEMPLATES = { Tasks.ocr_detection: TasksIODescriptions.image_to_text, Tasks.ocr_recognition: TasksIODescriptions.image_to_text, Tasks.body_2d_keypoints: TasksIODescriptions.image_to_text, + Tasks.vision_efficient_tuning: TasksIODescriptions.image_to_text, # nlp tasks Tasks.text_classification: TasksIODescriptions.text_to_text, diff --git a/tests/pipelines/test_vision_efficient_tuning.py b/tests/pipelines/test_vision_efficient_tuning.py new file mode 100644 index 00000000..c88ed478 --- /dev/null +++ b/tests/pipelines/test_vision_efficient_tuning.py @@ -0,0 +1,154 @@ +# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
import unittest

from modelscope.models import Model
from modelscope.models.cv.vision_efficient_tuning.model import \
    VisionEfficientTuningModel
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.demo_utils import DemoCompatibilityCheck
from modelscope.utils.test_utils import test_level


class VisionEfficientTuningTest(unittest.TestCase, DemoCompatibilityCheck):
    """Pipeline, model-loading and demo-compatibility tests for every
    released vision-efficient-tuning model variant.

    The per-variant bodies were identical copy-paste; they are factored
    into three helpers, while every public test method name is kept so
    unittest discovery and test-level filtering are unchanged.
    """

    # Common model-id prefix; each variant appends its PETL method name.
    MODEL_ID_PREFIX = 'damo/cv_vitb16_classification_vision-efficient-tuning'
    IMG_PATH = 'data/test/images/vision_efficient_tuning_test_1.png'

    def setUp(self) -> None:
        self.task = Tasks.vision_efficient_tuning

    def _run_pipeline(self, variant):
        """Run the classification pipeline of `variant` on the test image."""
        petl_pipeline = pipeline(self.task, f'{self.MODEL_ID_PREFIX}-{variant}')
        result = petl_pipeline(self.IMG_PATH)
        print(f'Vision-efficient-tuning-{variant} output: {result}.')

    def _check_from_pretrained(self, variant):
        """Load `variant` via Model.from_pretrained and verify its class."""
        model = Model.from_pretrained(f'{self.MODEL_ID_PREFIX}-{variant}')
        self.assertTrue(model.__class__ == VisionEfficientTuningModel)

    def _check_demo(self, variant):
        """Run the DemoCompatibilityCheck for `variant`."""
        self.model_id = f'{self.MODEL_ID_PREFIX}-{variant}'
        self.compatibility_check()

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_adapter_run_pipeline(self):
        self._run_pipeline('adapter')

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_vision_efficient_tuning_adapter_load_model_from_pretrained(self):
        self._check_from_pretrained('adapter')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_adapter_demo_compatibility(self):
        self._check_demo('adapter')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_lora_run_pipeline(self):
        self._run_pipeline('lora')

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_vision_efficient_tuning_lora_load_model_from_pretrained(self):
        self._check_from_pretrained('lora')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_lora_demo_compatibility(self):
        self._check_demo('lora')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_prefix_run_pipeline(self):
        self._run_pipeline('prefix')

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_vision_efficient_tuning_prefix_load_model_from_pretrained(self):
        self._check_from_pretrained('prefix')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_prefix_demo_compatibility(self):
        self._check_demo('prefix')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_prompt_run_pipeline(self):
        self._run_pipeline('prompt')

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_vision_efficient_tuning_prompt_load_model_from_pretrained(self):
        self._check_from_pretrained('prompt')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_prompt_demo_compatibility(self):
        self._check_demo('prompt')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_bitfit_run_pipeline(self):
        self._run_pipeline('bitfit')

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_vision_efficient_tuning_bitfit_load_model_from_pretrained(self):
        self._check_from_pretrained('bitfit')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_bitfit_demo_compatibility(self):
        self._check_demo('bitfit')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_sidetuning_run_pipeline(self):
        self._run_pipeline('sidetuning')

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_vision_efficient_tuning_sidetuning_load_model_from_pretrained(
            self):
        self._check_from_pretrained('sidetuning')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_sidetuning_demo_compatibility(self):
        self._check_demo('sidetuning')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_utuning_run_pipeline(self):
        self._run_pipeline('utuning')

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_vision_efficient_tuning_utuning_load_model_from_pretrained(self):
        self._check_from_pretrained('utuning')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_utuning_demo_compatibility(self):
        self._check_demo('utuning')


if __name__ == '__main__':
    unittest.main()
/dev/null @@ -1,37 +0,0 @@ -# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved. -import unittest - -from modelscope.models import Model -from modelscope.models.cv.vision_efficient_tuning.vision_efficient_tuning import \ - VisionEfficientTuningModel -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks -from modelscope.utils.demo_utils import DemoCompatibilityCheck -from modelscope.utils.test_utils import test_level - - -class VisionEfficientTuningAdapterTest(unittest.TestCase, - DemoCompatibilityCheck): - - def setUp(self) -> None: - self.task = Tasks.vision_efficient_tuning - self.model_id = 'damo/cv_vitb16_classification_vision-efficient-tuning-adapter' - - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_run_pipeline(self): - - petl_pipeline = pipeline(self.task, self.model_id) - result = petl_pipeline( - 'data/test/images/vision_efficient_tuning_test_1.png') - - print(f'Vision-efficient-tuning-adapter output: {result}.') - - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') - def test_load_model_from_pretrained(self): - model = Model.from_pretrained( - 'damo/cv_vitb16_classification_vision-efficient-tuning-adapter') - self.assertTrue(model.__class__ == VisionEfficientTuningModel) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/pipelines/test_vision_efficient_tuning_lora.py b/tests/pipelines/test_vision_efficient_tuning_lora.py deleted file mode 100644 index 6c49453a..00000000 --- a/tests/pipelines/test_vision_efficient_tuning_lora.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
-import unittest - -from modelscope.models import Model -from modelscope.models.cv.vision_efficient_tuning.vision_efficient_tuning import \ - VisionEfficientTuningModel -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks -from modelscope.utils.demo_utils import DemoCompatibilityCheck -from modelscope.utils.test_utils import test_level - - -class VisionEfficientTuningLoRATest(unittest.TestCase, DemoCompatibilityCheck): - - def setUp(self) -> None: - self.task = Tasks.vision_efficient_tuning - self.model_id = 'damo/cv_vitb16_classification_vision-efficient-tuning-lora' - - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_run_pipeline(self): - - petl_pipeline = pipeline(self.task, self.model_id) - result = petl_pipeline( - 'data/test/images/vision_efficient_tuning_test_1.png') - - print(f'Vision-efficient-tuning-lora output: {result}.') - - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') - def test_load_model_from_pretrained(self): - model = Model.from_pretrained( - 'damo/cv_vitb16_classification_vision-efficient-tuning-lora') - self.assertTrue(model.__class__ == VisionEfficientTuningModel) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/pipelines/test_vision_efficient_tuning_prefix.py b/tests/pipelines/test_vision_efficient_tuning_prefix.py deleted file mode 100644 index 0eca5819..00000000 --- a/tests/pipelines/test_vision_efficient_tuning_prefix.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
-import unittest - -from modelscope.models import Model -from modelscope.models.cv.vision_efficient_tuning.vision_efficient_tuning import \ - VisionEfficientTuningModel -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks -from modelscope.utils.demo_utils import DemoCompatibilityCheck -from modelscope.utils.test_utils import test_level - - -class VisionEfficientTuningPrefixTest(unittest.TestCase, - DemoCompatibilityCheck): - - def setUp(self) -> None: - self.task = Tasks.vision_efficient_tuning - self.model_id = 'damo/cv_vitb16_classification_vision-efficient-tuning-prefix' - - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_run_pipeline(self): - - petl_pipeline = pipeline(self.task, self.model_id) - result = petl_pipeline( - 'data/test/images/vision_efficient_tuning_test_1.png') - - print(f'Vision-efficient-tuning-prefix output: {result}.') - - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') - def test_load_model_from_pretrained(self): - model = Model.from_pretrained( - 'damo/cv_vitb16_classification_vision-efficient-tuning-prefix') - self.assertTrue(model.__class__ == VisionEfficientTuningModel) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/pipelines/test_vision_efficient_tuning_prompt.py b/tests/pipelines/test_vision_efficient_tuning_prompt.py deleted file mode 100644 index 97d97811..00000000 --- a/tests/pipelines/test_vision_efficient_tuning_prompt.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
-import unittest - -from modelscope.models import Model -from modelscope.models.cv.vision_efficient_tuning.vision_efficient_tuning import \ - VisionEfficientTuningModel -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks -from modelscope.utils.demo_utils import DemoCompatibilityCheck -from modelscope.utils.test_utils import test_level - - -class VisionEfficientTuningPromptTest(unittest.TestCase, - DemoCompatibilityCheck): - - def setUp(self) -> None: - self.task = Tasks.vision_efficient_tuning - self.model_id = 'damo/cv_vitb16_classification_vision-efficient-tuning-prompt' - - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_run_pipeline(self): - - petl_pipeline = pipeline(self.task, self.model_id) - result = petl_pipeline( - 'data/test/images/vision_efficient_tuning_test_1.png') - - print(f'Vision-efficient-tuning-prompt output: {result}.') - - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') - def test_load_model_from_pretrained(self): - model = Model.from_pretrained( - 'damo/cv_vitb16_classification_vision-efficient-tuning-prompt') - self.assertTrue(model.__class__ == VisionEfficientTuningModel) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/trainers/test_finetune_vision_efficient_tuning.py b/tests/trainers/test_finetune_vision_efficient_tuning.py new file mode 100644 index 00000000..8719c64f --- /dev/null +++ b/tests/trainers/test_finetune_vision_efficient_tuning.py @@ -0,0 +1,355 @@ +# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
import os
import shutil
import tempfile
import unittest

from modelscope.metainfo import Trainers
from modelscope.msdatasets import MsDataset
from modelscope.trainers import build_trainer
from modelscope.utils.test_utils import test_level


class TestVisionEfficientTuningTrainer(unittest.TestCase):
    """Finetune/eval tests for VisionEfficientTuningTrainer over every
    released PETL variant.

    The per-variant bodies were identical copy-paste; they are factored
    into ``_run_train`` / ``_run_eval`` helpers, while every public test
    method name is kept so unittest discovery is unchanged.
    """

    # Common model-id prefix; each variant appends its PETL method name.
    MODEL_ID_PREFIX = 'damo/cv_vitb16_classification_vision-efficient-tuning'

    def setUp(self):
        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))

        self.train_dataset = MsDataset.load(
            'foundation_model_evaluation_benchmark',
            namespace='damo',
            subset_name='OxfordFlowers',
            split='train')

        self.eval_dataset = MsDataset.load(
            'foundation_model_evaluation_benchmark',
            namespace='damo',
            subset_name='OxfordFlowers',
            split='eval')

        self.max_epochs = 1
        self.num_classes = 102
        self.tune_length = 10

        # Fix: the original used tempfile.TemporaryDirectory().name, which
        # drops the TemporaryDirectory object so its finalizer may remove
        # the directory at any time; mkdtemp creates a directory that
        # persists until tearDown removes it.
        self.tmp_dir = tempfile.mkdtemp()

    def tearDown(self):
        shutil.rmtree(self.tmp_dir, ignore_errors=True)
        super().tearDown()

    def _make_cfg_modify_fn(self, tune_attr=None):
        """Build a cfg_modify_fn setting the shared finetune options and,
        when given, the variant-specific backbone tuning-length attribute
        (e.g. 'adapter_length')."""

        def cfg_modify_fn(cfg):
            cfg.model.head.num_classes = self.num_classes
            cfg.model.finetune = True
            cfg.train.max_epochs = self.max_epochs
            cfg.train.lr_scheduler.T_max = self.max_epochs
            if tune_attr is not None:
                setattr(cfg.model.backbone, tune_attr, self.tune_length)
            return cfg

        return cfg_modify_fn

    def _run_train(self, variant, tune_attr=None):
        """Train + evaluate `variant` and check the expected output files."""
        kwargs = dict(
            model=f'{self.MODEL_ID_PREFIX}-{variant}',
            work_dir=self.tmp_dir,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            cfg_modify_fn=self._make_cfg_modify_fn(tune_attr))

        trainer = build_trainer(
            name=Trainers.vision_efficient_tuning, default_args=kwargs)
        trainer.train()
        result = trainer.evaluate()
        print(f'Vision-efficient-tuning-{variant} train output: {result}.')

        results_files = os.listdir(self.tmp_dir)
        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
        for i in range(self.max_epochs):
            self.assertIn(f'epoch_{i + 1}.pth', results_files)

    def _run_eval(self, variant):
        """Evaluate the pretrained `variant` without any training."""
        kwargs = dict(
            model=f'{self.MODEL_ID_PREFIX}-{variant}',
            work_dir=self.tmp_dir,
            train_dataset=None,
            eval_dataset=self.eval_dataset)

        trainer = build_trainer(
            name=Trainers.vision_efficient_tuning, default_args=kwargs)
        result = trainer.evaluate()
        print(f'Vision-efficient-tuning-{variant} eval output: {result}.')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_adapter_train(self):
        self._run_train('adapter', tune_attr='adapter_length')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_adapter_eval(self):
        self._run_eval('adapter')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_lora_train(self):
        self._run_train('lora', tune_attr='lora_length')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_lora_eval(self):
        self._run_eval('lora')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_prefix_train(self):
        self._run_train('prefix', tune_attr='prefix_length')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_prefix_eval(self):
        self._run_eval('prefix')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_prompt_train(self):
        self._run_train('prompt', tune_attr='prompt_length')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_prompt_eval(self):
        self._run_eval('prompt')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_bitfit_train(self):
        self._run_train('bitfit')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_bitfit_eval(self):
        self._run_eval('bitfit')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_sidetuning_train(self):
        self._run_train('sidetuning')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_sidetuning_eval(self):
        self._run_eval('sidetuning')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_utuning_train(self):
        self._run_train('utuning')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_utuning_eval(self):
        self._run_eval('utuning')


if __name__ == '__main__':
    unittest.main()