From bf3a2b6c090ab1a65800cc534a567106f012f1a7 Mon Sep 17 00:00:00 2001 From: "zeyinzi.jzyz" Date: Wed, 8 Mar 2023 16:42:23 +0800 Subject: [PATCH] support vision efficient tuning finetune MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## 查看改动点 ↓↓↓ ### vision efficient tuning finetune - Model模块改造成适配训练的 - Model模块在支持训练同时向下兼容之前发布的modecard - Pipline兼容modelcard加载的preprocessor或直接定义的 - 添加 ImageClassificationPreprocessor (非mmcv版本) - 添加 VisionEfficientTuningTrainer - ~~添加 opencv_transforms==0.0.6~~ (以源代码引入必要) ### Modelcard - test pipeline和trainer合并到一起 - 新增3个模型的test - 新增demo service ### 公共组件 - ms_dataset.py: fix warning, [UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or xxx] - preprocessor添加common:ToNumpy、Rename、Identity - preprocessor common对于dict进行key判断再取值。 - ~~修复learning rate在iter级别变化的逻辑。~~ (本次不做了) - ~~修复非dist状态下train data没有进行shuffle的bug。~~ (Master已有人改了) - 修复训练时调用util中非cv包的异常 zhconv。 ### 其他 - 为防止新引入的preprocessor模块在config中被原代码加载,导致在其他人做CI时会报错;所以暂时没有添加新的tag,等CR完成后,会进行打tag再rerun CI。 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/11762108 * support vision efficient tuning finetune * update test case * update shuffle on IterableDataset * update bitfit & sidetuning * compatible with base trainer --- .../vision_efficient_tuning_test_apple.jpg | 3 + ...vision_efficient_tuning_test_sunflower.jpg | 3 + modelscope/metainfo.py | 2 + .../cv/vision_efficient_tuning/__init__.py | 11 +- .../cv/vision_efficient_tuning/backbone.py | 153 +++-- .../cv/vision_efficient_tuning/model.py | 49 ++ .../models/cv/vision_efficient_tuning/petl.py | 100 ++++ .../vision_efficient_tuning.py | 159 +++-- modelscope/msdatasets/ms_dataset.py | 2 +- .../cv/vision_efficient_tuning_pipeline.py | 60 +- modelscope/preprocessors/common.py | 95 ++- modelscope/preprocessors/cv/__init__.py | 3 + modelscope/preprocessors/cv/cv2_transforms.py | 559 ++++++++++++++++++ 
.../cv/image_classification_preprocessor.py | 340 +++++++++++ modelscope/preprocessors/image.py | 39 +- modelscope/trainers/cv/__init__.py | 2 + .../cv/vision_efficient_tuning_trainer.py | 114 ++++ modelscope/utils/chinese_utils.py | 4 +- modelscope/utils/demo_utils.py | 1 + .../pipelines/test_vision_efficient_tuning.py | 154 +++++ .../test_vision_efficient_tuning_adapter.py | 37 -- .../test_vision_efficient_tuning_lora.py | 36 -- .../test_vision_efficient_tuning_prefix.py | 37 -- .../test_vision_efficient_tuning_prompt.py | 37 -- .../test_finetune_vision_efficient_tuning.py | 355 +++++++++++ 25 files changed, 2096 insertions(+), 259 deletions(-) create mode 100644 data/test/images/vision_efficient_tuning_test_apple.jpg create mode 100644 data/test/images/vision_efficient_tuning_test_sunflower.jpg create mode 100644 modelscope/models/cv/vision_efficient_tuning/model.py create mode 100644 modelscope/preprocessors/cv/cv2_transforms.py create mode 100644 modelscope/preprocessors/cv/image_classification_preprocessor.py create mode 100644 modelscope/trainers/cv/vision_efficient_tuning_trainer.py create mode 100644 tests/pipelines/test_vision_efficient_tuning.py delete mode 100644 tests/pipelines/test_vision_efficient_tuning_adapter.py delete mode 100644 tests/pipelines/test_vision_efficient_tuning_lora.py delete mode 100644 tests/pipelines/test_vision_efficient_tuning_prefix.py delete mode 100644 tests/pipelines/test_vision_efficient_tuning_prompt.py create mode 100644 tests/trainers/test_finetune_vision_efficient_tuning.py diff --git a/data/test/images/vision_efficient_tuning_test_apple.jpg b/data/test/images/vision_efficient_tuning_test_apple.jpg new file mode 100644 index 00000000..7da7fcab --- /dev/null +++ b/data/test/images/vision_efficient_tuning_test_apple.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:407d70db9f01bc7a6f34377e36c3f2f5eefdfca8bd3c578226bf5b31b73325dc +size 127213 diff --git 
a/data/test/images/vision_efficient_tuning_test_sunflower.jpg b/data/test/images/vision_efficient_tuning_test_sunflower.jpg new file mode 100644 index 00000000..7ebf088a --- /dev/null +++ b/data/test/images/vision_efficient_tuning_test_sunflower.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c67733db75dc7fd773561a5091329fd5ee919b2268a3a65718261722607698f +size 226882 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index ba01b2e8..e5a2c4c1 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -808,6 +808,7 @@ class CVTrainers(object): image_classification = 'image-classification' image_fewshot_detection = 'image-fewshot-detection' nerf_recon_acc = 'nerf-recon-acc' + vision_efficient_tuning = 'vision-efficient-tuning' class NLPTrainers(object): @@ -919,6 +920,7 @@ class Preprocessors(object): bad_image_detecting_preprocessor = 'bad-image-detecting-preprocessor' nerf_recon_acc_preprocessor = 'nerf-recon-acc-preprocessor' controllable_image_generation_preprocessor = 'controllable-image-generation-preprocessor' + image_classification_preprocessor = 'image-classification-preprocessor' # nlp preprocessor sen_sim_tokenizer = 'sen-sim-tokenizer' diff --git a/modelscope/models/cv/vision_efficient_tuning/__init__.py b/modelscope/models/cv/vision_efficient_tuning/__init__.py index 05243554..80128f62 100644 --- a/modelscope/models/cv/vision_efficient_tuning/__init__.py +++ b/modelscope/models/cv/vision_efficient_tuning/__init__.py @@ -5,18 +5,11 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .vision_efficient_tuning_adapter import VisionEfficientTuningAdapterModel - from .vision_efficient_tuning_prompt import VisionEfficientTuningPromptModel - from .vision_efficient_tuning_prefix import VisionEfficientTuningPrefixModel - from .vision_efficient_tuning_lora import VisionEfficientTuningLoRAModel + from .model import VisionEfficientTuningModel else: _import_structure = { - 
'vision_efficient_tuning_adapter': - ['VisionEfficientTuningAdapterModel'], - 'vision_efficient_tuning_prompt': ['VisionEfficientTuningPromptModel'], - 'vision_efficient_tuning_prefix': ['VisionEfficientTuningPrefixModel'], - 'vision_efficient_tuning_lora': ['VisionEfficientTuningLoRAModel'], + 'model': ['VisionEfficientTuningModel'], } import sys diff --git a/modelscope/models/cv/vision_efficient_tuning/backbone.py b/modelscope/models/cv/vision_efficient_tuning/backbone.py index e7556ea1..691e4440 100644 --- a/modelscope/models/cv/vision_efficient_tuning/backbone.py +++ b/modelscope/models/cv/vision_efficient_tuning/backbone.py @@ -7,9 +7,10 @@ import torch import torch.nn as nn import torch.nn.functional as F -from .petl import Adapter, LoRA, Prefix, Prompt +from .petl import Adapter, LoRA, Prefix, Prompt, SideTune from .timm_vision_transformer import (Attention, Block, DropPath, LayerScale, - Mlp, PatchEmbed, VisionTransformer) + Mlp, PatchEmbed, VisionTransformer, + checkpoint_seq) class AttentionPETL(nn.Module): @@ -212,40 +213,74 @@ class VisionTransformerPETL(VisionTransformer): The implementation of several tuning methods (prompt, prefix, adapter, and LoRA) based on ViT. 
""" - def __init__( - self, - img_size=224, - patch_size=16, - in_chans=3, - num_classes=1000, - global_pool='token', - embed_dim=768, - depth=12, - num_heads=12, - mlp_ratio=4., - qkv_bias=True, - init_values=None, - class_token=True, - no_embed_class=False, - pre_norm=False, - fc_norm=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., - weight_init='', - embed_layer=PatchEmbed, - norm_layer=None, - act_layer=None, - block_fn=Block, - prompt_length=None, - prompt_type=None, - prefix_length=None, - prefix_type=None, - adapter_length=None, - adapter_type=None, - lora_length=None, - lora_type=None, - ): + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + num_classes=1000, + global_pool='token', + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4., + qkv_bias=True, + init_values=None, + class_token=True, + no_embed_class=False, + pre_norm=False, + fc_norm=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + weight_init='', + embed_layer=PatchEmbed, + norm_layer=None, + act_layer=None, + block_fn=Block, + prompt_length=None, + prompt_type=None, + prefix_length=None, + prefix_type=None, + adapter_length=None, + adapter_type=None, + lora_length=None, + lora_type=None, + sidetune_length=None, + sidetune_type=None): + """ Initialize a Parameter-efficient Transfer Learning Method based on Vision Transformer. 
+ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + num_classes (int): number of classes for classification head + global_pool (str): type of global pooling for final sequence (default: 'token') + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + init_values: (float): layer-scale init values + class_token (bool): use class token + fc_norm (Optional[bool]): pre-fc norm after pool, set if global_pool == 'avg' if None (default: None) + drop_rate (float): dropout rate + attn_drop_rate (float): attention dropout rate + drop_path_rate (float): stochastic depth rate + weight_init (str): weight init scheme + embed_layer (nn.Module): patch embedding layer + norm_layer: (nn.Module): normalization layer + act_layer: (nn.Module): MLP activation layer + prompt_length: An integer indicating the length of prompt tuning. + prompt_type: A string indicating the type of prompt tuning. + prefix_length: An integer indicating the length of prefix tuning. + prefix_type: A string indicating the type of prefix tuning. + adapter_length: An integer indicating the length of adapter tuning. + adapter_type: A string indicating the type of adapter tuning. + lora_length: An integer indicating the length of LoRA tuning. + lora_type: A string indicating the type of LoRA tuning. + sidetune_length: An integer indicating the linear dimension. + sidetune_type: A string indicating the type of side network. 
+ """ super().__init__() assert global_pool in ('', 'avg', 'token') @@ -349,3 +384,49 @@ class VisionTransformerPETL(VisionTransformer): if weight_init != 'skip': self.init_weights(weight_init) + + if sidetune_type is not None: + self.sidetune = SideTune(sidetune_length, sidetune_type) + else: + self.sidetune = None + + def forward_features(self, x): + """ feature forward function of VisionTransformer. + + Args: + x (Tensor): the input data. + Returns: + res (Dict): the output data, contains: + - inputs: the original input. + - x: the intermediate feature. + """ + res = dict(inputs=x) + x = self.patch_embed(x) + x = self._pos_embed(x) + x = self.norm_pre(x) + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.blocks, x) + else: + x = self.blocks(x) + x = self.norm(x) + res['x'] = x + return res + + def forward_head(self, res, pre_logits: bool = False): + """ head forward function of VisionTransformer. + + Args: + res (Dict): the input data, contains: + - inputs: the original input. + - x: the intermediate feature. + Returns: + x (Tensor): the output data. + """ + x = res['x'] + if self.global_pool: + x = x[:, self.num_prefix_tokens:].mean( + dim=1) if self.global_pool == 'avg' else x[:, 0] + if self.sidetune and 'inputs' in res: + x = self.sidetune(res['inputs'], x) + x = self.fc_norm(x) + return x if pre_logits else self.head(x) diff --git a/modelscope/models/cv/vision_efficient_tuning/model.py b/modelscope/models/cv/vision_efficient_tuning/model.py new file mode 100644 index 00000000..49b50272 --- /dev/null +++ b/modelscope/models/cv/vision_efficient_tuning/model.py @@ -0,0 +1,49 @@ +# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
+from typing import Any, Dict + +import torch + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.constant import Tasks +from .vision_efficient_tuning import VisionEfficientTuning + + +@MODELS.register_module( + Tasks.vision_efficient_tuning, module_name=Models.vision_efficient_tuning) +class VisionEfficientTuningModel(TorchModel): + """ The implementation of vision efficient tuning model based on TorchModel. + + This model is constructed with the following parts: + - 'backbone': pre-trained backbone model with parameters. + - 'head': classification head with fine-tuning. + """ + + def __init__(self, model_dir: str, **kwargs): + """ Initialize a vision efficient tuning model. + + Args: + model_dir: model id or path, where model_dir/pytorch_model.pt contains: + - 'backbone_weight': parameters of backbone. + - 'head_weight': parameters of head. + """ + super().__init__(model_dir) + + self.model = VisionEfficientTuning(model_dir=model_dir, **kwargs) + self.CLASSES = self.model.CLASSES + + self.device = torch.device( + 'cuda' if torch.cuda.is_available() else 'cpu') + self.model.to(self.device) + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + """ Dynamic forward function of vision efficient tuning model. + + Args: + input: the input data dict contanis: + - imgs: (B, 3, H, W). + - labels: (B), when training stage. + """ + output = self.model(**input) + return output diff --git a/modelscope/models/cv/vision_efficient_tuning/petl.py b/modelscope/models/cv/vision_efficient_tuning/petl.py index f43ba10b..b92112b6 100644 --- a/modelscope/models/cv/vision_efficient_tuning/petl.py +++ b/modelscope/models/cv/vision_efficient_tuning/petl.py @@ -1,8 +1,10 @@ # Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
import math +from collections import OrderedDict import torch import torch.nn as nn +import torchvision class Prompt(nn.Module): @@ -172,3 +174,101 @@ class Prefix(nn.Module): k, v = torch.cat((k, prefix_key), dim=2), torch.cat((v, prefix_value), dim=2) return q, k, v + + +class SideTune(nn.Module): + """The implementation of vision side-tuning method. + + Side-Tuning only needs to train one side network and + weights the output of pre-trained model and side network. + 'Side-Tuning: A Baseline for Network Adaptation via Additive Side Networks' + by Zhang et al.(2019) + See https://arxiv.org/abs/1912.13503 + + Attributes: + sidetune_length: An integer indicating the linear dimension. + sidetune_type: A string indicating the type of side network. + """ + + def __init__(self, sidetune_length=None, sidetune_type=None): + super(SideTune, self).__init__() + self.sidetune_length = sidetune_length + self.sidetune_type = sidetune_type + if sidetune_type.lower() == 'fcn4': + self.side = FCN4(out_dims=self.sidetune_length) + if sidetune_type.lower() == 'alexnet': + mm = torchvision.models.alexnet(pretrained=True) + self.side = nn.Sequential( + OrderedDict([ + ('features', mm.features), ('avgpool', mm.avgpool), + ('flatten', nn.Flatten()), + ('fc', nn.Linear(9216, self.sidetune_length, bias=False)) + ])) + self.alpha = nn.Parameter(torch.tensor(0.0)) + + def forward(self, x, x_base): + alpha_squashed = torch.sigmoid(self.alpha) + x_side = self.side(x) + x_out = alpha_squashed * x_base + (1 - alpha_squashed) * x_side + return x_out + + +class FCN4(nn.Module): + """The implementation of simple FCN4 network for side network. 
+ """ + + def __init__(self, out_dims=-1, **kwargs): + super(FCN4, self).__init__(**kwargs) + + self.conv1 = nn.Sequential( + nn.Conv2d( + 3, + 16, + kernel_size=3, + stride=1, + padding=1, + bias=False, + dilation=1), nn.GroupNorm(2, 16), nn.ReLU()) + self.conv2 = nn.Sequential( + nn.Conv2d( + 16, + 16, + kernel_size=3, + stride=2, + padding=0, + bias=False, + dilation=1), nn.GroupNorm(2, 16), nn.ReLU()) + self.conv3 = nn.Sequential( + nn.Conv2d( + 16, + 32, + kernel_size=3, + stride=2, + padding=0, + bias=False, + dilation=1), nn.GroupNorm(2, 32), nn.ReLU()) + self.conv4 = nn.Sequential( + nn.Conv2d( + 32, + 64, + kernel_size=3, + stride=1, + padding=0, + bias=False, + dilation=1), nn.GroupNorm(2, 64), nn.ReLU()) + self.pool = nn.AdaptiveAvgPool2d((1, 1)) + if out_dims > 0: + self.fc = nn.Linear(64, out_dims) + else: + self.fc = None + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + x = self.conv3(x) + x = self.conv4(x) + x = self.pool(x) + x = x.view(x.size(0), -1) + if self.fc is not None: + x = self.fc(x) + return x diff --git a/modelscope/models/cv/vision_efficient_tuning/vision_efficient_tuning.py b/modelscope/models/cv/vision_efficient_tuning/vision_efficient_tuning.py index 629e7fac..03d1ae14 100644 --- a/modelscope/models/cv/vision_efficient_tuning/vision_efficient_tuning.py +++ b/modelscope/models/cv/vision_efficient_tuning/vision_efficient_tuning.py @@ -1,65 +1,154 @@ # Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
import os +from collections import OrderedDict import torch +import torch.nn as nn +import torch.nn.functional as F -from modelscope.metainfo import Models -from modelscope.models.base.base_torch_model import TorchModel -from modelscope.models.builder import MODELS -from modelscope.utils.constant import ModelFile, Tasks +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import ModelFile -@MODELS.register_module( - Tasks.vision_efficient_tuning, module_name=Models.vision_efficient_tuning) -class VisionEfficientTuningModel(TorchModel): +class VisionEfficientTuning(nn.Module): """ The implementation of vision efficient tuning. This model is constructed with the following parts: - 'backbone': pre-trained backbone model with parameters. - 'head': classification head with fine-tuning. + - 'loss': loss function for training. """ - def __init__(self, model_dir: str, **kwargs): + def __init__(self, + backbone=None, + head=None, + loss=None, + pretrained=True, + finetune=False, + **kwargs): """ Initialize a vision efficient tuning model. Args: - model_dir: model id or path, where model_dir/pytorch_model.pt contains: - - 'backbone_cfg': config of backbone. - - 'backbone_weight': parameters of backbone. - - 'head_cfg': config of head. - - 'head_weight': parameters of head. - - 'CLASSES': list of label name. + backbone: config of backbone. + head: config of head. + loss: config of loss. + pretrained: whether to load the pretrained model. + finetune: whether to finetune the model. 
""" - from .backbone import VisionTransformerPETL from .head import ClassifierHead - super().__init__(model_dir) - model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) - model_dict = torch.load(model_path) + super(VisionEfficientTuning, self).__init__() - backbone_cfg = model_dict['backbone_cfg'] - if 'type' in backbone_cfg: - backbone_cfg.pop('type') - self.backbone_model = VisionTransformerPETL(**backbone_cfg) - self.backbone_model.load_state_dict( - model_dict['backbone_weight'], strict=True) + if backbone and 'type' in backbone: + backbone.pop('type') + self.backbone = VisionTransformerPETL(**backbone) + else: + self.backbone = None - head_cfg = model_dict['head_cfg'] - if 'type' in head_cfg: - head_cfg.pop('type') - self.head_model = ClassifierHead(**head_cfg) - self.head_model.load_state_dict(model_dict['head_weight'], strict=True) + # TODO Use a more elegant method to build the model. + if head and 'type' in head: + head.pop('type') + self.head = ClassifierHead(**head) + else: + self.head = None - self.CLASSES = model_dict['CLASSES'] + if loss and 'type' in loss: + self.loss = getattr(torch.nn, loss['type'])() + else: + self.loss = torch.nn.CrossEntropyLoss() - def forward(self, inputs): + self.CLASSES = kwargs.pop('CLASSES', None) + self.pretrained_cfg = kwargs.pop('pretrained_cfg', None) + + if pretrained: + assert 'model_dir' in kwargs, 'pretrained model dir is missing.' 
+ model_path = os.path.join(kwargs['model_dir'], + ModelFile.TORCH_MODEL_FILE) + model_dict = torch.load(model_path, map_location='cpu') + + if self.backbone is None and 'backbone_cfg' in model_dict: + model_dict['backbone_cfg'].pop('type') + self.backbone = VisionTransformerPETL( + **model_dict['backbone_cfg']) + if self.head is None and 'head_cfg' in model_dict: + model_dict['head_cfg'].pop('type') + self.head = ClassifierHead(**model_dict['head_cfg']) + + if 'backbone_weight' in model_dict: + backbone_weight = model_dict['backbone_weight'] + if finetune and self.pretrained_cfg and 'unload_part' in self.pretrained_cfg \ + and 'backbone' in self.pretrained_cfg['unload_part']: + backbone_weight = self.filter_weight( + backbone_weight, + self.pretrained_cfg['unload_part']['backbone']) + self.backbone.load_state_dict(backbone_weight, strict=False) + + if 'head_weight' in model_dict: + head_weight = model_dict['head_weight'] + if finetune and self.pretrained_cfg and 'unload_part' in self.pretrained_cfg \ + and 'head' in self.pretrained_cfg['unload_part']: + head_weight = self.filter_weight( + head_weight, + self.pretrained_cfg['unload_part']['head']) + self.head.load_state_dict(head_weight, strict=False) + + self.CLASSES = model_dict[ + 'CLASSES'] if 'CLASSES' in model_dict else self.CLASSES + + def filter_weight(self, weights, unload_part=[]): + """ Filter parameters that the model does not need to load. + + Args: + weights: the parameters of the model. + unload_part: the config of unloading parameters. + """ + ret_dict = {} + for key, value in weights.items(): + flag = sum([p in key for p in unload_part]) > 0 + if not flag: + ret_dict[key] = value + return ret_dict + + def forward(self, imgs, labels=None, **kwargs): """ Dynamic forward function of vision efficient tuning. Args: - inputs: the input images (B, 3, H, W). + imgs: (B, 3, H, W). + labels: (B), when training stage. 
""" + return self.forward_train(imgs, labels, **kwargs) \ + if self.training else self.forward_test(imgs, labels, **kwargs) - backbone_output = self.backbone_model(inputs) - head_output = self.head_model(backbone_output) - return head_output + def forward_train(self, imgs, labels=None): + """ Dynamic forward function of training stage. + + Args: + imgs: (B, 3, H, W). + labels: (B), when training stage. + """ + output = OrderedDict() + + backbone_output = self.backbone(imgs) + head_output = self.head(backbone_output) + loss = self.loss(head_output, labels) + + output = {OutputKeys.LOSS: loss} + return output + + def forward_test(self, imgs, labels=None): + """ Dynamic forward function of testing stage. + + Args: + imgs: (B, 3, H, W). + labels: (B), when training stage. + """ + output = OrderedDict() + backbone_output = self.backbone(imgs) + head_output = self.head(backbone_output) + + scores = F.softmax(head_output, dim=1) + preds = scores.topk(1, 1, True, True)[-1].squeeze(-1) + + output = {OutputKeys.SCORES: scores, OutputKeys.LABELS: preds} + return output diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index e4948310..f1c40e12 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -314,7 +314,7 @@ class MsDataset: def type_converter(self, x): import torch - if self.to_tensor: + if self.to_tensor and not isinstance(x, torch.Tensor): return torch.tensor(x) else: return x diff --git a/modelscope/pipelines/cv/vision_efficient_tuning_pipeline.py b/modelscope/pipelines/cv/vision_efficient_tuning_pipeline.py index 2e3c45cc..50289168 100644 --- a/modelscope/pipelines/cv/vision_efficient_tuning_pipeline.py +++ b/modelscope/pipelines/cv/vision_efficient_tuning_pipeline.py @@ -10,7 +10,7 @@ from modelscope.metainfo import Pipelines from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Pipeline from modelscope.pipelines.builder import PIPELINES -from 
modelscope.preprocessors import LoadImage +from modelscope.preprocessors import LoadImage, Preprocessor from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger @@ -40,25 +40,55 @@ class VisionEfficientTuningPipeline(Pipeline): self.device = 'cuda' if torch.cuda.is_available() else 'cpu' self.model = self.model.to(self.device) self.model.eval() - self.transform = transforms.Compose([ - transforms.Resize(224), - transforms.ToTensor(), - transforms.Normalize( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - ]) - def preprocess(self, input: Input) -> Dict[str, Any]: - img = LoadImage.convert_to_img(input) - data = self.transform(img).unsqueeze(0).to(self.device) - return data + self.preprocessor = Preprocessor.from_pretrained( + self.model.model_dir, **kwargs) - def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + if self.preprocessor is None: + self.preprocessor = transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop((224, 224)), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + ]) + + def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]: + """ Preprocess method build from transforms or Preprocessor """ + in_key = 'img_path:FILE' + other_in_keys = ['image'] + out_key = 'imgs' + if isinstance(self.preprocessor, Preprocessor): + if not isinstance(inputs, dict): + inputs = {in_key: inputs} + elif in_key not in inputs: + for ik in other_in_keys: + if ik in inputs and isinstance(inputs[ik], str): + inputs = {in_key: inputs[ik]} + break + data = self.preprocessor(inputs) + result = {out_key: data[out_key].unsqueeze(0).to(self.device)} + else: + if isinstance(inputs, dict): + for ik in [in_key] + other_in_keys: + if ik in inputs: + inputs = inputs[ik] + break + img = LoadImage.convert_to_img(inputs) + data = self.preprocessor(img) + result = {out_key: data.unsqueeze(0).to(self.device)} + return result + + def forward(self, 
inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: with torch.no_grad(): - results = self.model(input) + results = self.model(inputs) return results - def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - scores = F.softmax(inputs, dim=1).cpu().numpy() + def postprocess(self, inputs: Dict[str, Any], + **post_params) -> Dict[str, Any]: + """ Postprocess for classification """ + scores = inputs[OutputKeys.SCORES].cpu().numpy() pred_scores = np.sort(scores, axis=1)[0][::-1][:5] pred_labels = np.argsort(scores, axis=1)[0][::-1][:5] diff --git a/modelscope/preprocessors/common.py b/modelscope/preprocessors/common.py index aa1db84c..68aaae36 100644 --- a/modelscope/preprocessors/common.py +++ b/modelscope/preprocessors/common.py @@ -7,6 +7,7 @@ from typing import Mapping import numpy as np import torch +from modelscope.utils.registry import default_group from .builder import PREPROCESSORS, build_preprocessor @@ -28,13 +29,14 @@ class Compose(object): for transform in transforms: if isinstance(transform, dict): if self.field_name is None: - transform = build_preprocessor(transform, field_name) + transform = build_preprocessor(transform, default_group) else: # if not found key in field_name, try field_name=None(default_group) try: transform = build_preprocessor(transform, field_name) except KeyError: - transform = build_preprocessor(transform, None) + transform = build_preprocessor(transform, + default_group) elif callable(transform): pass else: @@ -108,7 +110,8 @@ class ToTensor(object): self.keys = list(data.keys()) for key in self.keys: - data[key] = to_tensor(data[key]) + if key in data: + data[key] = to_tensor(data[key]) else: data = to_tensor(data) @@ -135,9 +138,93 @@ class Filter(object): reserved_data = {} for key in self.reserved_keys: - reserved_data[key] = data[key] + if key in data: + reserved_data[key] = data[key] return reserved_data def __repr__(self): return self.__class__.__name__ + f'(keys={self.reserved_keys})' + + +def 
to_numpy(data): + """Convert objects of various python types to `numpy.ndarray`. + + Args: + data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to + be converted. + """ + + if isinstance(data, torch.Tensor): + return data.numpy() + elif isinstance(data, np.ndarray): + return data + elif isinstance(data, Sequence) and not isinstance(data, str): + return np.asarray(data) + elif isinstance(data, int): + return np.asarray(data, dtype=np.int64) + elif isinstance(data, float): + return np.asarray(data, dtype=np.float64) + else: + raise TypeError(f'type {type(data)} cannot be converted to tensor.') + + +@PREPROCESSORS.register_module() +class ToNumpy(object): + """Convert target object to numpy.ndarray. + + Args: + keys (Sequence[str]): Key of data to be converted to numpy.ndarray. + Only valid when data is type of `Mapping`. If `keys` is None, + all values of keys ​​will be converted to numpy.ndarray by default. + """ + + def __init__(self, keys=None): + self.keys = keys + + def __call__(self, data): + if isinstance(data, Mapping): + if self.keys is None: + self.keys = list(data.keys()) + + for key in self.keys: + if key in data: + data[key] = to_numpy(data[key]) + else: + data = to_numpy(data) + + return data + + def __repr__(self): + return self.__class__.__name__ + f'(keys={self.keys})' + + +@PREPROCESSORS.register_module() +class Rename(object): + """Change the name of the input keys to output keys, respectively. 
+ """ + + def __init__(self, input_keys=[], output_keys=[]): + self.input_keys = input_keys + self.output_keys = output_keys + + def __call__(self, data): + if isinstance(data, Mapping): + for in_key, out_key in zip(self.input_keys, self.output_keys): + if in_key in data and out_key not in data: + data[out_key] = data[in_key] + data.pop(in_key) + return data + + def __repr__(self): + return self.__class__.__name__ + f'(keys={self.keys})' + + +@PREPROCESSORS.register_module() +class Identity(object): + + def __init__(self): + pass + + def __call__(self, item): + return item diff --git a/modelscope/preprocessors/cv/__init__.py b/modelscope/preprocessors/cv/__init__.py index b9165a9d..439ae822 100644 --- a/modelscope/preprocessors/cv/__init__.py +++ b/modelscope/preprocessors/cv/__init__.py @@ -12,6 +12,7 @@ if TYPE_CHECKING: from .image_restoration_preprocessor import ImageRestorationPreprocessor from .bad_image_detecting_preprocessor import BadImageDetectingPreprocessor from .controllable_image_generation import ControllableImageGenerationPreprocessor + from .image_classification_preprocessor import ImageClassificationPreprocessor else: _import_structure = { @@ -24,6 +25,8 @@ else: 'bad_image_detecting_preprocessor': ['BadImageDetectingPreprocessor'], 'controllable_image_generation': ['ControllableImageGenerationPreprocessor'], + 'image_classification_preprocessor': + ['ImageClassificationPreprocessor'] } import sys diff --git a/modelscope/preprocessors/cv/cv2_transforms.py b/modelscope/preprocessors/cv/cv2_transforms.py new file mode 100644 index 00000000..cb8b8b1f --- /dev/null +++ b/modelscope/preprocessors/cv/cv2_transforms.py @@ -0,0 +1,559 @@ +# The implementation is adopted from opencv_transforms, +# made publicly available under the MIT license at +# https://github.com/jbohnslav/opencv_transforms/blob/master/opencv_transforms/functional.py +# https://github.com/jbohnslav/opencv_transforms/blob/master/opencv_transforms/transforms.py + +import collections 
+import math +import numbers +import random + +import cv2 +import numpy as np +import torch + +_cv2_pad_to_str = { + 'constant': cv2.BORDER_CONSTANT, + 'edge': cv2.BORDER_REPLICATE, + 'reflect': cv2.BORDER_REFLECT_101, + 'symmetric': cv2.BORDER_REFLECT +} +_cv2_interpolation_to_str = { + 'nearest': cv2.INTER_NEAREST, + 'bilinear': cv2.INTER_LINEAR, + 'area': cv2.INTER_AREA, + 'bicubic': cv2.INTER_CUBIC, + 'lanczos': cv2.INTER_LANCZOS4 +} +_cv2_interpolation_from_str = { + v: k + for k, v in _cv2_interpolation_to_str.items() +} + + +def _is_tensor_image(img): + return torch.is_tensor(img) and img.ndimension() == 3 + + +def _is_numpy_image(img): + return isinstance(img, np.ndarray) and (img.ndim in {2, 3}) + + +def to_tensor(pic): + """Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor. + See ``ToTensor`` for more details. + Args: + pic (PIL Image or numpy.ndarray): Image to be converted to tensor. + Returns: + Tensor: Converted image. + """ + if not (_is_numpy_image(pic)): + raise TypeError('pic should be ndarray. Got {}'.format(type(pic))) + + # handle numpy array + img = torch.from_numpy(pic.transpose((2, 0, 1))) + # backward compatibility + if isinstance(img, torch.ByteTensor) or img.dtype == torch.uint8: + return img.float().div(255) + else: + return img + + +def normalize(tensor, mean, std): + """Normalize a tensor image with mean and standard deviation. + .. note:: + This transform acts in-place, i.e., it mutates the input tensor. + See :class:`~torchvision.transforms.Normalize` for more details. + Args: + tensor (Tensor): Tensor image of size (C, H, W) to be normalized. + mean (sequence): Sequence of means for each channel. + std (sequence): Sequence of standard deviations for each channely. + Returns: + Tensor: Normalized Tensor image. 
+ """ + if not _is_tensor_image(tensor): + raise TypeError('tensor is not a torch image.') + + # This is faster than using broadcasting, don't change without benchmarking + for t, m, s in zip(tensor, mean, std): + t.sub_(m).div_(s) + return tensor + + +def resize(img, size, interpolation=cv2.INTER_LINEAR): + r"""Resize the input numpy ndarray to the given size. + Args: + img (numpy ndarray): Image to be resized. + size (sequence or int): Desired output size. If size is a sequence like + (h, w), the output size will be matched to this. If size is an int, + the smaller edge of the image will be matched to this number maintaing + the aspect ratio. i.e, if height > width, then image will be rescaled to + :math:`\left(\text{size} \times \frac{\text{height}}{\text{width}}, \text{size}\right)` + interpolation (int, optional): Desired interpolation. Default is + ``cv2.INTER_LINEAR`` + Returns: + PIL Image: Resized image. + """ + if not _is_numpy_image(img): + raise TypeError('img should be numpy image. Got {}'.format(type(img))) + if not (isinstance(size, int) or # noqa: W504 + (isinstance(size, collections.abc.Iterable) and len(size) == 2)): + raise TypeError('Got inappropriate size arg: {}'.format(size)) + h, w = img.shape[0], img.shape[1] + + if isinstance(size, int): + if (w <= h and w == size) or (h <= w and h == size): + return img + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + else: + ow, oh = size[1], size[0] + output = cv2.resize(img, dsize=(ow, oh), interpolation=interpolation) + if img.shape[2] == 1: + return output[:, :, np.newaxis] + else: + return output + + +def pad(img, padding, fill=0, padding_mode='constant'): + r"""Pad the given numpy ndarray on all sides with specified padding mode and fill value. + Args: + img (numpy ndarray): image to be padded. + padding (int or tuple): Padding on each border. If a single int is provided this + is used to pad all borders. 
def pad(img, padding, fill=0, padding_mode='constant'):
    r"""Pad the given numpy ndarray on all sides with the given padding mode
    and fill value.

    Args:
        img (numpy ndarray): image to be padded.
        padding (int or tuple): Padding on each border. A single int pads all
            borders; a 2-tuple pads left/right and top/bottom respectively; a
            4-tuple pads the left, top, right and bottom borders respectively.
        fill: Pixel fill value for constant fill. Default is 0. If a tuple of
            length 3, it is used to fill R, G, B channels respectively.
            This value is only used when the padding_mode is constant.
        padding_mode: Type of padding. Should be: constant, edge, reflect or
            symmetric. Default is constant.

    Returns:
        Numpy image: padded image.
    """
    if not _is_numpy_image(img):
        raise TypeError('img should be numpy ndarray. Got {}'.format(
            type(img)))
    if not isinstance(padding, (numbers.Number, tuple, list)):
        raise TypeError('Got inappropriate padding arg')
    if not isinstance(fill, (numbers.Number, str, tuple)):
        raise TypeError('Got inappropriate fill arg')
    if not isinstance(padding_mode, str):
        raise TypeError('Got inappropriate padding_mode arg')
    # Fix: ``collections.Sequence`` was removed in Python 3.10; use the
    # ``collections.abc`` alias (the rest of this module already does so
    # for Iterable).
    if isinstance(padding,
                  collections.abc.Sequence) and len(padding) not in [2, 4]:
        raise ValueError(
            'Padding must be an int or a 2, or 4 element tuple, not a '
            + '{} element tuple'.format(len(padding)))

    assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'], \
        'Padding mode should be either constant, edge, reflect or symmetric'

    if isinstance(padding, int):
        pad_left = pad_right = pad_top = pad_bottom = padding
    if isinstance(padding, collections.abc.Sequence) and len(padding) == 2:
        pad_left = pad_right = padding[0]
        pad_top = pad_bottom = padding[1]
    if isinstance(padding, collections.abc.Sequence) and len(padding) == 4:
        pad_left = padding[0]
        pad_top = padding[1]
        pad_right = padding[2]
        pad_bottom = padding[3]
    # Fix: check ndim before indexing the channel axis -- 2-dim grayscale
    # images are valid input but have no shape[2].
    if img.ndim == 3 and img.shape[2] == 1:
        # cv2.copyMakeBorder drops a singleton channel; restore it
        return cv2.copyMakeBorder(
            img,
            top=pad_top,
            bottom=pad_bottom,
            left=pad_left,
            right=pad_right,
            borderType=_cv2_pad_to_str[padding_mode],
            value=fill)[:, :, np.newaxis]
    else:
        return cv2.copyMakeBorder(
            img,
            top=pad_top,
            bottom=pad_bottom,
            left=pad_left,
            right=pad_right,
            borderType=_cv2_pad_to_str[padding_mode],
            value=fill)
def center_crop(img, output_size):
    """Crop the given numpy ndarray at its center.

    Args:
        img (numpy ndarray): image to be cropped.
        output_size (int or sequence): desired (h, w) of the crop; an int
            yields a square crop.

    Returns:
        numpy ndarray: the centrally cropped image.
    """
    if isinstance(output_size, numbers.Number):
        output_size = (int(output_size), int(output_size))
    th, tw = output_size
    h, w = img.shape[0:2]
    top = int(round((h - th) / 2.))
    left = int(round((w - tw) / 2.))
    # Same slicing as ``crop(img, top, left, th, tw)``, inlined.
    return img[top:top + th, left:left + tw, :]
+ """ + return to_tensor(pic) + + def __repr__(self): + return self.__class__.__name__ + '()' + + +class Normalize(object): + """Normalize a tensor image with mean and standard deviation. + Given mean: ``(M1,...,Mn)`` and std: ``(S1,..,Sn)`` for ``n`` channels, this transform + will normalize each channel of the input ``torch.*Tensor`` i.e. + ``input[channel] = (input[channel] - mean[channel]) / std[channel]`` + .. note:: + This transform acts in-place, i.e., it mutates the input tensor. + Args: + mean (sequence): Sequence of means for each channel. + std (sequence): Sequence of standard deviations for each channel. + """ + + def __init__(self, mean, std): + self.mean = mean + self.std = std + + def __call__(self, tensor): + """ + Args: + tensor (Tensor): Tensor image of size (C, H, W) to be normalized. + Returns: + Tensor: Normalized Tensor image. + """ + return normalize(tensor, self.mean, self.std) + + def __repr__(self): + return self.__class__.__name__ + '(mean={0}, std={1})'.format( + self.mean, self.std) + + +class Resize(object): + """Resize the input numpy ndarray to the given size. + Args: + size (sequence or int): Desired output size. If size is a sequence like + (h, w), output size will be matched to this. If size is an int, + smaller edge of the image will be matched to this number. + i.e, if height > width, then image will be rescaled to + (size * height / width, size) + interpolation (int, optional): Desired interpolation. 
class CenterCrop(object):
    """Crop the given numpy ndarray at the center.

    Args:
        size (sequence or int): desired output size of the crop; an int
            yields a square (size, size) crop.
    """

    def __init__(self, size):
        self.size = (int(size), int(size)) if isinstance(
            size, numbers.Number) else size

    def __call__(self, img):
        """Return the central crop of ``img`` (numpy ndarray)."""
        return center_crop(img, self.size)

    def __repr__(self):
        return '{}(size={})'.format(self.__class__.__name__, self.size)
class RandomCrop(object):
    """Crop the given numpy ndarray at a random location.

    Args:
        size (sequence or int): desired output size of the crop; an int
            yields a square (size, size) crop.
        padding (int or sequence, optional): optional padding on each border
            of the image; ``None`` (default) means no padding. A length-4
            sequence pads left, top, right, bottom; a length-2 sequence pads
            left/right and top/bottom respectively.
        pad_if_needed (boolean): pad images smaller than the desired size
            instead of raising an exception.
        fill: pixel fill value for constant fill (a length-3 tuple fills
            R, G, B channels separately); only used with 'constant' mode.
        padding_mode: 'constant', 'edge', 'reflect' or 'symmetric'.
    """

    def __init__(self,
                 size,
                 padding=None,
                 pad_if_needed=False,
                 fill=0,
                 padding_mode='constant'):
        self.size = (int(size), int(size)) if isinstance(
            size, numbers.Number) else size
        self.padding = padding
        self.pad_if_needed = pad_if_needed
        self.fill = fill
        self.padding_mode = padding_mode

    @staticmethod
    def get_params(img, output_size):
        """Sample a random crop window.

        Args:
            img (numpy ndarray): image to be cropped.
            output_size (tuple): expected (th, tw) of the crop.

        Returns:
            tuple: (i, j, h, w) to be passed to ``crop``.
        """
        h, w = img.shape[0:2]
        th, tw = output_size
        if w == tw and h == th:
            # exact fit: nothing to randomize
            return 0, 0, h, w
        top = random.randint(0, h - th)
        left = random.randint(0, w - tw)
        return top, left, th, tw

    def __call__(self, img):
        """Randomly crop ``img``, applying configured padding first."""
        if self.padding is not None:
            img = pad(img, self.padding, self.fill, self.padding_mode)

        if self.pad_if_needed:
            # widen first, then heighten, when the image is too small
            if img.shape[1] < self.size[1]:
                img = pad(img, (self.size[1] - img.shape[1], 0), self.fill,
                          self.padding_mode)
            if img.shape[0] < self.size[0]:
                img = pad(img, (0, self.size[0] - img.shape[0]), self.fill,
                          self.padding_mode)

        i, j, h, w = self.get_params(img, self.size)
        return crop(img, i, j, h, w)

    def __repr__(self):
        return '{}(size={}, padding={})'.format(self.__class__.__name__,
                                                self.size, self.padding)
class RandomResizedCrop(object):
    """Crop a random-area, random-aspect-ratio patch and resize it.

    A crop of random size (default: 0.08 to 1.0 of the original area) and a
    random aspect ratio (default: 3/4 to 4/3 of the original aspect ratio)
    is taken and resized to the given size. Popularly used to train the
    Inception networks.

    Args:
        size: expected output size of each edge.
        scale: range of the area fraction of the crop.
        ratio: range of the aspect ratio of the crop.
        interpolation: cv2 interpolation flag; default ``cv2.INTER_LINEAR``.
    """

    def __init__(self,
                 size,
                 scale=(0.08, 1.0),
                 ratio=(3. / 4., 4. / 3.),
                 interpolation=cv2.INTER_LINEAR):
        self.size = (size, size)
        self.interpolation = interpolation
        self.scale = scale
        self.ratio = ratio

    @staticmethod
    def get_params(img, scale, ratio):
        """Sample (i, j, h, w) for a random sized crop of ``img``.

        Args:
            img (numpy ndarray): image to be cropped.
            scale (tuple): range of the area fraction of the crop.
            ratio (tuple): range of the aspect ratio of the crop.

        Returns:
            tuple: (i, j, h, w) to be passed to ``crop``.
        """
        # image area is loop-invariant; hoisted out of the sampling loop
        area = img.shape[0] * img.shape[1]
        for _ in range(10):
            target_area = random.uniform(*scale) * area
            aspect_ratio = random.uniform(*ratio)

            w = int(round(math.sqrt(target_area * aspect_ratio)))
            h = int(round(math.sqrt(target_area / aspect_ratio)))

            if random.random() < 0.5:
                w, h = h, w

            if w <= img.shape[1] and h <= img.shape[0]:
                i = random.randint(0, img.shape[0] - h)
                j = random.randint(0, img.shape[1] - w)
                return i, j, h, w

        # Fallback: central square crop of the shorter side
        side = min(img.shape[0], img.shape[1])
        i = (img.shape[0] - side) // 2
        j = (img.shape[1] - side) // 2
        return i, j, side, side

    def __call__(self, img):
        """Randomly crop ``img`` and resize it to ``self.size``."""
        i, j, h, w = self.get_params(img, self.scale, self.ratio)
        return resized_crop(img, i, j, h, w, self.size, self.interpolation)

    def __repr__(self):
        interp = _cv2_interpolation_from_str[self.interpolation]
        return '{}(size={}, scale={}, ratio={}, interpolation={})'.format(
            self.__class__.__name__, self.size,
            tuple(round(s, 4) for s in self.scale),
            tuple(round(r, 4) for r in self.ratio), interp)
+ """ + if random.random() < self.p: + return hflip(img) + return img + + def __repr__(self): + return self.__class__.__name__ + '(p={})'.format(self.p) diff --git a/modelscope/preprocessors/cv/image_classification_preprocessor.py b/modelscope/preprocessors/cv/image_classification_preprocessor.py new file mode 100644 index 00000000..fa98315b --- /dev/null +++ b/modelscope/preprocessors/cv/image_classification_preprocessor.py @@ -0,0 +1,340 @@ +# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved. +# The part implementation is also open-sourced by the authors, +# and available at https://github.com/alibaba/EssentialMC2 +import os +from typing import Any, Dict + +import cv2 +import numpy as np +import torch +import torchvision.transforms as transforms +from PIL import Image +from torchvision.transforms.functional import InterpolationMode + +import modelscope.preprocessors.cv.cv2_transforms as cv2_transforms +from modelscope.fileio import File +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.base import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS, build_preprocessor +from modelscope.utils.constant import Fields, ModeKeys +from modelscope.utils.registry import default_group + +BACKEND_TORCHVISION = 'torchvision' +BACKEND_PILLOW = 'pillow' +BACKEND_CV2 = 'cv2' +BACKENDS = (BACKEND_PILLOW, BACKEND_CV2, BACKEND_TORCHVISION) + +INTERPOLATION_STYLE = { + 'bilinear': InterpolationMode('bilinear'), + 'nearest': InterpolationMode('nearest'), + 'bicubic': InterpolationMode('bicubic'), +} +INTERPOLATION_STYLE_CV2 = { + 'bilinear': cv2.INTER_LINEAR, + 'nearest': cv2.INTER_NEAREST, + 'bicubic': cv2.INTER_CUBIC, +} + + +def is_pil_image(img): + return isinstance(img, Image.Image) + + +def is_cv2_image(img): + return isinstance(img, np.ndarray) and img.dtype == np.uint8 + + +def is_tensor(t): + return isinstance(t, torch.Tensor) + + +class ImageTransform(object): + + def __init__(self, + 
class ImageTransform(object):
    """Base class for dict-based image transforms.

    A transform reads its image from ``item[input_key]`` and writes the
    result to ``item[output_key]`` (both default to 'img'), and processes it
    with either a pillow/torchvision or a cv2 backend.
    """

    def __init__(self,
                 backend=BACKEND_PILLOW,
                 input_key=None,
                 output_key=None):
        # Falsy keys fall back to the conventional 'img' slot.
        self.input_key = input_key or 'img'
        self.output_key = output_key or 'img'
        self.backend = backend

    def check_image_type(self, input_img):
        """Assert that ``input_img`` matches the configured backend."""
        if self.backend == BACKEND_CV2:
            assert is_cv2_image(
                input_img), 'input should be cv2 image(uint8 np.ndarray)'
        elif self.backend == BACKEND_PILLOW:
            assert is_pil_image(input_img), 'input should be PIL Image'
+ """ + + def __init__(self, + size, + padding=None, + pad_if_needed=False, + fill=0, + padding_mode='constant', + **kwargs): + + super(RandomCrop, self).__init__(**kwargs) + assert self.backend in BACKENDS + if self.backend in (BACKEND_PILLOW, BACKEND_TORCHVISION): + self.callable = transforms.RandomCrop( + size, + padding=padding, + pad_if_needed=pad_if_needed, + fill=fill, + padding_mode=padding_mode) + else: + self.callable = cv2_transforms.RandomCrop( + size, + padding=padding, + pad_if_needed=pad_if_needed, + fill=fill, + padding_mode=padding_mode) + + def __call__(self, item): + self.check_image_type(item[self.input_key]) + item[self.output_key] = self.callable(item[self.input_key]) + return item + + +@PREPROCESSORS.register_module(Fields.cv) +class RandomResizedCrop(ImageTransform): + """Crop a random portion of image and resize it to a given size. + + If the image is torch Tensor, it is expected to have [..., H, W] shape. + + Args: + size (int or sequence): Desired output size. + If size is a sequence like (h, w), the output size will be matched to this. + If size is an int, the output size will be matched to (size, size). + scale (tuple of float): Specifies the lower and upper bounds for the random area of the crop, + before resizing. The scale is defined with respect to the area of the original image. + ratio (tuple of float): lower and upper bounds for the random aspect ratio of the crop, before + resizing. + interpolation (str): Desired interpolation string, 'bilinear', 'nearest', 'bicubic' are supported. + """ + + def __init__(self, + size, + scale=(0.08, 1.0), + ratio=(3. / 4., 4. 
@PREPROCESSORS.register_module(Fields.cv)
class RandomResizedCrop(ImageTransform):
    """Crop a random portion of the sample's image and resize it.

    If the image is torch Tensor, it is expected to have [..., H, W] shape.

    Args:
        size (int or sequence): desired output size; a sequence (h, w) is
            used as-is, an int yields a square (size, size) output.
        scale (tuple of float): lower/upper bounds of the random crop area,
            relative to the area of the original image.
        ratio (tuple of float): lower/upper bounds of the random aspect
            ratio of the crop, before resizing.
        interpolation (str): one of 'bilinear', 'nearest', 'bicubic'.
    """

    def __init__(self,
                 size,
                 scale=(0.08, 1.0),
                 ratio=(3. / 4., 4. / 3.),
                 interpolation='bilinear',
                 **kwargs):
        super(RandomResizedCrop, self).__init__(**kwargs)
        assert self.backend in BACKENDS
        self.interpolation = interpolation
        if self.backend == BACKEND_CV2:
            assert interpolation in INTERPOLATION_STYLE_CV2
            self.callable = cv2_transforms.RandomResizedCrop(
                size, scale, ratio, INTERPOLATION_STYLE_CV2[interpolation])
        else:
            assert interpolation in INTERPOLATION_STYLE
            self.callable = transforms.RandomResizedCrop(
                size, scale, ratio, INTERPOLATION_STYLE[interpolation])

    def __call__(self, item):
        """Crop-and-resize ``item[input_key]`` into ``item[output_key]``."""
        self.check_image_type(item[self.input_key])
        item[self.output_key] = self.callable(item[self.input_key])
        return item
+ """ + + def __init__(self, size, interpolation='bilinear', **kwargs): + super(Resize, self).__init__(**kwargs) + assert self.backend in BACKENDS + self.size = size + self.interpolation = interpolation + if self.backend in (BACKEND_PILLOW, BACKEND_TORCHVISION): + assert interpolation in INTERPOLATION_STYLE + else: + assert interpolation in INTERPOLATION_STYLE_CV2 + self.callable = transforms.Resize(size, INTERPOLATION_STYLE[interpolation]) \ + if self.backend in (BACKEND_PILLOW, BACKEND_TORCHVISION) \ + else cv2_transforms.Resize(size, INTERPOLATION_STYLE_CV2[interpolation]) + + def __call__(self, item): + self.check_image_type(item[self.input_key]) + item[self.output_key] = self.callable(item[self.input_key]) + return item + + +@PREPROCESSORS.register_module(Fields.cv) +class CenterCrop(ImageTransform): + """ Crops the given image at the center. + + If the image is torch Tensor, it is expected to have [..., H, W] shape. + + Args: + size (sequence or int): Desired output size. + If size is a sequence like (h, w), the output size will be matched to this. + If size is an int, the output size will be matched to (size, size). + """ + + def __init__(self, size, **kwargs): + super(CenterCrop, self).__init__(**kwargs) + assert self.backend in BACKENDS + self.size = size + self.callable = transforms.CenterCrop(size) \ + if self.backend in (BACKEND_PILLOW, BACKEND_TORCHVISION) else cv2_transforms.CenterCrop(size) + + def __call__(self, item): + self.check_image_type(item[self.input_key]) + item[self.output_key] = self.callable(item[self.input_key]) + return item + + +@PREPROCESSORS.register_module(Fields.cv) +class RandomHorizontalFlip(ImageTransform): + """ Horizontally flip the given image randomly with a given probability. + + If the image is torch Tensor, it is expected to have [..., H, W] shape. + + Args: + p (float): probability of the image being flipped. 
@PREPROCESSORS.register_module(Fields.cv)
class Normalize(ImageTransform):
    """Normalize a tensor image with mean and standard deviation.
    This transform only support tensor image.

    Args:
        mean (sequence): Sequence of means for each channel.
        std (sequence): Sequence of standard deviations for each channel.
    """

    def __init__(self, mean, std, **kwargs):
        super(Normalize, self).__init__(**kwargs)
        assert self.backend in BACKENDS
        self.mean = np.array(mean, dtype=np.float32)
        self.std = np.array(std, dtype=np.float32)
        if self.backend == BACKEND_CV2:
            self.callable = cv2_transforms.Normalize(self.mean, self.std)
        else:
            self.callable = transforms.Normalize(self.mean, self.std)

    def __call__(self, item):
        # No check_image_type here: input must already be a tensor.
        item[self.output_key] = self.callable(item[self.input_key])
        return item
+ """ + + def __init__(self, **kwargs): + super(ImageToTensor, self).__init__(**kwargs) + assert self.backend in BACKENDS + + if self.backend == BACKEND_PILLOW: + self.callable = transforms.ToTensor() + elif self.backend == BACKEND_CV2: + self.callable = cv2_transforms.ToTensor() + else: + self.callable = transforms.ConvertImageDtype(torch.float) + + def __call__(self, item): + item[self.output_key] = self.callable(item[self.input_key]) + return item + + +def build_preprocess_pipeline(pipeline, group_name=Fields.cv): + if isinstance(pipeline, list): + if len(pipeline) == 0: + return build_preprocessor( + dict(type='Identity'), field_name=default_group) + elif len(pipeline) == 1: + return build_preprocess_pipeline(pipeline[0]) + else: + return build_preprocessor( + dict( + type='Compose', transforms=pipeline, + field_name=group_name), + field_name=default_group) + elif isinstance(pipeline, dict): + return build_preprocessor(pipeline, field_name=group_name) + elif pipeline is None: + return build_preprocessor( + dict(type='Identity'), field_name=default_group) + else: + raise TypeError( + f'Expect pipeline_cfg to be dict or list or None, got {type(pipeline)}' + ) + + +@PREPROCESSORS.register_module( + Fields.cv, module_name=Preprocessors.image_classification_preprocessor) +class ImageClassificationPreprocessor(Preprocessor): + + def __init__(self, *args, **kwargs): + """image classification preprocessor in the fine-tune scenario + """ + super().__init__(*args, **kwargs) + + self.training = kwargs.pop('training', True) + self.preprocessor_train_cfg = kwargs.pop('train', None) + self.preprocessor_test_cfg = kwargs.pop('val', None) + + if self.preprocessor_train_cfg is not None: + self.train_preprocess_pipeline = build_preprocess_pipeline( + self.preprocessor_train_cfg) + + if self.preprocessor_test_cfg is not None: + self.test_preprocess_pipeline = build_preprocess_pipeline( + self.preprocessor_test_cfg) + + def __call__(self, results: Dict[str, Any]): + """process the 
raw input data + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + Dict[str, Any] | None: the preprocessed data + """ + if self.mode == ModeKeys.TRAIN: + pipline = self.train_preprocess_pipeline + else: + pipline = self.test_preprocess_pipeline + + return pipline(results) diff --git a/modelscope/preprocessors/image.py b/modelscope/preprocessors/image.py index 666d2b29..36ab2f2f 100644 --- a/modelscope/preprocessors/image.py +++ b/modelscope/preprocessors/image.py @@ -24,10 +24,12 @@ class LoadImage: "scale_factor" (1.0) and "img_norm_cfg" (means=0 and stds=1). Args: mode (str): See :ref:`PIL.Mode`. + backend (str): Type of loading image. Should be: cv2 or pillow. Default is pillow. """ - def __init__(self, mode='rgb'): + def __init__(self, mode='rgb', backend='pillow'): self.mode = mode.upper() + self.backend = backend def __call__(self, input: Union[str, Dict[str, str]]): """Call functions to load image and get image meta information. @@ -42,21 +44,38 @@ class LoadImage: else: image_path_or_url = input - bytes = File.read(image_path_or_url) - # TODO @wenmeng.zwm add opencv decode as optional - # we should also look at the input format which is the most commonly - # used in Mind' image related models - with io.BytesIO(bytes) as infile: - img = Image.open(infile) - img = ImageOps.exif_transpose(img) - img = img.convert(self.mode) + if self.backend == 'cv2': + storage = File._get_storage(image_path_or_url) + with storage.as_local_path(image_path_or_url) as img_path: + img = cv2.imread(img_path, cv2.IMREAD_COLOR) + if self.mode == 'RGB': + cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) + img_h, img_w, img_c = img.shape[0], img.shape[1], img.shape[2] + img_shape = (img_h, img_w, img_c) + elif self.backend == 'pillow': + bytes = File.read(image_path_or_url) + # TODO @wenmeng.zwm add opencv decode as optional + # we should also look at the input format which is the most commonly + # used in Mind' image related models + with io.BytesIO(bytes) as 
infile: + img = Image.open(infile) + img = ImageOps.exif_transpose(img) + img = img.convert(self.mode) + img_shape = (img.size[1], img.size[0], 3) + else: + raise TypeError(f'backend should be either cv2 or pillow,' + f'but got {self.backend}') results = { 'filename': image_path_or_url, 'img': img, - 'img_shape': (img.size[1], img.size[0], 3), + 'img_shape': img_shape, 'img_field': 'img', } + if isinstance(input, dict): + input_ret = input.copy() + input_ret.update(results) + results = input_ret return results def __repr__(self): diff --git a/modelscope/trainers/cv/__init__.py b/modelscope/trainers/cv/__init__.py index c31342ae..07a9440b 100644 --- a/modelscope/trainers/cv/__init__.py +++ b/modelscope/trainers/cv/__init__.py @@ -13,6 +13,7 @@ if TYPE_CHECKING: from .image_defrcn_fewshot_detection_trainer import ImageDefrcnFewshotTrainer from .cartoon_translation_trainer import CartoonTranslationTrainer from .nerf_recon_acc_trainer import NeRFReconAccTrainer + from .vision_efficient_tuning_trainer import VisionEfficientTuningTrainer else: _import_structure = { @@ -28,6 +29,7 @@ else: ['ImageDefrcnFewshotTrainer'], 'cartoon_translation_trainer': ['CartoonTranslationTrainer'], 'nerf_recon_acc_trainer': ['NeRFReconAccTrainer'], + 'vision_efficient_tuning_trainer': ['VisionEfficientTuningTrainer'], } import sys diff --git a/modelscope/trainers/cv/vision_efficient_tuning_trainer.py b/modelscope/trainers/cv/vision_efficient_tuning_trainer.py new file mode 100644 index 00000000..4c7dca73 --- /dev/null +++ b/modelscope/trainers/cv/vision_efficient_tuning_trainer.py @@ -0,0 +1,114 @@ +# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
@TRAINERS.register_module(module_name=Trainers.vision_efficient_tuning)
class VisionEfficientTuningTrainer(EpochBasedTrainer):
    """ Vision Efficient Tuning Trainer based on EpochBasedTrainer

    The trainer freezes the parameters of the pre-trained model and
    tunes the extra parameters of the different parameter-efficient
    transfer learning (PETL) method.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def build_model(self) -> Union[nn.Module, TorchModel]:
        """ Instantiate a pytorch model and return.

        By default, we will create a model using config from configuration
        file. You can override this method in a subclass. The optional
        ``freeze_cfg`` section of the model config is applied before the
        model is returned.
        """
        model = Model.from_pretrained(self.model_dir, cfg_dict=self.cfg)
        if 'freeze_cfg' in self.cfg['model']:
            model = self.freeze(model, **self.cfg['model']['freeze_cfg'])
        if not isinstance(model, nn.Module) and hasattr(model, 'model'):
            # unwrap the inner torch module of a wrapper Model
            return model.model
        # Fix: the original fell through and implicitly returned None when
        # the model was neither an nn.Module nor had a ``.model`` attribute.
        return model

    def train(self, *args, **kwargs):
        # Log the trainable-parameter summary before training starts.
        self.print_model_params_status()
        super().train(*args, **kwargs)

    def evaluate(self, *args, **kwargs):
        metric_values = super().evaluate(*args, **kwargs)
        return metric_values

    def freeze(self, model, freeze_part=None, train_part=None):
        """ Freeze or unfreeze model parameters based on the config.

        Args:
            model: the current model (optionally wrapped in ``.module``).
            freeze_part (dict): maps 'backbone' / 'head' to lists of
                parameter-name fragments whose parameters are frozen.
            train_part (dict): maps 'backbone' / 'head' to lists of
                parameter-name fragments whose parameters stay trainable.
        """
        # Fix: avoid mutable default arguments ([]).
        freeze_part = freeze_part or {}
        train_part = train_part or {}
        freeze_model = model.module if hasattr(model, 'module') else model

        def _set_requires_grad(module, fragments, requires_grad):
            # Toggle every parameter whose name contains any fragment.
            for name, param in module.named_parameters():
                if any(p in name for p in fragments):
                    param.requires_grad = requires_grad

        # Fix: the original used if/elif, so the 'head' section was silently
        # ignored whenever 'backbone' was also configured; both sections are
        # now applied independently (for freeze_part and train_part alike).
        if freeze_part:
            if 'backbone' in freeze_part:
                _set_requires_grad(freeze_model.model.backbone,
                                   freeze_part['backbone'], False)
            if 'head' in freeze_part:
                _set_requires_grad(freeze_model.model.head,
                                   freeze_part['head'], False)

        if train_part:
            if 'backbone' in train_part:
                _set_requires_grad(freeze_model.model.backbone,
                                   train_part['backbone'], True)
            if 'head' in train_part:
                _set_requires_grad(freeze_model.model.head,
                                   train_part['head'], True)
        return model

    def print_model_params_status(self, model=None, logger=None):
        """Print the status and parameters of the model"""
        if model is None:
            model = self.model
        if logger is None:
            logger = self.logger
        train_param_dict = {}
        all_param_numel = 0
        for key, val in model.named_parameters():
            if val.requires_grad:
                # group trainable params by the first two name components
                # below the top-level module
                sub_key = '.'.join(key.split('.', 1)[-1].split('.', 2)[:2])
                train_param_dict[sub_key] = (
                    train_param_dict.get(sub_key, 0) + val.numel())
            all_param_numel += val.numel()
        train_param_numel = sum(train_param_dict.values())
        logger.info(
            f'Load trainable params {train_param_numel} / {all_param_numel} = '
            f'{train_param_numel/all_param_numel:.2%}, '
            f'train part: {train_param_dict}.')
a/modelscope/utils/chinese_utils.py +++ b/modelscope/utils/chinese_utils.py @@ -3,8 +3,6 @@ import re import string -from zhconv import convert - CHINESE_PUNCTUATION = '"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、\u3000、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·!?。。' ENGLISH_PUNCTUATION = string.punctuation @@ -58,6 +56,8 @@ def _is_chinese_char(cp: str) -> bool: def normalize_chinese_number(text): + from zhconv import convert + chinese_number = ['零', '一', '二', '三', '四', '五', '六', '七', '八', '九'] new_text = '' for x in text: diff --git a/modelscope/utils/demo_utils.py b/modelscope/utils/demo_utils.py index 82bf1ada..99e61d45 100644 --- a/modelscope/utils/demo_utils.py +++ b/modelscope/utils/demo_utils.py @@ -30,6 +30,7 @@ TASKS_INPUT_TEMPLATES = { Tasks.ocr_detection: TasksIODescriptions.image_to_text, Tasks.ocr_recognition: TasksIODescriptions.image_to_text, Tasks.body_2d_keypoints: TasksIODescriptions.image_to_text, + Tasks.vision_efficient_tuning: TasksIODescriptions.image_to_text, # nlp tasks Tasks.text_classification: TasksIODescriptions.text_to_text, diff --git a/tests/pipelines/test_vision_efficient_tuning.py b/tests/pipelines/test_vision_efficient_tuning.py new file mode 100644 index 00000000..c88ed478 --- /dev/null +++ b/tests/pipelines/test_vision_efficient_tuning.py @@ -0,0 +1,154 @@ +# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
import unittest

from modelscope.models import Model
from modelscope.models.cv.vision_efficient_tuning.model import \
    VisionEfficientTuningModel
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.demo_utils import DemoCompatibilityCheck
from modelscope.utils.test_utils import test_level


class VisionEfficientTuningTest(unittest.TestCase, DemoCompatibilityCheck):
    """Pipeline, model-loading and demo-compatibility tests for every
    released vision-efficient-tuning model variant.

    The per-variant bodies were identical copy-paste; they are factored
    into three helpers, while every public test method name is kept so
    unittest discovery and test-level filtering are unchanged.
    """

    # Common model-id prefix; each variant appends its PETL method name.
    MODEL_ID_PREFIX = 'damo/cv_vitb16_classification_vision-efficient-tuning'
    IMG_PATH = 'data/test/images/vision_efficient_tuning_test_1.png'

    def setUp(self) -> None:
        self.task = Tasks.vision_efficient_tuning

    def _run_pipeline(self, variant):
        """Run the classification pipeline of `variant` on the test image."""
        petl_pipeline = pipeline(self.task, f'{self.MODEL_ID_PREFIX}-{variant}')
        result = petl_pipeline(self.IMG_PATH)
        print(f'Vision-efficient-tuning-{variant} output: {result}.')

    def _check_from_pretrained(self, variant):
        """Load `variant` via Model.from_pretrained and verify its class."""
        model = Model.from_pretrained(f'{self.MODEL_ID_PREFIX}-{variant}')
        self.assertTrue(model.__class__ == VisionEfficientTuningModel)

    def _check_demo(self, variant):
        """Run the DemoCompatibilityCheck for `variant`."""
        self.model_id = f'{self.MODEL_ID_PREFIX}-{variant}'
        self.compatibility_check()

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_adapter_run_pipeline(self):
        self._run_pipeline('adapter')

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_vision_efficient_tuning_adapter_load_model_from_pretrained(self):
        self._check_from_pretrained('adapter')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_adapter_demo_compatibility(self):
        self._check_demo('adapter')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_lora_run_pipeline(self):
        self._run_pipeline('lora')

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_vision_efficient_tuning_lora_load_model_from_pretrained(self):
        self._check_from_pretrained('lora')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_lora_demo_compatibility(self):
        self._check_demo('lora')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_prefix_run_pipeline(self):
        self._run_pipeline('prefix')

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_vision_efficient_tuning_prefix_load_model_from_pretrained(self):
        self._check_from_pretrained('prefix')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_prefix_demo_compatibility(self):
        self._check_demo('prefix')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_prompt_run_pipeline(self):
        self._run_pipeline('prompt')

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_vision_efficient_tuning_prompt_load_model_from_pretrained(self):
        self._check_from_pretrained('prompt')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_prompt_demo_compatibility(self):
        self._check_demo('prompt')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_bitfit_run_pipeline(self):
        self._run_pipeline('bitfit')

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_vision_efficient_tuning_bitfit_load_model_from_pretrained(self):
        self._check_from_pretrained('bitfit')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_bitfit_demo_compatibility(self):
        self._check_demo('bitfit')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_sidetuning_run_pipeline(self):
        self._run_pipeline('sidetuning')

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_vision_efficient_tuning_sidetuning_load_model_from_pretrained(
            self):
        self._check_from_pretrained('sidetuning')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_sidetuning_demo_compatibility(self):
        self._check_demo('sidetuning')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_utuning_run_pipeline(self):
        self._run_pipeline('utuning')

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_vision_efficient_tuning_utuning_load_model_from_pretrained(self):
        self._check_from_pretrained('utuning')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_utuning_demo_compatibility(self):
        self._check_demo('utuning')


if __name__ == '__main__':
    unittest.main()
/dev/null @@ -1,37 +0,0 @@ -# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved. -import unittest - -from modelscope.models import Model -from modelscope.models.cv.vision_efficient_tuning.vision_efficient_tuning import \ - VisionEfficientTuningModel -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks -from modelscope.utils.demo_utils import DemoCompatibilityCheck -from modelscope.utils.test_utils import test_level - - -class VisionEfficientTuningAdapterTest(unittest.TestCase, - DemoCompatibilityCheck): - - def setUp(self) -> None: - self.task = Tasks.vision_efficient_tuning - self.model_id = 'damo/cv_vitb16_classification_vision-efficient-tuning-adapter' - - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_run_pipeline(self): - - petl_pipeline = pipeline(self.task, self.model_id) - result = petl_pipeline( - 'data/test/images/vision_efficient_tuning_test_1.png') - - print(f'Vision-efficient-tuning-adapter output: {result}.') - - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') - def test_load_model_from_pretrained(self): - model = Model.from_pretrained( - 'damo/cv_vitb16_classification_vision-efficient-tuning-adapter') - self.assertTrue(model.__class__ == VisionEfficientTuningModel) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/pipelines/test_vision_efficient_tuning_lora.py b/tests/pipelines/test_vision_efficient_tuning_lora.py deleted file mode 100644 index 6c49453a..00000000 --- a/tests/pipelines/test_vision_efficient_tuning_lora.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
-import unittest - -from modelscope.models import Model -from modelscope.models.cv.vision_efficient_tuning.vision_efficient_tuning import \ - VisionEfficientTuningModel -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks -from modelscope.utils.demo_utils import DemoCompatibilityCheck -from modelscope.utils.test_utils import test_level - - -class VisionEfficientTuningLoRATest(unittest.TestCase, DemoCompatibilityCheck): - - def setUp(self) -> None: - self.task = Tasks.vision_efficient_tuning - self.model_id = 'damo/cv_vitb16_classification_vision-efficient-tuning-lora' - - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_run_pipeline(self): - - petl_pipeline = pipeline(self.task, self.model_id) - result = petl_pipeline( - 'data/test/images/vision_efficient_tuning_test_1.png') - - print(f'Vision-efficient-tuning-lora output: {result}.') - - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') - def test_load_model_from_pretrained(self): - model = Model.from_pretrained( - 'damo/cv_vitb16_classification_vision-efficient-tuning-lora') - self.assertTrue(model.__class__ == VisionEfficientTuningModel) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/pipelines/test_vision_efficient_tuning_prefix.py b/tests/pipelines/test_vision_efficient_tuning_prefix.py deleted file mode 100644 index 0eca5819..00000000 --- a/tests/pipelines/test_vision_efficient_tuning_prefix.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
-import unittest - -from modelscope.models import Model -from modelscope.models.cv.vision_efficient_tuning.vision_efficient_tuning import \ - VisionEfficientTuningModel -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks -from modelscope.utils.demo_utils import DemoCompatibilityCheck -from modelscope.utils.test_utils import test_level - - -class VisionEfficientTuningPrefixTest(unittest.TestCase, - DemoCompatibilityCheck): - - def setUp(self) -> None: - self.task = Tasks.vision_efficient_tuning - self.model_id = 'damo/cv_vitb16_classification_vision-efficient-tuning-prefix' - - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_run_pipeline(self): - - petl_pipeline = pipeline(self.task, self.model_id) - result = petl_pipeline( - 'data/test/images/vision_efficient_tuning_test_1.png') - - print(f'Vision-efficient-tuning-prefix output: {result}.') - - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') - def test_load_model_from_pretrained(self): - model = Model.from_pretrained( - 'damo/cv_vitb16_classification_vision-efficient-tuning-prefix') - self.assertTrue(model.__class__ == VisionEfficientTuningModel) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/pipelines/test_vision_efficient_tuning_prompt.py b/tests/pipelines/test_vision_efficient_tuning_prompt.py deleted file mode 100644 index 97d97811..00000000 --- a/tests/pipelines/test_vision_efficient_tuning_prompt.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
-import unittest - -from modelscope.models import Model -from modelscope.models.cv.vision_efficient_tuning.vision_efficient_tuning import \ - VisionEfficientTuningModel -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks -from modelscope.utils.demo_utils import DemoCompatibilityCheck -from modelscope.utils.test_utils import test_level - - -class VisionEfficientTuningPromptTest(unittest.TestCase, - DemoCompatibilityCheck): - - def setUp(self) -> None: - self.task = Tasks.vision_efficient_tuning - self.model_id = 'damo/cv_vitb16_classification_vision-efficient-tuning-prompt' - - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_run_pipeline(self): - - petl_pipeline = pipeline(self.task, self.model_id) - result = petl_pipeline( - 'data/test/images/vision_efficient_tuning_test_1.png') - - print(f'Vision-efficient-tuning-prompt output: {result}.') - - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') - def test_load_model_from_pretrained(self): - model = Model.from_pretrained( - 'damo/cv_vitb16_classification_vision-efficient-tuning-prompt') - self.assertTrue(model.__class__ == VisionEfficientTuningModel) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/trainers/test_finetune_vision_efficient_tuning.py b/tests/trainers/test_finetune_vision_efficient_tuning.py new file mode 100644 index 00000000..8719c64f --- /dev/null +++ b/tests/trainers/test_finetune_vision_efficient_tuning.py @@ -0,0 +1,355 @@ +# Copyright 2022-2023 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
import os
import shutil
import tempfile
import unittest

from modelscope.metainfo import Trainers
from modelscope.msdatasets import MsDataset
from modelscope.trainers import build_trainer
from modelscope.utils.test_utils import test_level


class TestVisionEfficientTuningTrainer(unittest.TestCase):
    """Finetune/eval tests for VisionEfficientTuningTrainer over every
    released PETL variant.

    The per-variant bodies were identical copy-paste; they are factored
    into ``_run_train`` / ``_run_eval`` helpers, while every public test
    method name is kept so unittest discovery is unchanged.
    """

    # Common model-id prefix; each variant appends its PETL method name.
    MODEL_ID_PREFIX = 'damo/cv_vitb16_classification_vision-efficient-tuning'

    def setUp(self):
        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))

        self.train_dataset = MsDataset.load(
            'foundation_model_evaluation_benchmark',
            namespace='damo',
            subset_name='OxfordFlowers',
            split='train')

        self.eval_dataset = MsDataset.load(
            'foundation_model_evaluation_benchmark',
            namespace='damo',
            subset_name='OxfordFlowers',
            split='eval')

        self.max_epochs = 1
        self.num_classes = 102
        self.tune_length = 10

        # Fix: the original used tempfile.TemporaryDirectory().name, which
        # drops the TemporaryDirectory object so its finalizer may remove
        # the directory at any time; mkdtemp creates a directory that
        # persists until tearDown removes it.
        self.tmp_dir = tempfile.mkdtemp()

    def tearDown(self):
        shutil.rmtree(self.tmp_dir, ignore_errors=True)
        super().tearDown()

    def _make_cfg_modify_fn(self, tune_attr=None):
        """Build a cfg_modify_fn setting the shared finetune options and,
        when given, the variant-specific backbone tuning-length attribute
        (e.g. 'adapter_length')."""

        def cfg_modify_fn(cfg):
            cfg.model.head.num_classes = self.num_classes
            cfg.model.finetune = True
            cfg.train.max_epochs = self.max_epochs
            cfg.train.lr_scheduler.T_max = self.max_epochs
            if tune_attr is not None:
                setattr(cfg.model.backbone, tune_attr, self.tune_length)
            return cfg

        return cfg_modify_fn

    def _run_train(self, variant, tune_attr=None):
        """Train + evaluate `variant` and check the expected output files."""
        kwargs = dict(
            model=f'{self.MODEL_ID_PREFIX}-{variant}',
            work_dir=self.tmp_dir,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            cfg_modify_fn=self._make_cfg_modify_fn(tune_attr))

        trainer = build_trainer(
            name=Trainers.vision_efficient_tuning, default_args=kwargs)
        trainer.train()
        result = trainer.evaluate()
        print(f'Vision-efficient-tuning-{variant} train output: {result}.')

        results_files = os.listdir(self.tmp_dir)
        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
        for i in range(self.max_epochs):
            self.assertIn(f'epoch_{i + 1}.pth', results_files)

    def _run_eval(self, variant):
        """Evaluate the pretrained `variant` without any training."""
        kwargs = dict(
            model=f'{self.MODEL_ID_PREFIX}-{variant}',
            work_dir=self.tmp_dir,
            train_dataset=None,
            eval_dataset=self.eval_dataset)

        trainer = build_trainer(
            name=Trainers.vision_efficient_tuning, default_args=kwargs)
        result = trainer.evaluate()
        print(f'Vision-efficient-tuning-{variant} eval output: {result}.')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_adapter_train(self):
        self._run_train('adapter', tune_attr='adapter_length')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_adapter_eval(self):
        self._run_eval('adapter')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_lora_train(self):
        self._run_train('lora', tune_attr='lora_length')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_lora_eval(self):
        self._run_eval('lora')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_prefix_train(self):
        self._run_train('prefix', tune_attr='prefix_length')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_prefix_eval(self):
        self._run_eval('prefix')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_prompt_train(self):
        self._run_train('prompt', tune_attr='prompt_length')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_prompt_eval(self):
        self._run_eval('prompt')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_bitfit_train(self):
        self._run_train('bitfit')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_bitfit_eval(self):
        self._run_eval('bitfit')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_sidetuning_train(self):
        self._run_train('sidetuning')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_sidetuning_eval(self):
        self._run_eval('sidetuning')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_utuning_train(self):
        self._run_train('utuning')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_vision_efficient_tuning_utuning_eval(self):
        self._run_eval('utuning')


if __name__ == '__main__':
    unittest.main()