add 6 ofa tasks

2026-02-24 12:10:09 +01:00 · 2022-07-28 23:25:17 +08:00
parent ca4b5b2565
commit d1d2c96dd9
41 changed files with 2290 additions and 270 deletions
--- a/data/test/images/image_classification.png
+++ b/data/test/images/image_classification.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bdb9627c3a40897e84ee186b2a959f272790571644224e1d2efca443f867e12
+size 202823
--- a/data/test/images/visual_grounding.png
+++ b/data/test/images/visual_grounding.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b89734b9c9d89342e58fbe406d3b9bdc8e07447cb170a4ae2743000471fc969
+size 23069
--- a/data/test/images/visual_question_answering.png
+++ b/data/test/images/visual_question_answering.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d53e9fbdd129b234dcbec9b9fe6a15a0e05820e802a873f95955574267bbd2ff
+size 121141
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -69,6 +69,7 @@ class Pipelines(object):
    action_recognition = 'TAdaConv_action-recognition'
    animal_recognation = 'resnet101-animal_recog'
    cmdssl_video_embedding = 'cmdssl-r2p1d_video_embedding'
+    image_classification = 'image-classification'
    face_detection = 'resnet-face-detection-scrfd10gkps'
    live_category = 'live-category'
    general_image_classification = 'vit-base_image-classification_ImageNet-labels'
@@ -92,6 +93,7 @@ class Pipelines(object):
    text_generation = 'text-generation'
    sentiment_analysis = 'sentiment-analysis'
    sentiment_classification = 'sentiment-classification'
+    text_classification = 'text-classification'
    fill_mask = 'fill-mask'
    csanmt_translation = 'csanmt-translation'
    nli = 'nli'
@@ -113,6 +115,8 @@ class Pipelines(object):
    multi_modal_embedding = 'multi-modal-embedding'
    generative_multi_modal_embedding = 'generative-multi-modal-embedding'
    visual_question_answering = 'visual-question-answering'
+    visual_grounding = 'visual-grounding'
+    visual_entailment = 'visual-entailment'
    text_to_image_synthesis = 'text-to-image-synthesis'
    video_multi_modal_embedding = 'video-multi-modal-embedding'

--- a/modelscope/models/multi_modal/init.py
+++ b/modelscope/models/multi_modal/init.py
@@ -11,7 +11,6 @@ if TYPE_CHECKING:
    from .mmr import VideoCLIPForMultiModalEmbedding
    from .mplug_for_visual_question_answering import \
        MPlugForVisualQuestionAnswering
-    from .ofa_for_image_captioning_model import OfaForImageCaptioning

 else:
    _import_structure = {
@@ -21,7 +20,7 @@ else:
        'mmr': ['VideoCLIPForMultiModalEmbedding'],
        'mplug_for_visual_question_answering':
        ['MPlugForVisualQuestionAnswering'],
-        'ofa_for_image_captioning_model': ['OfaForImageCaptioning']
+        'ofa_for_all_tasks': ['OfaForAllTasks']
    }

    import sys
--- a/modelscope/models/multi_modal/image_captioning_model.py
+++ b/modelscope/models/multi_modal/image_captioning_model.py
@@ -1,86 +0,0 @@
-import os.path as osp
-from typing import Any, Dict
-
-import torch.cuda
-from PIL import Image
-
-from modelscope.metainfo import Models
-from modelscope.models.base import Model
-from modelscope.models.builder import MODELS
-from modelscope.utils.constant import ModelFile, Tasks
-
-__all__ = ['OfaForImageCaptioning']
-
-
-@MODELS.register_module(Tasks.image_captioning, module_name=Models.ofa)
-class OfaForImageCaptioning(Model):
-
-    def __init__(self, model_dir, *args, **kwargs):
-        super().__init__(model_dir=model_dir, *args, **kwargs)
-        ckpt_name = ModelFile.TORCH_MODEL_FILE
-        local_model = osp.join(model_dir, ckpt_name)
-        bpe_dir = model_dir
-        # turn on cuda if GPU is available
-        from fairseq import checkpoint_utils, tasks, utils
-        from ofa.tasks.mm_tasks import CaptionTask
-        from ofa.utils.eval_utils import eval_caption
-        self.eval_caption = eval_caption
-        tasks.register_task('caption', CaptionTask)
-        if torch.cuda.is_available():
-            self._device = torch.device('cuda')
-        else:
-            self._device = torch.device('cpu')
-        self.use_fp16 = kwargs[
-            'use_fp16'] if 'use_fp16' in kwargs and torch.cuda.is_available()\
-            else False
-        overrides = {
-            'bpe_dir': bpe_dir,
-            'eval_cider': False,
-            'beam': 5,
-            'max_len_b': 16,
-            'no_repeat_ngram_size': 3,
-            'seed': 7
-        }
-        models, cfg, task = checkpoint_utils.load_model_ensemble_and_task(
-            utils.split_paths(local_model), arg_overrides=overrides)
-        # Move models to GPU
-        for model in models:
-            model.eval()
-            model.to(self._device)
-            if self.use_fp16:
-                model.half()
-            model.prepare_for_inference_(cfg)
-        self.models = models
-        # Initialize generator
-        self.generator = task.build_generator(models, cfg.generation)
-
-        # Initialize transform
-        from torchvision import transforms
-        mean = [0.5, 0.5, 0.5]
-        std = [0.5, 0.5, 0.5]
-
-        self.patch_resize_transform = transforms.Compose([
-            lambda image: image.convert('RGB'),
-            transforms.Resize(
-                (cfg.task.patch_image_size, cfg.task.patch_image_size),
-                interpolation=Image.BICUBIC),
-            transforms.ToTensor(),
-            transforms.Normalize(mean=mean, std=std),
-        ])
-        self.task = task
-
-    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
-        import fairseq.utils
-        if torch.cuda.is_available():
-            input = fairseq.utils.move_to_cuda(input, device=self._device)
-        results, _ = self.eval_caption(self.task, self.generator, self.models,
-                                       input)
-        from modelscope.outputs import OutputKeys
-        return {
-            'image_id': results[0]['image_id'],
-            OutputKeys.CAPTION: results[0][OutputKeys.CAPTION]
-        }
-
-    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
-        # What should we do here ?
-        return inputs
--- a/modelscope/models/multi_modal/ofa/generate/sequence_generator.py
+++ b/modelscope/models/multi_modal/ofa/generate/sequence_generator.py
@@ -194,13 +194,6 @@ class SequenceGenerator(nn.Module):
        bos_token: Optional[int] = None,
    ):
        model = EnsembleModel(models)
-        # incremental_states = torch.jit.annotate(
-        #     List[Dict[str, Dict[str, Optional[Tensor]]]],
-        #     [
-        #         torch.jit.annotate(Dict[str, Dict[str, Optional[Tensor]]], {})
-        #         for i in range(model.models_size)
-        #     ],
-        # )
        incremental_states = torch.jit.annotate(
            List[Tuple[Tuple[torch.Tensor]]],
            [
@@ -208,8 +201,6 @@ class SequenceGenerator(nn.Module):
                for i in range(model.models_size)
            ],
        )
-        # print("incremental_states",incremental_states)
-        # print("incremental_states[0]",incremental_states[0])
        net_input = sample['net_input']

        if 'src_tokens' in net_input:
@@ -281,7 +272,6 @@ class SequenceGenerator(nn.Module):
        tokens = (torch.zeros(bsz * beam_size,
                              max_len + 2).to(src_tokens).long().fill_(
                                  self.pad))  # +2 for eos and pad
-        # tokens[:, 0] = self.eos if bos_token is None else bos_token
        tokens[:, 0] = self.bos
        attn: Optional[Tensor] = None

@@ -335,7 +325,7 @@ class SequenceGenerator(nn.Module):
                        corr.unsqueeze(-1) * beam_size)
                    original_batch_idxs = original_batch_idxs[batch_idxs]
                model.reorder_incremental_state(incremental_states,
-                                                reorder_state)  # todo
+                                                reorder_state)
                encoder_outs = model.reorder_encoder_out(
                    encoder_outs, reorder_state)

@@ -479,7 +469,6 @@ class SequenceGenerator(nn.Module):
                batch_mask = torch.ones(
                    bsz, dtype=torch.bool, device=cand_indices.device)
                batch_mask[finalized_sents] = False
-                # TODO replace `nonzero(as_tuple=False)` after TorchScript supports it
                batch_idxs = torch.arange(
                    bsz, device=cand_indices.device).masked_select(batch_mask)

@@ -833,7 +822,7 @@ class EnsembleModel(nn.Module):

            # decode each model
            if self.has_incremental_states():
-                decoder_out = model.decoder.forward(  # todo 模型输入不同
+                decoder_out = model.decoder.forward(
                    input_ids=tokens,
                    attention_mask=attention_mask,
                    encoder_hidden_states=encoder_hidden_states,
@@ -846,7 +835,7 @@ class EnsembleModel(nn.Module):
            else:
                if hasattr(model, 'decoder'):
                    # decoder_out = model.decoder.forward(tokens, code_masks=code_mask, encoder_out=encoder_out)
-                    decoder_out = model.decoder.forward(  # todo 模型输入不同
+                    decoder_out = model.decoder.forward(
                        input_ids=tokens,
                        attention_mask=attention_mask,
                        encoder_hidden_states=encoder_hidden_states,
@@ -855,32 +844,9 @@ class EnsembleModel(nn.Module):
                        src_pos_embed=src_pos_embed)
                else:
                    decoder_out = model.forward(tokens)
-            # print('#### decoder_out ####', decoder_out)
-            # print('#### decoder_out ####', decoder_out.keys())
-            # for k,v in decoder_out.items():
-            #     print(k)
-            #     if isinstance(v, Tensor):
-            #         print(v.shape)
-            #     elif k == "past_key_values":
-            #         print(len(v))
-            #         print([v[0][i].shape for i in range(len(v[0]))])
-            #     else:
-            #         print(len(v))
-            #         print([v[i].shape for i in range(len(v))])

            attn: Optional[Tensor] = None
            decoder_len = len(decoder_out)
-            # if decoder_len > 1 and decoder_out[1] is not None:
-            #     if isinstance(decoder_out[1], Tensor):
-            #         attn = decoder_out[1]
-            #     else:
-            #         attn_holder = decoder_out[1]["attn"]
-            #         if isinstance(attn_holder, Tensor):
-            #             attn = attn_holder
-            #         elif attn_holder is not None:
-            #             attn = attn_holder[0]
-            #     if attn is not None:
-            #         attn = attn[:, -1, :]

            if 'cross_attentions' in decoder_out:
                attn = decoder_out['cross_attentions'][-1].transpose(1, 0)
@@ -888,11 +854,6 @@ class EnsembleModel(nn.Module):
                if attn is not None:
                    attn = attn[:, -1, :]

-            # decoder_out_tuple = (
-            #     decoder_out[0][:, -1:, :].div_(temperature),
-            #     None if decoder_len <= 1 else decoder_out[1],
-            # )
-
            decoder_out_tuple = (
                decoder_out[0][:, -1:, :].div_(temperature),
                None if decoder_len <= 1 else attn,
@@ -993,5 +954,5 @@ class EnsembleModel(nn.Module):
        if not self.has_incremental_states():
            return
        for i, model in enumerate(self.models):
-            model.decoder.reorder_incremental_state_scripting(  # todo
+            model.decoder.reorder_incremental_state_scripting(
                incremental_states[i], new_order)
--- a/modelscope/models/multi_modal/ofa/utils/constant.py
+++ b/modelscope/models/multi_modal/ofa/utils/constant.py
@@ -0,0 +1,13 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import Tasks
+
+OFA_TASK_KEY_MAPPING = {
+    Tasks.image_captioning: OutputKeys.CAPTION,
+    Tasks.summarization: OutputKeys.TEXT,
+    Tasks.visual_question_answering: OutputKeys.TEXT,
+    Tasks.visual_grounding: OutputKeys.BOXES,
+    Tasks.text_classification: (OutputKeys.SCORES, OutputKeys.LABELS),
+    Tasks.image_classification: OutputKeys.LABELS,
+    Tasks.visual_entailment: (OutputKeys.SCORES, OutputKeys.LABELS),
+}
--- a/modelscope/models/multi_modal/ofa/utils/utils.py
+++ b/modelscope/models/multi_modal/ofa/utils/utils.py
@@ -0,0 +1,19 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Optional
+
+import torch
+
+
+def expand_mask(mask: torch.Tensor,
+                dtype: torch.dtype,
+                tgt_len: Optional[int] = None):
+    r"""
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+
+    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len,
+                                                  src_len).to(dtype)
+    return expanded_mask.masked_fill(expanded_mask.bool(),
+                                     torch.finfo(dtype).min)
--- a/modelscope/models/multi_modal/ofa_for_all_tasks.py
+++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py
@@ -0,0 +1,259 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import math
+from os import path as osp
+from typing import Any, Dict
+
+import json
+import torch.cuda
+import torch.nn.functional as F
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Model, Tensor
+from modelscope.models.builder import MODELS
+from modelscope.outputs import OutputKeys
+from modelscope.preprocessors.ofa.utils.collate import collate_tokens
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile
+from modelscope.utils.trie import Trie
+from .ofa import OFAModel, OFATokenizer
+from .ofa.generate import sequence_generator as sg
+from .ofa.generate.utils import move_to_device
+from .ofa.utils.constant import OFA_TASK_KEY_MAPPING, Tasks
+from .ofa.utils.utils import expand_mask
+
+__all__ = ['OfaForAllTasks']
+
+
+@MODELS.register_module(Tasks.image_captioning, module_name=Models.ofa)
+@MODELS.register_module(Tasks.visual_grounding, module_name=Models.ofa)
+@MODELS.register_module(
+    Tasks.visual_question_answering, module_name=Models.ofa)
+@MODELS.register_module(Tasks.visual_entailment, module_name=Models.ofa)
+@MODELS.register_module(Tasks.image_classification, module_name=Models.ofa)
+@MODELS.register_module(Tasks.summarization, module_name=Models.ofa)
+@MODELS.register_module(Tasks.text_classification, module_name=Models.ofa)
+class OfaForAllTasks(Model):
+
+    def __init__(self, model_dir, *args, **kwargs):
+        super().__init__(model_dir=model_dir, *args, **kwargs)
+        model = OFAModel.from_pretrained(model_dir)
+        self.cfg = Config.from_file(
+            osp.join(model_dir, ModelFile.CONFIGURATION))
+        self.model = model.module if hasattr(model, 'module') else model
+        self.tokenizer = OFATokenizer.from_pretrained(model_dir)
+        self.tokenizer.add_tokens(['<code_{}>'.format(i) for i in range(8192)])
+        self.tokenizer.add_tokens(['<bin_{}>'.format(i) for i in range(1000)])
+        self.cfg.update({'num_bins': 1000, 'num_codes': 8192})
+        self.batch_size = self.cfg.model.get('batch_size', 1)
+        self.val_batch_size = self.cfg.model.get('valid_batch_size',
+                                                 self.batch_size)
+        self.gen_type = self.cfg.model.get('gen_type', 'generation')
+        assert self.gen_type in ['generation', 'traverse'], \
+            'model.gen_type must be in ["generation", "traverse"]'
+        self._device = torch.device('cuda') if torch.cuda.is_available() \
+            else torch.device('cpu')
+        self.eos_item = torch.LongTensor([self.tokenizer.eos_token_id
+                                          ]).to(self._device)
+        self.index2ans = {}
+        self.ans2label_dict = {}
+        self.load_ans2label()
+        # Initialize generator
+        sg_args = {
+            'tokenizer': self.tokenizer,
+            'beam_size': 5,
+            'max_len_b': 16,
+            'min_len': 1,
+            'no_repeat_ngram_size': 3,
+            'constraint_range': None
+        }
+        if hasattr(self.cfg.model, 'beam_search'):
+            sg_args.update(self.cfg.model.beam_search)
+        if len(self.ans2label_dict) > 0:
+            self.constraint_trie = Trie(self.tokenizer.eos_token_id)
+            self.val_ans_l = []
+            self.val_masks_l = []
+            self.build_trie()
+            sg_args['constraint_trie'] = self.constraint_trie
+        self.model.to(self._device)
+        self.generator = sg.SequenceGenerator(**sg_args)
+        inference_d = {
+            'generation': self._text_gen_inference,
+            'traverse': self._traverse_inference,
+        }
+        self.task_inference_mapping = {
+            Tasks.image_captioning: self._text_gen_inference,
+            Tasks.summarization: self._text_gen_inference,
+            Tasks.visual_grounding: self._visual_grounding_inference,
+            Tasks.visual_entailment: inference_d[self.gen_type],
+            Tasks.visual_question_answering: inference_d[self.gen_type],
+            Tasks.text_classification: inference_d[self.gen_type],
+            Tasks.image_classification: inference_d[self.gen_type],
+        }
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        ret = self.task_inference_mapping[self.cfg.task](input)
+        ret['samples'] = input['samples']
+        for key in [
+                OutputKeys.CAPTION, OutputKeys.TEXT, OutputKeys.BOXES,
+                OutputKeys.LABELS, OutputKeys.SCORES
+        ]:
+            if key in ret and len(ret[key]) == 1:
+                ret[key] = ret[key][0]
+            if key not in ret:
+                ret[key] = None
+        return ret
+
+    def postprocess(self, input: Dict[str, Tensor],
+                    **kwargs) -> Dict[str, Tensor]:
+        return input
+
+    def _text_gen_inference(self, input):
+        input = move_to_device(input, self._device)
+        gen_output = self.generator.generate([self.model], input)
+        gen = [gen_output[i][0]['tokens'] for i in range(len(gen_output))]
+        result = self.tokenizer.batch_decode(gen, skip_special_tokens=True)
+        # text generation tasks have no score
+        ret = {OFA_TASK_KEY_MAPPING[self.cfg.task]: result}
+        if self.cfg.task.endswith('classification'):
+            ret[OutputKeys.SCORES] = [1.0] * len(result)
+        return ret
+
+    def _visual_grounding_inference(self, input):
+        input = move_to_device(input, self._device)
+        gen_output = self.generator.generate([self.model], input)
+        tokens = [gen_output[i][0]['tokens'] for i in range(len(gen_output))]
+        region_coord_l = list()
+        for i in range(len(tokens)):
+            region_coord_l.append(tokens[i][:-1]
+                                  - len(self.tokenizer.get_vocab().items())
+                                  + self.cfg.num_bins)
+        region_tensor = torch.stack(region_coord_l, dim=0)
+        region_tensor = region_tensor / (
+            self.cfg.num_bins - 1) * self.cfg.model.get('max_image_size', 512)
+        region_tensor[:, ::2] /= input['w_resize_ratios']
+        region_tensor[:, 1::2] /= input['h_resize_ratios']
+        return {
+            OutputKeys.BOXES: move_to_device(region_tensor,
+                                             torch.device('cpu')),
+            OutputKeys.SCORES: [1.0] * region_tensor.shape[0]
+        }
+
+    def _traverse_inference(self, input):
+        input = move_to_device(input, self._device)
+        encoder_input = dict()
+        for key in input['net_input'].keys():
+            encoder_input[key] = input['net_input'][key]
+        encoder_out = self.model.encoder(**encoder_input)
+        valid_result = []
+        for val_ans, val_masks in zip(self.val_ans_l, self.val_masks_l):
+            valid_size = len(val_ans)
+            valid_tgt_items = [
+                torch.cat([
+                    torch.tensor(decoder_prompt[1:]), valid_answer,
+                    self.eos_item
+                ]) for decoder_prompt in input['decoder_prompts']
+                for valid_answer in val_ans
+            ]
+            valid_prev_items = [
+                torch.cat([torch.tensor(decoder_prompt), valid_answer])
+                for decoder_prompt in input['decoder_prompts']
+                for valid_answer in val_ans
+            ]
+            valid_constraint_mask_items = [
+                torch.cat([
+                    torch.zeros(
+                        len(decoder_prompt) - 1,
+                        valid_constraint_mask.size(1)).bool().to(self._device),
+                    valid_constraint_mask], dim=0)  # yapf: disable
+                for decoder_prompt in input['decoder_prompts']  # yapf: disable
+                for valid_constraint_mask in val_masks]  # yapf: disable
+            valid_tgt = collate_tokens(
+                valid_tgt_items,
+                pad_idx=self.tokenizer.pad_token_id).to(self._device)
+            valid_prev_output = collate_tokens(
+                valid_prev_items,
+                pad_idx=self.tokenizer.pad_token_id).to(self._device)
+            val_masks = collate_tokens(
+                valid_constraint_mask_items,
+                pad_idx=self.tokenizer.pad_token_id).to(self._device)
+            new_encoder_out = {
+                'last_hidden_state':
+                encoder_out['last_hidden_state'].repeat_interleave(
+                    valid_size, dim=0),
+                'padding_mask':
+                encoder_out['padding_mask'].repeat_interleave(
+                    valid_size, dim=0),
+                'position_embedding':
+                encoder_out['position_embedding'].repeat_interleave(
+                    valid_size, dim=0)
+            }
+            encoder_attention_mask = expand_mask(
+                new_encoder_out['padding_mask'],
+                new_encoder_out['last_hidden_state'].dtype,
+                valid_prev_output.shape[-1])
+
+            decoder_out = self.model.decoder(
+                valid_prev_output,
+                encoder_hidden_states=new_encoder_out['last_hidden_state'],
+                encoder_attention_mask=encoder_attention_mask,
+                src_pos_embed=new_encoder_out['position_embedding'])
+
+            decoder_out[0].masked_fill_(~val_masks, -math.inf)
+            lprobs = self.model.get_normalized_probs(
+                decoder_out, log_probs=True)
+            scores = lprobs.gather(
+                dim=-1, index=valid_tgt.unsqueeze(-1)).squeeze(-1)
+            scores = scores.masked_fill(
+                valid_tgt.eq(self.tokenizer.pad_token_id), 0)
+            scores = scores.masked_fill((~val_masks).all(2), 0)
+            scores = scores.sum(1)
+            scores = scores.view(-1, valid_size)
+            valid_result.append(scores)
+        valid_result = torch.cat(valid_result, dim=-1)
+        predicts = valid_result.argmax(1).tolist()
+        probs = F.softmax(valid_result, dim=-1)
+        hyps = [self.index2ans[predict_index] for predict_index in predicts]
+        scores = [
+            float(prob[idx].cpu().detach().numpy())
+            for prob, idx in zip(probs, predicts)
+        ]
+        return {OutputKeys.LABELS: hyps, OutputKeys.SCORES: scores}
+
+    def build_trie(self):
+        answer_item_list = []
+
+        for i, answer in enumerate(self.ans2label_dict.keys()):
+            answer_item = self.tokenizer(
+                ' ' + answer, return_tensors='pt',
+                add_special_tokens=False).input_ids.squeeze(0)
+            answer_item_list.append(answer_item)
+            self.index2ans[i] = answer
+            self.constraint_trie.insert([self.tokenizer.bos_token_id]
+                                        + answer_item.tolist()
+                                        + [self.tokenizer.eos_token_id])
+
+        constraint_mask_list = []
+        for answer_item in answer_item_list:
+            constraint_mask = torch.zeros(
+                (len(answer_item) + 1,
+                 len(self.tokenizer.get_vocab()))).bool()
+            for i in range(len(answer_item) + 1):
+                constraint_prefix_token = [self.tokenizer.bos_token_id
+                                           ] + answer_item[:i].tolist()
+                constraint_nodes = self.constraint_trie.get_next_layer(
+                    constraint_prefix_token)
+                constraint_mask[i][constraint_nodes] = True
+            constraint_mask_list.append(constraint_mask)
+
+        for i in range(0, len(answer_item_list), self.val_batch_size):
+            self.val_ans_l += [answer_item_list[i:i + self.val_batch_size]]
+            self.val_masks_l += [
+                constraint_mask_list[i:i + self.val_batch_size]
+            ]
+        self.val_ans_l = move_to_device(self.val_ans_l, self._device)
+        self.val_masks_l = move_to_device(self.val_masks_l, self._device)
+
+    def load_ans2label(self):
+        if self.cfg.model.get('answer2label', None):
+            filename = osp.join(self.model_dir, self.cfg.model.answer2label)
+            self.ans2label_dict = json.load(open(filename))
--- a/modelscope/models/multi_modal/ofa_for_image_captioning_model.py
+++ b/modelscope/models/multi_modal/ofa_for_image_captioning_model.py
@@ -1,53 +0,0 @@
-from typing import Any, Dict
-
-import torch.cuda
-
-from modelscope.metainfo import Models
-from modelscope.models.base import Model
-from modelscope.models.builder import MODELS
-from modelscope.outputs import OutputKeys
-from modelscope.utils.constant import Tasks
-from .ofa import OFAModel, OFATokenizer
-from .ofa.generate import sequence_generator as sg
-from .ofa.generate.utils import move_to_device
-
-__all__ = ['OfaForImageCaptioning']
-
-
-@MODELS.register_module(Tasks.image_captioning, module_name=Models.ofa)
-class OfaForImageCaptioning(Model):
-
-    def __init__(self, model_dir, *args, **kwargs):
-        super().__init__(model_dir=model_dir, *args, **kwargs)
-        model = OFAModel.from_pretrained(model_dir)
-
-        self.model = model.module if hasattr(model, 'module') else model
-        self.tokenizer = OFATokenizer.from_pretrained(model_dir)
-        self.tokenizer.add_tokens(['<code_{}>'.format(i) for i in range(8192)])
-        self.tokenizer.add_tokens(['<bin_{}>'.format(i) for i in range(1000)])
-        self._device = torch.device('cuda') if torch.cuda.is_available() \
-            else torch.device('cpu')
-        self.model.to(self._device)
-        # Initialize generator
-        sg_args = {
-            'tokenizer': self.tokenizer,
-            'beam_size': 5,
-            'max_len_b': 16,
-            'min_len': 1,
-            'no_repeat_ngram_size': 3,
-            'constraint_range': None
-        }
-        if hasattr(kwargs, 'beam_search'):
-            sg_args.update(kwargs['beam_search'])
-        self.generator = sg.SequenceGenerator(**sg_args)
-
-    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
-        input = move_to_device(input, self._device)
-        gen_output = self.generator.generate([self.model], input)
-        gen = [gen_output[i][0]['tokens'] for i in range(len(gen_output))]
-        result = self.tokenizer.batch_decode(gen, skip_special_tokens=True)
-        return {'image_id': '42', OutputKeys.CAPTION: result[0]}
-
-    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
-        # What should we do here ?
-        return inputs
--- a/modelscope/pipelines/cv/init.py
+++ b/modelscope/pipelines/cv/init.py
@@ -24,6 +24,7 @@ if TYPE_CHECKING:
    from .ocr_detection_pipeline import OCRDetectionPipeline
    from .video_category_pipeline import VideoCategoryPipeline
    from .virtual_tryon_pipeline import VirtualTryonPipeline
+    from .image_classification_pipeline import ImageClassificationPipeline
 else:
    _import_structure = {
        'action_recognition_pipeline': ['ActionRecognitionPipeline'],
@@ -33,7 +34,7 @@ else:
        'face_image_generation_pipeline': ['FaceImageGenerationPipeline'],
        'face_recognition_pipeline': ['FaceRecognitionPipeline'],
        'image_classification_pipeline':
-        ['GeneralImageClassificationPipeline'],
+        ['GeneralImageClassificationPipeline', 'ImageClassificationPipeline'],
        'image_cartoon_pipeline': ['ImageCartoonPipeline'],
        'image_denoise_pipeline': ['ImageDenoisePipeline'],
        'image_color_enhance_pipeline': ['ImageColorEnhancePipeline'],
--- a/modelscope/pipelines/cv/image_classification_pipeline.py
+++ b/modelscope/pipelines/cv/image_classification_pipeline.py
@@ -1,4 +1,5 @@
-from typing import Any, Dict
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict, Union

 import cv2
 import numpy as np
@@ -7,16 +8,41 @@ import torch

 from modelscope.metainfo import Pipelines
 from modelscope.outputs import OutputKeys
-from modelscope.pipelines.base import Input
-from modelscope.preprocessors import load_image
+from modelscope.pipelines.base import Input, Model, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import OfaPreprocessor, Preprocessor, load_image
 from modelscope.utils.constant import Tasks
 from modelscope.utils.logger import get_logger
-from ..base import Pipeline
-from ..builder import PIPELINES

 logger = get_logger()


+@PIPELINES.register_module(
+    Tasks.image_classification, module_name=Pipelines.image_classification)
+class ImageClassificationPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: [Preprocessor] = None,
+                 **kwargs):
+        super().__init__(model=model)
+        assert isinstance(model, str) or isinstance(model, Model), \
+            'model must be a single str or OfaForAllTasks'
+        if isinstance(model, str):
+            pipe_model = Model.from_pretrained(model)
+        elif isinstance(model, Model):
+            pipe_model = model
+        else:
+            raise NotImplementedError
+        pipe_model.model.eval()
+        if preprocessor is None and pipe_model:
+            preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
+        super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
+
+
@PIPELINES.register_module(
    Tasks.image_classification_imagenet,
    module_name=Pipelines.general_image_classification)
@@ -27,7 +53,7 @@ class GeneralImageClassificationPipeline(Pipeline):

    def __init__(self, model: str, **kwargs):
        """
-        use `model` and `preprocessor` to create a kws pipeline for prediction
+        use `model` and `preprocessor` to create a image classification pipeline for prediction
        Args:
            model: model id on modelscope hub.
        """
--- a/modelscope/pipelines/multi_modal/init.py
+++ b/modelscope/pipelines/multi_modal/init.py
@@ -5,7 +5,9 @@ from modelscope.utils.import_utils import LazyImportModule

 if TYPE_CHECKING:
    from .generative_multi_modal_embedding_pipeline import GEMMMultiModalEmbeddingPipeline
-    from .image_captioning_pipeline import ImageCaptionPipeline
+    from .image_captioning_pipeline import ImageCaptioningPipeline
+    from .visual_entailment_pipeline import VisualEntailmentPipeline
+    from .visual_grounding_pipeline import VisualGroundingPipeline
    from .multi_modal_embedding_pipeline import MultiModalEmbeddingPipeline
    from .text_to_image_synthesis_pipeline import TextToImageSynthesisPipeline
    from .video_multi_modal_embedding_pipeline import \
@@ -14,7 +16,9 @@ if TYPE_CHECKING:

 else:
    _import_structure = {
-        'image_captioning_pipeline': ['ImageCaptionPipeline'],
+        'image_captioning_pipeline': ['ImageCaptioningPipeline'],
+        'visual_entailment_pipeline': ['VisualEntailmentPipeline'],
+        'visual_grounding_pipeline': ['VisualGroundingPipeline'],
        'multi_modal_embedding_pipeline': ['MultiModalEmbeddingPipeline'],
        'text_to_image_synthesis_pipeline': ['TextToImageSynthesisPipeline'],
        'visual_question_answering_pipeline':
--- a/modelscope/pipelines/multi_modal/image_captioning_pipeline.py
+++ b/modelscope/pipelines/multi_modal/image_captioning_pipeline.py
@@ -1,9 +1,10 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from typing import Any, Dict, Optional, Union

 from modelscope.metainfo import Pipelines
 from modelscope.pipelines.base import Model, Pipeline
 from modelscope.pipelines.builder import PIPELINES
-from modelscope.preprocessors import OfaImageCaptionPreprocessor, Preprocessor
+from modelscope.preprocessors import OfaPreprocessor, Preprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.logger import get_logger

@@ -12,28 +13,29 @@ logger = get_logger()

@PIPELINES.register_module(
    Tasks.image_captioning, module_name=Pipelines.image_captioning)
-class ImageCaptionPipeline(Pipeline):
+class ImageCaptioningPipeline(Pipeline):

    def __init__(self,
                 model: Union[Model, str],
                 preprocessor: Optional[Preprocessor] = None,
                 **kwargs):
        """
-        use `model` and `preprocessor` to create a kws pipeline for prediction
+        use `model` and `preprocessor` to create a image captioning pipeline for prediction
        Args:
            model: model id on modelscope hub.
        """
        super().__init__(model=model)
        assert isinstance(model, str) or isinstance(model, Model), \
-            'model must be a single str or OfaForImageCaptioning'
+            'model must be a single str or OfaForAllTasks'
        if isinstance(model, str):
            pipe_model = Model.from_pretrained(model)
        elif isinstance(model, Model):
            pipe_model = model
        else:
            raise NotImplementedError
+        pipe_model.model.eval()
        if preprocessor is None and pipe_model:
-            preprocessor = OfaImageCaptionPreprocessor(model_dir=model)
+            preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
        super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
--- a/modelscope/pipelines/multi_modal/visual_entailment_pipeline.py
+++ b/modelscope/pipelines/multi_modal/visual_entailment_pipeline.py
@@ -0,0 +1,42 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict, Union
+
+from modelscope.metainfo import Pipelines
+from modelscope.pipelines.base import Model, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import OfaPreprocessor, Preprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.visual_entailment, module_name=Pipelines.visual_entailment)
+class VisualEntailmentPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: [Preprocessor] = None,
+                 **kwargs):
+        """
+        use `model` and `preprocessor` to create a visual entailment pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model)
+        assert isinstance(model, str) or isinstance(model, Model), \
+            'model must be a single str or OfaForAllTasks'
+        if isinstance(model, str):
+            pipe_model = Model.from_pretrained(model)
+        elif isinstance(model, Model):
+            pipe_model = model
+        else:
+            raise NotImplementedError
+        pipe_model.model.eval()
+        if preprocessor is None and pipe_model:
+            preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
+        super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
--- a/modelscope/pipelines/multi_modal/visual_grounding_pipeline.py
+++ b/modelscope/pipelines/multi_modal/visual_grounding_pipeline.py
@@ -0,0 +1,42 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict, Union
+
+from modelscope.metainfo import Pipelines
+from modelscope.pipelines.base import Model, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import OfaPreprocessor, Preprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.visual_grounding, module_name=Pipelines.visual_grounding)
+class VisualGroundingPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: [Preprocessor] = None,
+                 **kwargs):
+        """
+        use `model` and `preprocessor` to create a visual grounding pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model)
+        assert isinstance(model, str) or isinstance(model, Model), \
+            'model must be a single str or OfaForAllTasks'
+        if isinstance(model, str):
+            pipe_model = Model.from_pretrained(model)
+        elif isinstance(model, Model):
+            pipe_model = model
+        else:
+            raise NotImplementedError
+        pipe_model.model.eval()
+        if preprocessor is None and pipe_model:
+            preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
+        super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
--- a/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py
+++ b/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 from typing import Any, Dict, Optional, Union

 import torch
@@ -30,15 +31,18 @@ class VisualQuestionAnsweringPipeline(Pipeline):
            model (MPlugForVisualQuestionAnswering): a model instance
            preprocessor (MPlugVisualQuestionAnsweringPreprocessor): a preprocessor instance
        """
-        model = model if isinstance(
-            model,
-            MPlugForVisualQuestionAnswering) else Model.from_pretrained(model)
+        model = model if isinstance(model,
+                                    Model) else Model.from_pretrained(model)
+        self.tokenizer = None
        if preprocessor is None:
            preprocessor = MPlugVisualQuestionAnsweringPreprocessor(
                model.model_dir)
-        model.eval()
+        if isinstance(model, MPlugForVisualQuestionAnswering):
+            model.eval()
+            self.tokenizer = model.tokenizer
+        else:
+            model.model.eval()
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
-        self.tokenizer = model.tokenizer

    def forward(self, inputs: Dict[str, Any],
                **forward_params) -> Dict[str, Any]:
@@ -55,6 +59,8 @@ class VisualQuestionAnsweringPipeline(Pipeline):
        Returns:
            Dict[str, str]: the prediction results
        """
+        if self.tokenizer is None:
+            return inputs
        replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''),
                               ('[unused1]', ''), (r' +', ' '), ('[SEP]', ''),
                               ('[unused2]', ''), ('[CLS]', ''), ('[UNK]', ''))
--- a/modelscope/pipelines/nlp/init.py
+++ b/modelscope/pipelines/nlp/init.py
@@ -17,6 +17,8 @@ if TYPE_CHECKING:
    from .translation_pipeline import TranslationPipeline
    from .word_segmentation_pipeline import WordSegmentationPipeline
    from .zero_shot_classification_pipeline import ZeroShotClassificationPipeline
+    from .summarization_pipeline import SummarizationPipeline
+    from .text_classification_pipeline import TextClassificationPipeline
    from .text_error_correction_pipeline import TextErrorCorrectionPipeline

 else:
@@ -38,6 +40,8 @@ else:
        'named_entity_recognition_pipeline':
        ['NamedEntityRecognitionPipeline'],
        'translation_pipeline': ['TranslationPipeline'],
+        'summarization_pipeline': ['SummarizationPipeline'],
+        'text_classification_pipeline': ['TextClassificationPipeline'],
        'text_error_correction_pipeline': ['TextErrorCorrectionPipeline']
    }

--- a/modelscope/pipelines/nlp/summarization_pipeline.py
+++ b/modelscope/pipelines/nlp/summarization_pipeline.py
@@ -0,0 +1,42 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict, Union
+
+from modelscope.metainfo import Pipelines
+from modelscope.pipelines.base import Model, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import OfaPreprocessor, Preprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.summarization, module_name=Pipelines.text_generation)
+class SummarizationPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: [Preprocessor] = None,
+                 **kwargs):
+        """
+        use `model` and `preprocessor` to create a kws pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model)
+        assert isinstance(model, str) or isinstance(model, Model), \
+            'model must be a single str or OfaForAllTasks'
+        if isinstance(model, str):
+            pipe_model = Model.from_pretrained(model)
+        elif isinstance(model, Model):
+            pipe_model = model
+        else:
+            raise NotImplementedError
+        pipe_model.model.eval()
+        if preprocessor is None and pipe_model:
+            preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
+        super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
--- a/modelscope/pipelines/nlp/text_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/text_classification_pipeline.py
@@ -0,0 +1,42 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict, Union
+
+from modelscope.metainfo import Pipelines
+from modelscope.pipelines.base import Model, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import OfaPreprocessor, Preprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.text_classification, module_name=Pipelines.text_classification)
+class TextClassificationPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: [Preprocessor] = None,
+                 **kwargs):
+        """
+        use `model` and `preprocessor` to create a kws pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model)
+        assert isinstance(model, str) or isinstance(model, Model), \
+            'model must be a single str or OfaForAllTasks'
+        if isinstance(model, str):
+            pipe_model = Model.from_pretrained(model)
+        elif isinstance(model, Model):
+            pipe_model = model
+        else:
+            raise NotImplementedError
+        pipe_model.model.eval()
+        if preprocessor is None and pipe_model:
+            preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
+        super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
--- a/modelscope/preprocessors/init.py
+++ b/modelscope/preprocessors/init.py
@@ -14,7 +14,7 @@ if TYPE_CHECKING:
                        ImageInstanceSegmentationPreprocessor,
                        ImageDenoisePreprocessor)
    from .kws import WavToLists
-    from .multi_modal import (OfaImageCaptionPreprocessor,
+    from .multi_modal import (OfaPreprocessor,
                              MPlugVisualQuestionAnsweringPreprocessor)
    from .nlp import (Tokenize, SequenceClassificationPreprocessor,
                      TextGenerationPreprocessor,
@@ -41,10 +41,8 @@ else:
            'ImageInstanceSegmentationPreprocessor', 'ImageDenoisePreprocessor'
        ],
        'kws': ['WavToLists'],
-        'multi_modal': [
-            'OfaImageCaptionPreprocessor',
-            'MPlugVisualQuestionAnsweringPreprocessor'
-        ],
+        'multi_modal':
+        ['OfaPreprocessor', 'MPlugVisualQuestionAnsweringPreprocessor'],
        'nlp': [
            'Tokenize', 'SequenceClassificationPreprocessor',
            'TextGenerationPreprocessor', 'TokenClassificationPreprocessor',
--- a/modelscope/preprocessors/multi_modal.py
+++ b/modelscope/preprocessors/multi_modal.py
@@ -4,26 +4,25 @@ from typing import Any, Dict, Union

 import torch
 from PIL import Image
-from torchvision import transforms

 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.metainfo import Preprocessors
-from modelscope.models.multi_modal.ofa import OFATokenizer
-from modelscope.utils.constant import Fields
-from modelscope.utils.type_assert import type_assert
+from modelscope.utils.config import Config
+from modelscope.utils.constant import Fields, ModelFile, Tasks
 from .base import Preprocessor
 from .builder import PREPROCESSORS
-from .image import load_image
+from .ofa import *  # noqa
+from .ofa.utils.collate import collate_fn

 __all__ = [
-    'OfaImageCaptionPreprocessor',
+    'OfaPreprocessor',
    'MPlugVisualQuestionAnsweringPreprocessor',
 ]


@PREPROCESSORS.register_module(
    Fields.multi_modal, module_name=Preprocessors.ofa_image_caption)
-class OfaImageCaptionPreprocessor(Preprocessor):
+class OfaPreprocessor(Preprocessor):

    def __init__(self, model_dir: str, *args, **kwargs):
        """preprocess the data via the vocab.txt from the `model_dir` path
@@ -32,41 +31,28 @@ class OfaImageCaptionPreprocessor(Preprocessor):
            model_dir (str): model path
        """
        super().__init__(*args, **kwargs)
+        preprocess_mapping = {
+            Tasks.image_captioning: OfaImageCaptioningPreprocessor,
+            Tasks.visual_grounding: OfaVisualGroundingPreprocessor,
+            Tasks.visual_question_answering:
+            OfaVisualQuestionAnsweringPreprocessor,
+            Tasks.visual_entailment: OfaVisualEntailmentPreprocessor,
+            Tasks.image_classification: OfaImageClassificationPreprocessor,
+            Tasks.text_classification: OfaTextClassificationPreprocessor,
+            Tasks.summarization: OfaSummarizationPreprocessor
+        }
        model_dir = model_dir if osp.exists(model_dir) else snapshot_download(
            model_dir)
-        self.tokenizer = OFATokenizer.from_pretrained(model_dir)
-        self.tokenizer.add_tokens(['<code_{}>'.format(i) for i in range(8192)])
-        self.tokenizer.add_tokens(['<bin_{}>'.format(i) for i in range(1000)])
+        cfg = Config.from_file(osp.join(model_dir, ModelFile.CONFIGURATION))
+        self.preprocess = preprocess_mapping[cfg.task](cfg, model_dir)
+        self.tokenizer = self.preprocess.tokenizer

-        # Initialize transform
-        mean = [0.5, 0.5, 0.5]
-        std = [0.5, 0.5, 0.5]
-        patch_image_size = 480
-        self.patch_resize_transform = transforms.Compose([
-            lambda image: image.convert('RGB'),
-            transforms.Resize((patch_image_size, patch_image_size),
-                              interpolation=Image.BICUBIC),
-            transforms.ToTensor(),
-            transforms.Normalize(mean=mean, std=std),
-        ])
-
-    @type_assert(object, (str, tuple, Image.Image))
-    def __call__(self, data: Union[str, tuple]) -> Dict[str, Any]:
-        if isinstance(data, Image.Image):
-            patch_image = self.patch_resize_transform(data).unsqueeze(0)
-        else:
-            patch_image = self.patch_resize_transform(
-                load_image(data)).unsqueeze(0)
-        text = ' what does the image describe?'
-        inputs = self.tokenizer([text], max_length=1024,
-                                return_tensors='pt')['input_ids']
-        sample = dict()
-        sample['net_input'] = {
-            'input_ids': inputs,
-            'patch_images': patch_image,
-            'patch_masks': torch.tensor([True])
-        }
-        return sample
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        sample = self.preprocess(data)
+        sample['sample'] = data
+        return collate_fn([sample],
+                          pad_idx=self.tokenizer.pad_token_id,
+                          eos_idx=self.tokenizer.eos_token_id)


@PREPROCESSORS.register_module(
--- a/modelscope/preprocessors/ofa/init.py
+++ b/modelscope/preprocessors/ofa/init.py
@@ -0,0 +1,8 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from .image_captioning import OfaImageCaptioningPreprocessor
+from .image_classification import OfaImageClassificationPreprocessor
+from .summarization import OfaSummarizationPreprocessor
+from .text_classification import OfaTextClassificationPreprocessor
+from .visual_entailment import OfaVisualEntailmentPreprocessor
+from .visual_grounding import OfaVisualGroundingPreprocessor
+from .visual_question_answering import OfaVisualQuestionAnsweringPreprocessor
--- a/modelscope/preprocessors/ofa/base.py
+++ b/modelscope/preprocessors/ofa/base.py
@@ -0,0 +1,117 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import re
+from os import path as osp
+
+import json
+import numpy as np
+import torch
+
+from modelscope.models.multi_modal.ofa import OFATokenizer
+from modelscope.utils.trie import Trie
+from .utils.random_help import set_torch_seed
+
+
+class OfaBasePreprocessor:
+
+    def __init__(self, cfg, model_dir):
+        """preprocess the data via the vocab.txt from the `model_dir` path
+
+        Args:
+            cfg(modelscope.utils.config.ConfigDict) : model config
+            model_dir (str): model path
+        """
+        self.cfg = cfg
+        tokenizer = OFATokenizer.from_pretrained(model_dir)
+        tokenizer.add_tokens(['<code_{}>'.format(i) for i in range(8192)])
+        tokenizer.add_tokens(['<bin_{}>'.format(i) for i in range(1000)])
+        self.tokenizer = tokenizer
+        self.bos_item = torch.LongTensor([tokenizer.bos_token_id])
+        self.pad_item = torch.LongTensor([tokenizer.pad_token_id])
+        self.eos_item = torch.LongTensor([tokenizer.eos_token_id])
+        self.tgt_dict = self.src_dict = {
+            value: key
+            for key, value in tokenizer.get_vocab().items()
+        }
+        self.max_src_length = cfg.model.get('max_src_length', 256)
+        self.max_image_size = cfg.model.get('max_image_size', 512)
+        self.language = self.cfg.model.get('language', 'en')
+        self.prompt_type = self.cfg.model.get('prompt_type', 'none')
+        seed = self.cfg.model.get('seed', 7)
+        np.random.seed(seed)
+        set_torch_seed(seed)
+        imagenet_default_mean_and_std = self.cfg.model.get(
+            'imagenet_default_mean_and_std', False)
+        if imagenet_default_mean_and_std:
+            self.mean = [0.485, 0.456, 0.406]
+            self.std = [0.229, 0.224, 0.225]
+        else:
+            self.mean = [0.5, 0.5, 0.5]
+            self.std = [0.5, 0.5, 0.5]
+        self.patch_image_size = self.cfg.model.get('patch_image_size', 480)
+        self.constraint_trie = None
+        self.index2ans = {}
+        if self.cfg.model.get('answer2label', False):
+            ans2label_file = osp.join(model_dir, self.cfg.model.answer2label)
+            ans2label_dict = json.load(open(ans2label_file, 'r'))
+            self.constraint_trie = Trie(tokenizer.eos_token_id)
+            for i, answer in enumerate(ans2label_dict.keys()):
+                answer_item = tokenizer(
+                    ' ' + answer,
+                    return_tensors='pt',
+                    add_special_tokens=False).input_ids.squeeze(0)
+                self.constraint_trie.insert([tokenizer.bos_token_id]
+                                            + answer_item.tolist()
+                                            + [tokenizer.eos_token_id])
+
+    def get_inputs(self, text, add_bos=True, add_eos=True):
+        inputs = self.tokenizer(
+            text,
+            max_length=self.max_src_length,
+            add_special_tokens=False,
+            return_tensors='pt')['input_ids'].squeeze(0)
+        if add_bos:
+            inputs = torch.cat([self.bos_item, inputs])
+        if add_eos:
+            inputs = torch.cat([inputs, self.eos_item])
+        return inputs
+
+    @staticmethod
+    def pre_caption(caption, max_words=None):
+        caption = caption.lower().lstrip(',.!?*#:;~').replace('-', ' ')\
+            .replace('/', ' ').replace('<person>', 'person')
+
+        caption = re.sub(
+            r'\s{2,}',
+            ' ',
+            caption,
+        )
+        caption = caption.rstrip('\n')
+        caption = caption.strip(' ')
+
+        # truncate caption
+        caption_words = caption.split(' ')
+        if max_words is not None and len(caption_words) > max_words:
+            caption = ' '.join(caption_words[:max_words])
+
+        return caption
+
+    @staticmethod
+    def pre_question(question, max_ques_words):
+        question = question.lower().lstrip(',.!?*#:;~').replace('-',
+                                                                ' ').replace(
+                                                                    '/', ' ')
+
+        question = re.sub(
+            r'\s{2,}',
+            ' ',
+            question,
+        )
+        question = question.rstrip('\n')
+        question = question.strip(' ')
+
+        # truncate question
+        question_words = question.split(' ')
+        if len(question_words) > max_ques_words:
+            question = ' '.join(question_words[:max_ques_words])
+
+        return question
--- a/modelscope/preprocessors/ofa/image_captioning.py
+++ b/modelscope/preprocessors/ofa/image_captioning.py
@@ -0,0 +1,42 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict, Union
+
+import torch
+from PIL import Image
+from torchvision import transforms
+
+from modelscope.preprocessors.image import load_image
+from .base import OfaBasePreprocessor
+
+
+class OfaImageCaptioningPreprocessor(OfaBasePreprocessor):
+
+    def __init__(self, cfg, model_dir):
+        """preprocess the data via the vocab.txt from the `model_dir` path
+
+        Args:
+            cfg(modelscope.utils.config.ConfigDict) : model config
+            model_dir (str): model path
+        """
+        super(OfaImageCaptioningPreprocessor, self).__init__(cfg, model_dir)
+        # Initialize transform
+        self.patch_resize_transform = transforms.Compose([
+            lambda image: image.convert('RGB'),
+            transforms.Resize((self.patch_image_size, self.patch_image_size),
+                              interpolation=Image.BICUBIC),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=self.mean, std=self.std),
+        ])
+
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        image = data['image'] if isinstance(
+            data['image'], Image.Image) else load_image(data['image'])
+        patch_image = self.patch_resize_transform(image)
+        prompt = self.cfg.model.get('prompt', ' what does the image describe?')
+        inputs = self.get_inputs(prompt)
+        sample = {
+            'source': inputs,
+            'patch_image': patch_image,
+            'patch_mask': torch.tensor([True])
+        }
+        return sample
--- a/modelscope/preprocessors/ofa/image_classification.py
+++ b/modelscope/preprocessors/ofa/image_classification.py
@@ -0,0 +1,43 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict
+
+import torch
+from PIL import Image
+from torchvision import transforms
+
+from modelscope.preprocessors.image import load_image
+from .base import OfaBasePreprocessor
+
+
+class OfaImageClassificationPreprocessor(OfaBasePreprocessor):
+
+    def __init__(self, cfg, model_dir):
+        """preprocess the data via the vocab.txt from the `model_dir` path
+
+        Args:
+            cfg(modelscope.utils.config.ConfigDict) : model config
+            model_dir (str): model path
+        """
+        super(OfaImageClassificationPreprocessor,
+              self).__init__(cfg, model_dir)
+        # Initialize transform
+        self.patch_resize_transform = transforms.Compose([
+            lambda image: image.convert('RGB'),
+            transforms.Resize((self.patch_image_size, self.patch_image_size),
+                              interpolation=Image.BICUBIC),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=self.mean, std=self.std),
+        ])
+
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        image = data['image'] if isinstance(
+            data['image'], Image.Image) else load_image(data['image'])
+        patch_image = self.patch_resize_transform(image)
+        prompt = self.cfg.model.get('prompt', ' what does the image describe?')
+        inputs = self.get_inputs(prompt)
+        sample = {
+            'source': inputs,
+            'patch_image': patch_image,
+            'patch_mask': torch.tensor([True])
+        }
+        return sample
--- a/modelscope/preprocessors/ofa/summarization.py
+++ b/modelscope/preprocessors/ofa/summarization.py
@@ -0,0 +1,37 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict
+
+from .base import OfaBasePreprocessor
+
+
+class OfaSummarizationPreprocessor(OfaBasePreprocessor):
+
+    def __init__(self, cfg, model_dir):
+        """preprocess the data via the vocab.txt from the `model_dir` path
+
+        Args:
+            cfg(modelscope.utils.config.ConfigDict) : model config
+            model_dir (str): model path
+        """
+        super(OfaSummarizationPreprocessor, self).__init__(cfg, model_dir)
+
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        source = super().pre_caption(
+            data['text'], max_words=self.max_src_length)
+        source = source.strip()[:self.max_src_length]
+        source = source.replace('[unk]', 'unk').replace('<unk>', 'unk')
+        prompt = self.cfg.model.get(
+            'prompt', ' " {} " Summarize the article with a title: ')
+        text = prompt.format(source)
+        inputs = self.get_inputs(text)
+        if self.prompt_type == 'none':
+            decoder_prompt = self.bos_item
+        elif self.prompt_type == 'prev_output':
+            decoder_prompt = inputs[:-1]
+        else:
+            raise NotImplementedError
+        sample = {
+            'source': inputs,
+            'decoder_prompt': decoder_prompt,
+        }
+        return sample
--- a/modelscope/preprocessors/ofa/text_classification.py
+++ b/modelscope/preprocessors/ofa/text_classification.py
@@ -0,0 +1,38 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict
+
+from .base import OfaBasePreprocessor
+
+
+class OfaTextClassificationPreprocessor(OfaBasePreprocessor):
+
+    def __init__(self, cfg, model_dir):
+        """preprocess the data via the vocab.txt from the `model_dir` path
+
+        Args:
+            cfg(modelscope.utils.config.ConfigDict) : model config
+            model_dir (str): model path
+        """
+        super(OfaTextClassificationPreprocessor, self).__init__(cfg, model_dir)
+
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        text1 = ' '.join(
+            data['text'].lower().strip().split()[:self.max_src_length])
+        text2 = ' '.join(
+            data['text2'].lower().strip().split()[:self.max_src_length])
+        prompt = ' can text1 " {} " imply text2 " {} "?'
+        text = prompt.format(text1, text2)
+        inputs = self.get_inputs(text)
+        if self.prompt_type == 'none':
+            decoder_prompt = self.bos_item
+        elif self.prompt_type == 'src':
+            decoder_prompt = inputs
+        elif self.prompt_type == 'prev_output':
+            decoder_prompt = inputs[:-1]
+        else:
+            raise NotImplementedError
+        sample = {
+            'source': inputs,
+            'decoder_prompt': decoder_prompt,
+        }
+        return sample
--- a/modelscope/preprocessors/ofa/utils/init.py
+++ b/modelscope/preprocessors/ofa/utils/init.py
--- a/modelscope/preprocessors/ofa/utils/collate.py
+++ b/modelscope/preprocessors/ofa/utils/collate.py
@@ -0,0 +1,109 @@
+import numpy as np
+import torch
+
+
+def collate_fn(samples, pad_idx, eos_idx):
+    if len(samples) == 0:
+        return {}
+
+    def merge(key):
+        return collate_tokens([s[key] for s in samples],
+                              pad_idx,
+                              eos_idx=eos_idx)
+
+    src_tokens = merge('source')
+
+    batch = {
+        'nsentences': len(samples),
+        'net_input': {
+            'input_ids': src_tokens,
+        },
+    }
+    if samples[0].get('id', None) is not None:
+        batch['id'] = np.array([s.get['id'] for s in samples])
+    if samples[0].get('target', None) is not None:
+        batch['target'] = merge('target')
+        tgt_lengths = torch.LongTensor(
+            [s['target'].ne(pad_idx).long().sum() for s in samples])
+        ntokens = tgt_lengths.sum().item()
+        batch['ntokens'] = ntokens
+    if samples[0].get('prev_output_tokens', None) is not None:
+        batch['net_input']['decoder_input_ids'] = merge('prev_output_tokens')
+    if samples[0].get('patch_image', None) is not None:
+        batch['net_input']['patch_images'] = torch.stack(
+            [sample['patch_image'] for sample in samples], dim=0)
+    if samples[0].get('patch_mask', None) is not None:
+        batch['net_input']['patch_masks'] = torch.cat(
+            [sample['patch_mask'] for sample in samples])
+    # image generation
+    if samples[0].get('code_mask', None) is not None:
+        batch['net_input']['code_masks'] = torch.cat(
+            [sample['code_mask'] for sample in samples])
+    if samples[0].get('code_image', None) is not None:
+        batch['code_images'] = torch.cat(
+            [sample['code_image'] for sample in samples])
+    # For classification tasks (i.e., VQA, SNLI-VE, GLUE)
+    if samples[0].get('conf', None) is not None:
+        batch['conf'] = torch.cat([s['conf'] for s in samples], dim=0)
+    if samples[0].get('ref_dict', None) is not None:
+        batch['ref_dict'] = np.array([s['ref_dict'] for s in samples])
+    if samples[0].get('constraint_mask', None) is not None:
+        batch['constraint_masks'] = merge('constraint_mask')
+    if samples[0].get('decoder_prompt', None) is not None:
+        batch['decoder_prompts'] = np.array(
+            [s['decoder_prompt'].tolist() for s in samples])
+    # For detection and visual grounding
+    if samples[0].get('w_resize_ratio', None) is not None:
+        batch['w_resize_ratios'] = torch.stack(
+            [s['w_resize_ratio'] for s in samples], dim=0)
+    if samples[0].get('h_resize_ratio', None) is not None:
+        batch['h_resize_ratios'] = torch.stack(
+            [s['h_resize_ratio'] for s in samples], dim=0)
+    if samples[0].get('region_coord', None) is not None:
+        batch['region_coords'] = torch.stack(
+            [s['region_coord'] for s in samples], dim=0)
+    if samples[0].get('sample', None) is not None:
+        batch['samples'] = [s['sample'] for s in samples]
+    return batch
+
+
+def collate_tokens(
+    values,
+    pad_idx,
+    eos_idx=None,
+    left_pad=False,
+    move_eos_to_beginning=False,
+    pad_to_length=None,
+    pad_to_multiple=1,
+    pad_to_bsz=None,
+):
+    """Convert a list of 1d tensors into a padded 2d tensor."""
+    size = max(v.size(0) for v in values)
+    size = size if pad_to_length is None else max(size, pad_to_length)
+    if pad_to_multiple != 1 and size % pad_to_multiple != 0:
+        size = int(((size - 0.1) // pad_to_multiple + 1) * pad_to_multiple)
+
+    def copy_tensor(src, dst):
+        assert dst.numel() == src.numel()
+        if move_eos_to_beginning:
+            if eos_idx is None:
+                # if no eos_idx is specified, then use the last token in src
+                dst[0] = src[-1]
+            else:
+                dst[0] = eos_idx
+            dst[1:] = src[:-1]
+        else:
+            dst.copy_(src)
+
+    if values[0].dim() == 1:
+        res = values[0].new(len(values), size).fill_(pad_idx)
+    elif values[0].dim() == 2:
+        assert move_eos_to_beginning is False
+        res = values[0].new(len(values), size,
+                            values[0].size(1)).fill_(pad_idx)
+    else:
+        raise NotImplementedError
+
+    for i, v in enumerate(values):
+        copy_tensor(v, res[i][size - len(v):] if left_pad else res[i][:len(v)])
+    return res
--- a/modelscope/preprocessors/ofa/utils/random_help.py
+++ b/modelscope/preprocessors/ofa/utils/random_help.py
@@ -0,0 +1,42 @@
+import torch
+
+try:
+    import torch_xla.core.xla_model as xm
+except ImportError:
+    xm = None
+
+
+def get_rng_state():
+    state = {'torch_rng_state': torch.get_rng_state()}
+    if xm is not None:
+        state['xla_rng_state'] = xm.get_rng_state()
+    if torch.cuda.is_available():
+        state['cuda_rng_state'] = torch.cuda.get_rng_state()
+    return state
+
+
+def set_rng_state(state):
+    torch.set_rng_state(state['torch_rng_state'])
+    if xm is not None:
+        xm.set_rng_state(state['xla_rng_state'])
+    if torch.cuda.is_available():
+        torch.cuda.set_rng_state(state['cuda_rng_state'])
+
+
+class set_torch_seed(object):
+
+    def __init__(self, seed):
+        assert isinstance(seed, int)
+        self.rng_state = get_rng_state()
+
+        torch.manual_seed(seed)
+        if xm is not None:
+            xm.set_rng_state(seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed(seed)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *exc):
+        set_rng_state(self.rng_state)
--- a/modelscope/preprocessors/ofa/utils/transforms.py
+++ b/modelscope/preprocessors/ofa/utils/transforms.py
@@ -0,0 +1,557 @@
+# Copyright 2022 The OFA-Sys Team.
+# All rights reserved.
+# This source code is licensed under the Apache 2.0 license
+# found in the LICENSE file in the root directory.
+
+import random
+
+import numpy as np
+import torch
+import torchvision.transforms as T
+import torchvision.transforms.functional as F
+from PIL import Image
+
+
+def crop(image, target, region, delete=True):
+    cropped_image = F.crop(image, *region)
+
+    target = target.copy()
+    i, j, h, w = region
+
+    # should we do something wrt the original size?
+    target['size'] = torch.tensor([h, w])
+
+    fields = ['labels', 'area']
+
+    if 'boxes' in target:
+        boxes = target['boxes']
+        max_size = torch.as_tensor([w, h], dtype=torch.float32)
+        cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
+        cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
+        cropped_boxes = cropped_boxes.clamp(min=0)
+        area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
+        target['boxes'] = cropped_boxes.reshape(-1, 4)
+        target['area'] = area
+        fields.append('boxes')
+
+    if 'polygons' in target:
+        polygons = target['polygons']
+        num_polygons = polygons.shape[0]
+        max_size = torch.as_tensor([w, h], dtype=torch.float32)
+        start_coord = torch.cat([
+            torch.tensor([j, i], dtype=torch.float32)
+            for _ in range(polygons.shape[1] // 2)], dim=0)  # yapf: disable#
+        cropped_boxes = polygons - start_coord
+        cropped_boxes = torch.min(
+            cropped_boxes.reshape(num_polygons, -1, 2), max_size)
+        cropped_boxes = cropped_boxes.clamp(min=0)
+        target['polygons'] = cropped_boxes.reshape(num_polygons, -1)
+        fields.append('polygons')
+
+    if 'masks' in target:
+        # FIXME should we update the area here if there are no boxes?
+        target['masks'] = target['masks'][:, i:i + h, j:j + w]
+        fields.append('masks')
+
+    # remove elements for which the boxes or masks that have zero area
+    if delete and ('boxes' in target or 'masks' in target):
+        # favor boxes selection when defining which elements to keep
+        # this is compatible with previous implementation
+        if 'boxes' in target:
+            cropped_boxes = target['boxes'].reshape(-1, 2, 2)
+            keep = torch.all(
+                cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
+        else:
+            keep = target['masks'].flatten(1).any(1)
+
+        for field in fields:
+            target[field] = target[field][keep.tolist()]
+
+    return cropped_image, target
+
+
+def hflip(image, target):
+    flipped_image = F.hflip(image)
+    w, h = image.size
+    target = target.copy()
+    if 'boxes' in target:
+        boxes = target['boxes']
+        boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor(
+            [-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])
+        target['boxes'] = boxes
+
+    if 'polygons' in target:
+        polygons = target['polygons']
+        num_polygons = polygons.shape[0]
+        polygons = polygons.reshape(num_polygons, -1, 2) * torch.as_tensor(
+            [-1, 1]) + torch.as_tensor([w, 0])
+        target['polygons'] = polygons
+
+    if 'masks' in target:
+        target['masks'] = target['masks'].flip(-1)
+
+    return flipped_image, target
+
+
+def resize(image, target, size, max_size=None):
+    # size can be min_size (scalar) or (w, h) tuple
+
+    def get_size_with_aspect_ratio(image_size, size, max_size=None):
+        w, h = image_size
+
+        if (w <= h and w == size) or (h <= w and h == size):
+            if max_size is not None:
+                max_size = int(max_size)
+                h = min(h, max_size)
+                w = min(w, max_size)
+            return (h, w)
+
+        if w < h:
+            ow = size
+            oh = int(size * h / w)
+        else:
+            oh = size
+            ow = int(size * w / h)
+
+        if max_size is not None:
+            max_size = int(max_size)
+            oh = min(oh, max_size)
+            ow = min(ow, max_size)
+
+        return (oh, ow)
+
+    def get_size(image_size, size, max_size=None):
+        if isinstance(size, (list, tuple)):
+            return size[::-1]
+        else:
+            return get_size_with_aspect_ratio(image_size, size, max_size)
+
+    size = get_size(image.size, size, max_size)
+    rescaled_image = F.resize(image, size, interpolation=Image.BICUBIC)
+
+    if target is None:
+        return rescaled_image
+
+    ratios = tuple(
+        float(s) / float(s_orig)
+        for s, s_orig in zip(rescaled_image.size, image.size))
+    ratio_width, ratio_height = ratios
+
+    target = target.copy()
+    if 'boxes' in target:
+        boxes = target['boxes']
+        scaled_boxes = boxes * torch.as_tensor(
+            [ratio_width, ratio_height, ratio_width, ratio_height])
+        target['boxes'] = scaled_boxes
+
+    if 'polygons' in target:
+        polygons = target['polygons']
+        scaled_ratio = torch.cat([
+            torch.tensor([ratio_width, ratio_height])
+            for _ in range(polygons.shape[1] // 2)], dim=0)  # yapf: disable
+        scaled_polygons = polygons * scaled_ratio
+        target['polygons'] = scaled_polygons
+
+    if 'area' in target:
+        area = target['area']
+        scaled_area = area * (ratio_width * ratio_height)
+        target['area'] = scaled_area
+
+    h, w = size
+    target['size'] = torch.tensor([h, w])
+
+    if 'masks' in target:
+        assert False
+
+    return rescaled_image, target
+
+
+class CenterCrop(object):
+
+    def __init__(self, size):
+        self.size = size
+
+    def __call__(self, img, target):
+        image_width, image_height = img.size
+        crop_height, crop_width = self.size
+        crop_top = int(round((image_height - crop_height) / 2.))
+        crop_left = int(round((image_width - crop_width) / 2.))
+        return crop(img, target,
+                    (crop_top, crop_left, crop_height, crop_width))
+
+
+class ObjectCenterCrop(object):
+
+    def __init__(self, size):
+        self.size = size
+
+    def __call__(self, img, target):
+        image_width, image_height = img.size
+        crop_height, crop_width = self.size
+
+        x0 = float(target['boxes'][0][0])
+        y0 = float(target['boxes'][0][1])
+        x1 = float(target['boxes'][0][2])
+        y1 = float(target['boxes'][0][3])
+
+        center_x = (x0 + x1) / 2
+        center_y = (y0 + y1) / 2
+        crop_left = max(
+            center_x - crop_width / 2
+            + min(image_width - center_x - crop_width / 2, 0), 0)
+        crop_top = max(
+            center_y - crop_height / 2
+            + min(image_height - center_y - crop_height / 2, 0), 0)
+
+        return crop(
+            img,
+            target, (crop_top, crop_left, crop_height, crop_width),
+            delete=False)
+
+
+class RandomHorizontalFlip(object):
+
+    def __init__(self, p=0.5):
+        self.p = p
+
+    def __call__(self, img, target):
+        if random.random() < self.p:
+            return hflip(img, target)
+        return img, target
+
+
+class RandomResize(object):
+
+    def __init__(self, sizes, max_size=None, equal=False):
+        assert isinstance(sizes, (list, tuple))
+        self.sizes = sizes
+        self.max_size = max_size
+        self.equal = equal
+
+    def __call__(self, img, target=None):
+        size = random.choice(self.sizes)
+        if self.equal:
+            return resize(img, target, size, size)
+        else:
+            return resize(img, target, size, self.max_size)
+
+
+class ToTensor(object):
+
+    def __call__(self, img, target):
+        return F.to_tensor(img), target
+
+
+class Normalize(object):
+
+    def __init__(self, mean, std, max_image_size=512):
+        self.mean = mean
+        self.std = std
+        self.max_image_size = max_image_size
+
+    def __call__(self, image, target=None):
+        image = F.normalize(image, mean=self.mean, std=self.std)
+        if target is None:
+            return image, None
+        target = target.copy()
+        # h, w = image.shape[-2:]
+        h, w = target['size'][0], target['size'][1]
+        if 'boxes' in target:
+            boxes = target['boxes']
+            boxes = boxes / self.max_image_size
+            target['boxes'] = boxes
+        if 'polygons' in target:
+            polygons = target['polygons']
+            scale = torch.cat([
+                torch.tensor([w, h], dtype=torch.float32)
+                for _ in range(polygons.shape[1] // 2)], dim=0)  # yapf: disable
+            polygons = polygons / scale
+            target['polygons'] = polygons
+        return image, target
+
+
+class Compose(object):
+
+    def __init__(self, transforms):
+        self.transforms = transforms
+
+    def __call__(self, image, target):
+        for t in self.transforms:
+            image, target = t(image, target)
+        return image, target
+
+    def __repr__(self):
+        format_string = self.__class__.__name__ + '('
+        for t in self.transforms:
+            format_string += '\n'
+            format_string += '    {0}'.format(t)
+        format_string += '\n)'
+        return format_string
+
+
+class LargeScaleJitter(object):
+    """
+        implementation of large scale jitter from copy_paste
+    """
+
+    def __init__(self, output_size=512, aug_scale_min=0.3, aug_scale_max=2.0):
+        self.desired_size = torch.tensor([output_size])
+        self.aug_scale_min = aug_scale_min
+        self.aug_scale_max = aug_scale_max
+
+    def rescale_target(self, scaled_size, image_size, target):
+        # compute rescaled targets
+        image_scale = scaled_size / image_size
+        ratio_height, ratio_width = image_scale
+
+        target = target.copy()
+        target['size'] = scaled_size
+
+        if 'boxes' in target:
+            boxes = target['boxes']
+            scaled_boxes = boxes * torch.as_tensor(
+                [ratio_width, ratio_height, ratio_width, ratio_height])
+            target['boxes'] = scaled_boxes
+
+        if 'area' in target:
+            area = target['area']
+            scaled_area = area * (ratio_width * ratio_height)
+            target['area'] = scaled_area
+
+        if 'masks' in target:
+            assert False
+            masks = target['masks']
+            # masks = interpolate(
+            #     masks[:, None].float(), scaled_size, mode="nearest")[:, 0] > 0.5
+            target['masks'] = masks
+        return target
+
+    def crop_target(self, region, target):
+        i, j, h, w = region
+        fields = ['labels', 'area']
+
+        target = target.copy()
+        target['size'] = torch.tensor([h, w])
+
+        if 'boxes' in target:
+            boxes = target['boxes']
+            max_size = torch.as_tensor([w, h], dtype=torch.float32)
+            cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
+            cropped_boxes = torch.min(
+                cropped_boxes.reshape(-1, 2, 2), max_size)
+            cropped_boxes = cropped_boxes.clamp(min=0)
+            area = (cropped_boxes[:, 1, :]
+                    - cropped_boxes[:, 0, :]).prod(dim=1)
+            target['boxes'] = cropped_boxes.reshape(-1, 4)
+            target['area'] = area
+            fields.append('boxes')
+
+        if 'masks' in target:
+            # FIXME should we update the area here if there are no boxes?
+            target['masks'] = target['masks'][:, i:i + h, j:j + w]
+            fields.append('masks')
+
+        # remove elements for which the boxes or masks that have zero area
+        if 'boxes' in target or 'masks' in target:
+            # favor boxes selection when defining which elements to keep
+            # this is compatible with previous implementation
+            if 'boxes' in target:
+                cropped_boxes = target['boxes'].reshape(-1, 2, 2)
+                keep = torch.all(
+                    cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
+            else:
+                keep = target['masks'].flatten(1).any(1)
+
+            for field in fields:
+                target[field] = target[field][keep.tolist()]
+        return target
+
+    def pad_target(self, padding, target):
+        target = target.copy()
+        if 'masks' in target:
+            target['masks'] = torch.nn.functional.pad(
+                target['masks'], (0, padding[1], 0, padding[0]))
+        return target
+
+    def __call__(self, image, target=None):
+        image_size = image.size
+        image_size = torch.tensor(image_size[::-1])
+
+        random_scale = torch.rand(1) * (
+            self.aug_scale_max - self.aug_scale_min) + self.aug_scale_min
+        scaled_size = (random_scale * self.desired_size).round()
+
+        scale = torch.maximum(scaled_size / image_size[0],
+                              scaled_size / image_size[1])
+        scaled_size = (image_size * scale).round().int()
+
+        scaled_image = F.resize(
+            image, scaled_size.tolist(), interpolation=Image.BICUBIC)
+
+        if target is not None:
+            target = self.rescale_target(scaled_size, image_size, target)
+
+        # randomly crop or pad images
+        if random_scale >= 1:
+            # Selects non-zero random offset (x, y) if scaled image is larger than desired_size.
+            max_offset = scaled_size - self.desired_size
+            offset = (max_offset * torch.rand(2)).floor().int()
+            region = (offset[0].item(), offset[1].item(),
+                      self.desired_size[0].item(), self.desired_size[0].item())
+            output_image = F.crop(scaled_image, *region)
+            if target is not None:
+                target = self.crop_target(region, target)
+        else:
+            assert False
+            padding = self.desired_size - scaled_size
+            output_image = F.pad(scaled_image,
+                                 [0, 0, padding[1].item(), padding[0].item()])
+            if target is not None:
+                target = self.pad_target(padding, target)
+
+        return output_image, target
+
+
+class OriginLargeScaleJitter(object):
+    """
+        implementation of large scale jitter from copy_paste
+    """
+
+    def __init__(self, output_size=512, aug_scale_min=0.3, aug_scale_max=2.0):
+        self.desired_size = torch.tensor(output_size)
+        self.aug_scale_min = aug_scale_min
+        self.aug_scale_max = aug_scale_max
+
+    def rescale_target(self, scaled_size, image_size, target):
+        # compute rescaled targets
+        image_scale = scaled_size / image_size
+        ratio_height, ratio_width = image_scale
+
+        target = target.copy()
+        target['size'] = scaled_size
+
+        if 'boxes' in target:
+            boxes = target['boxes']
+            scaled_boxes = boxes * torch.as_tensor(
+                [ratio_width, ratio_height, ratio_width, ratio_height])
+            target['boxes'] = scaled_boxes
+
+        if 'area' in target:
+            area = target['area']
+            scaled_area = area * (ratio_width * ratio_height)
+            target['area'] = scaled_area
+
+        if 'masks' in target:
+            assert False
+            masks = target['masks']
+            # masks = interpolate(
+            #     masks[:, None].float(), scaled_size, mode="nearest")[:, 0] > 0.5
+            target['masks'] = masks
+        return target
+
+    def crop_target(self, region, target):
+        i, j, h, w = region
+        fields = ['labels', 'area']
+
+        target = target.copy()
+        target['size'] = torch.tensor([h, w])
+
+        if 'boxes' in target:
+            boxes = target['boxes']
+            max_size = torch.as_tensor([w, h], dtype=torch.float32)
+            cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
+            cropped_boxes = torch.min(
+                cropped_boxes.reshape(-1, 2, 2), max_size)
+            cropped_boxes = cropped_boxes.clamp(min=0)
+            area = (cropped_boxes[:, 1, :]
+                    - cropped_boxes[:, 0, :]).prod(dim=1)
+            target['boxes'] = cropped_boxes.reshape(-1, 4)
+            target['area'] = area
+            fields.append('boxes')
+
+        if 'masks' in target:
+            # FIXME should we update the area here if there are no boxes?
+            target['masks'] = target['masks'][:, i:i + h, j:j + w]
+            fields.append('masks')
+
+        # remove elements for which the boxes or masks that have zero area
+        if 'boxes' in target or 'masks' in target:
+            # favor boxes selection when defining which elements to keep
+            # this is compatible with previous implementation
+            if 'boxes' in target:
+                cropped_boxes = target['boxes'].reshape(-1, 2, 2)
+                keep = torch.all(
+                    cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
+            else:
+                keep = target['masks'].flatten(1).any(1)
+
+            for field in fields:
+                target[field] = target[field][keep.tolist()]
+        return target
+
+    def pad_target(self, padding, target):
+        target = target.copy()
+        if 'masks' in target:
+            target['masks'] = torch.nn.functional.pad(
+                target['masks'], (0, padding[1], 0, padding[0]))
+        return target
+
+    def __call__(self, image, target=None):
+        image_size = image.size
+        image_size = torch.tensor(image_size[::-1])
+
+        out_desired_size = (self.desired_size * image_size
+                            / max(image_size)).round().int()
+
+        random_scale = torch.rand(1) * (
+            self.aug_scale_max - self.aug_scale_min) + self.aug_scale_min
+        scaled_size = (random_scale * self.desired_size).round()
+
+        scale = torch.minimum(scaled_size / image_size[0],
+                              scaled_size / image_size[1])
+        scaled_size = (image_size * scale).round().int()
+
+        scaled_image = F.resize(image, scaled_size.tolist())
+
+        if target is not None:
+            target = self.rescale_target(scaled_size, image_size, target)
+
+        # randomly crop or pad images
+        if random_scale > 1:
+            # Selects non-zero random offset (x, y) if scaled image is larger than desired_size.
+            max_offset = scaled_size - out_desired_size
+            offset = (max_offset * torch.rand(2)).floor().int()
+            region = (offset[0].item(), offset[1].item(),
+                      out_desired_size[0].item(), out_desired_size[1].item())
+            output_image = F.crop(scaled_image, *region)
+            if target is not None:
+                target = self.crop_target(region, target)
+        else:
+            padding = out_desired_size - scaled_size
+            output_image = F.pad(scaled_image,
+                                 [0, 0, padding[1].item(), padding[0].item()])
+            if target is not None:
+                target = self.pad_target(padding, target)
+
+        return output_image, target
+
+
+class RandomDistortion(object):
+    """
+    Distort image w.r.t hue, saturation and exposure.
+    """
+
+    def __init__(self,
+                 brightness=0,
+                 contrast=0,
+                 saturation=0,
+                 hue=0,
+                 prob=0.5):
+        self.prob = prob
+        self.tfm = T.ColorJitter(brightness, contrast, saturation, hue)
+
+    def __call__(self, img, target=None):
+        if np.random.random() < self.prob:
+            return self.tfm(img), target
+        else:
+            return img, target
--- a/modelscope/preprocessors/ofa/utils/vision_helper.py
+++ b/modelscope/preprocessors/ofa/utils/vision_helper.py
@@ -0,0 +1,357 @@
+# Copyright 2022 The OFA-Sys Team.
+# All rights reserved.
+# This source code is licensed under the Apache 2.0 license
+# found in the LICENSE file in the root directory.
+
+import cv2
+import numpy as np
+
+
+def identity_func(img):
+    return img
+
+
+def autocontrast_func(img, cutoff=0):
+    '''
+        same output as PIL.ImageOps.autocontrast
+    '''
+    n_bins = 256
+
+    def tune_channel(ch):
+        n = ch.size
+        cut = cutoff * n // 100
+        if cut == 0:
+            high, low = ch.max(), ch.min()
+        else:
+            hist = cv2.calcHist([ch], [0], None, [n_bins], [0, n_bins])
+            low = np.argwhere(np.cumsum(hist) > cut)
+            low = 0 if low.shape[0] == 0 else low[0]
+            high = np.argwhere(np.cumsum(hist[::-1]) > cut)
+            high = n_bins - 1 if high.shape[0] == 0 else n_bins - 1 - high[0]
+        if high <= low:
+            table = np.arange(n_bins)
+        else:
+            scale = (n_bins - 1) / (high - low)
+            offset = -low * scale
+            table = np.arange(n_bins) * scale + offset
+            table[table < 0] = 0
+            table[table > n_bins - 1] = n_bins - 1
+        table = table.clip(0, 255).astype(np.uint8)
+        return table[ch]
+
+    channels = [tune_channel(ch) for ch in cv2.split(img)]
+    out = cv2.merge(channels)
+    return out
+
+
+def equalize_func(img):
+    '''
+        same output as PIL.ImageOps.equalize
+        PIL's implementation is different from cv2.equalize
+    '''
+    n_bins = 256
+
+    def tune_channel(ch):
+        hist = cv2.calcHist([ch], [0], None, [n_bins], [0, n_bins])
+        non_zero_hist = hist[hist != 0].reshape(-1)
+        step = np.sum(non_zero_hist[:-1]) // (n_bins - 1)
+        if step == 0:
+            return ch
+        n = np.empty_like(hist)
+        n[0] = step // 2
+        n[1:] = hist[:-1]
+        table = (np.cumsum(n) // step).clip(0, 255).astype(np.uint8)
+        return table[ch]
+
+    channels = [tune_channel(ch) for ch in cv2.split(img)]
+    out = cv2.merge(channels)
+    return out
+
+
+def rotate_func(img, degree, fill=(0, 0, 0)):
+    '''
+    like PIL, rotate by degree, not radians
+    '''
+    H, W = img.shape[0], img.shape[1]
+    center = W / 2, H / 2
+    M = cv2.getRotationMatrix2D(center, degree, 1)
+    out = cv2.warpAffine(img, M, (W, H), borderValue=fill)
+    return out
+
+
+def solarize_func(img, thresh=128):
+    '''
+        same output as PIL.ImageOps.posterize
+    '''
+    table = np.array([el if el < thresh else 255 - el for el in range(256)])
+    table = table.clip(0, 255).astype(np.uint8)
+    out = table[img]
+    return out
+
+
+def color_func(img, factor):
+    # same output as PIL.ImageEnhance.Color
+    M = (
+        np.float32([[0.886, -0.114, -0.114], [-0.587, 0.413, -0.587],
+                    [-0.299, -0.299, 0.701]]) * factor
+        + np.float32([[0.114], [0.587], [0.299]]))
+    out = np.matmul(img, M).clip(0, 255).astype(np.uint8)
+    return out
+
+
+def contrast_func(img, factor):
+    """
+        same output as PIL.ImageEnhance.Contrast
+    """
+    mean = np.sum(np.mean(img, axis=(0, 1)) * np.array([0.114, 0.587, 0.299]))
+    table = np.array([(el - mean) * factor + mean
+                      for el in range(256)]).clip(0, 255).astype(np.uint8)
+    out = table[img]
+    return out
+
+
+def brightness_func(img, factor):
+    '''
+        same output as PIL.ImageEnhance.Contrast
+    '''
+    table = (np.arange(256, dtype=np.float32) * factor).clip(0, 255).astype(
+        np.uint8)
+    out = table[img]
+    return out
+
+
+def sharpness_func(img, factor):
+    '''
+    The differences the this result and PIL are all on the 4 boundaries, the center
+    areas are same
+    '''
+    kernel = np.ones((3, 3), dtype=np.float32)
+    kernel[1][1] = 5
+    kernel /= 13
+    degenerate = cv2.filter2D(img, -1, kernel)
+    if factor == 0.0:
+        out = degenerate
+    elif factor == 1.0:
+        out = img
+    else:
+        out = img.astype(np.float32)
+        degenerate = degenerate.astype(np.float32)[1:-1, 1:-1, :]
+        out[1:-1, 1:-1, :] = degenerate + factor * (
+            out[1:-1, 1:-1, :] - degenerate)
+        out = out.astype(np.uint8)
+    return out
+
+
+def shear_x_func(img, factor, fill=(0, 0, 0)):
+    H, W = img.shape[0], img.shape[1]
+    M = np.float32([[1, factor, 0], [0, 1, 0]])
+    out = cv2.warpAffine(
+        img, M, (W, H), borderValue=fill,
+        flags=cv2.INTER_LINEAR).astype(np.uint8)
+    return out
+
+
+def translate_x_func(img, offset, fill=(0, 0, 0)):
+    '''
+        same output as PIL.Image.transform
+    '''
+    H, W = img.shape[0], img.shape[1]
+    M = np.float32([[1, 0, -offset], [0, 1, 0]])
+    out = cv2.warpAffine(
+        img, M, (W, H), borderValue=fill,
+        flags=cv2.INTER_LINEAR).astype(np.uint8)
+    return out
+
+
+def translate_y_func(img, offset, fill=(0, 0, 0)):
+    '''
+        same output as PIL.Image.transform
+    '''
+    H, W = img.shape[0], img.shape[1]
+    M = np.float32([[1, 0, 0], [0, 1, -offset]])
+    out = cv2.warpAffine(
+        img, M, (W, H), borderValue=fill,
+        flags=cv2.INTER_LINEAR).astype(np.uint8)
+    return out
+
+
+def posterize_func(img, bits):
+    '''
+        same output as PIL.ImageOps.posterize
+    '''
+    out = np.bitwise_and(img, np.uint8(255 << (8 - bits)))
+    return out
+
+
+def shear_y_func(img, factor, fill=(0, 0, 0)):
+    H, W = img.shape[0], img.shape[1]
+    M = np.float32([[1, 0, 0], [factor, 1, 0]])
+    out = cv2.warpAffine(
+        img, M, (W, H), borderValue=fill,
+        flags=cv2.INTER_LINEAR).astype(np.uint8)
+    return out
+
+
+def cutout_func(img, pad_size, replace=(0, 0, 0)):
+    replace = np.array(replace, dtype=np.uint8)
+    H, W = img.shape[0], img.shape[1]
+    rh, rw = np.random.random(2)
+    pad_size = pad_size // 2
+    ch, cw = int(rh * H), int(rw * W)
+    x1, x2 = max(ch - pad_size, 0), min(ch + pad_size, H)
+    y1, y2 = max(cw - pad_size, 0), min(cw + pad_size, W)
+    out = img.copy()
+    out[x1:x2, y1:y2, :] = replace
+    return out
+
+
+# level to args
+def enhance_level_to_args(MAX_LEVEL):
+
+    def level_to_args(level):
+        return ((level / MAX_LEVEL) * 1.8 + 0.1, )
+
+    return level_to_args
+
+
+def shear_level_to_args(MAX_LEVEL, replace_value):
+
+    def level_to_args(level):
+        level = (level / MAX_LEVEL) * 0.3
+        if np.random.random() > 0.5:
+            level = -level
+        return level, replace_value
+
+    return level_to_args
+
+
+def translate_level_to_args(translate_const, MAX_LEVEL, replace_value):
+
+    def level_to_args(level):
+        level = (level / MAX_LEVEL) * float(translate_const)
+        if np.random.random() > 0.5:
+            level = -level
+        return (level, replace_value)
+
+    return level_to_args
+
+
+def cutout_level_to_args(cutout_const, MAX_LEVEL, replace_value):
+
+    def level_to_args(level):
+        level = int((level / MAX_LEVEL) * cutout_const)
+        return (level, replace_value)
+
+    return level_to_args
+
+
+def solarize_level_to_args(MAX_LEVEL):
+
+    def level_to_args(level):
+        level = int((level / MAX_LEVEL) * 256)
+        return (level, )
+
+    return level_to_args
+
+
+def none_level_to_args(level):
+    return ()
+
+
+def posterize_level_to_args(MAX_LEVEL):
+
+    def level_to_args(level):
+        level = int((level / MAX_LEVEL) * 4)
+        return (level, )
+
+    return level_to_args
+
+
+def rotate_level_to_args(MAX_LEVEL, replace_value):
+
+    def level_to_args(level):
+        level = (level / MAX_LEVEL) * 30
+        if np.random.random() < 0.5:
+            level = -level
+        return (level, replace_value)
+
+    return level_to_args
+
+
+func_dict = {
+    'Identity': identity_func,
+    'AutoContrast': autocontrast_func,
+    'Equalize': equalize_func,
+    'Rotate': rotate_func,
+    'Solarize': solarize_func,
+    'Color': color_func,
+    'Contrast': contrast_func,
+    'Brightness': brightness_func,
+    'Sharpness': sharpness_func,
+    'ShearX': shear_x_func,
+    'TranslateX': translate_x_func,
+    'TranslateY': translate_y_func,
+    'Posterize': posterize_func,
+    'ShearY': shear_y_func,
+}
+
+translate_const = 10
+MAX_LEVEL = 10
+replace_value = (128, 128, 128)
+arg_dict = {
+    'Identity':
+    none_level_to_args,
+    'AutoContrast':
+    none_level_to_args,
+    'Equalize':
+    none_level_to_args,
+    'Rotate':
+    rotate_level_to_args(MAX_LEVEL, replace_value),
+    'Solarize':
+    solarize_level_to_args(MAX_LEVEL),
+    'Color':
+    enhance_level_to_args(MAX_LEVEL),
+    'Contrast':
+    enhance_level_to_args(MAX_LEVEL),
+    'Brightness':
+    enhance_level_to_args(MAX_LEVEL),
+    'Sharpness':
+    enhance_level_to_args(MAX_LEVEL),
+    'ShearX':
+    shear_level_to_args(MAX_LEVEL, replace_value),
+    'TranslateX':
+    translate_level_to_args(translate_const, MAX_LEVEL, replace_value),
+    'TranslateY':
+    translate_level_to_args(translate_const, MAX_LEVEL, replace_value),
+    'Posterize':
+    posterize_level_to_args(MAX_LEVEL),
+    'ShearY':
+    shear_level_to_args(MAX_LEVEL, replace_value),
+}
+
+
+class RandomAugment(object):
+
+    def __init__(self, N=2, M=10, isPIL=False, augs=[]):
+        self.N = N
+        self.M = M
+        self.isPIL = isPIL
+        if augs:
+            self.augs = augs
+        else:
+            self.augs = list(arg_dict.keys())
+
+    def get_random_ops(self):
+        sampled_ops = np.random.choice(self.augs, self.N)
+        return [(op, 0.5, self.M) for op in sampled_ops]
+
+    def __call__(self, img):
+        if self.isPIL:
+            img = np.array(img)
+        ops = self.get_random_ops()
+        for name, prob, level in ops:
+            if np.random.random() > prob:
+                continue
+            args = arg_dict[name](level)
+            img = func_dict[name](img, *args)
+        return img
--- a/modelscope/preprocessors/ofa/visual_entailment.py
+++ b/modelscope/preprocessors/ofa/visual_entailment.py
@@ -0,0 +1,62 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict
+
+import torch
+from PIL import Image
+from torchvision import transforms
+
+from modelscope.preprocessors.image import load_image
+from .base import OfaBasePreprocessor
+
+
+class OfaVisualEntailmentPreprocessor(OfaBasePreprocessor):
+
+    def __init__(self, cfg, model_dir):
+        """preprocess the data via the vocab.txt from the `model_dir` path
+
+        Args:
+            cfg(modelscope.utils.config.ConfigDict) : model config
+            model_dir (str): model path
+        """
+        super(OfaVisualEntailmentPreprocessor, self).__init__(cfg, model_dir)
+        # Initialize transform
+        self.patch_resize_transform = transforms.Compose([
+            lambda image: image.convert('RGB'),
+            transforms.Resize((self.patch_image_size, self.patch_image_size),
+                              interpolation=Image.BICUBIC),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=self.mean, std=self.std),
+        ])
+
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        image = data['image'] if isinstance(
+            data['image'], Image.Image) else load_image(data['image'])
+        patch_image = self.patch_resize_transform(image)
+        if 'text2' not in data:
+            hypothesis = self.pre_caption(data['text'], self.max_src_length)
+            prompt = self.cfg.model.get('prompt',
+                                        ' does the image describe " {} "?')
+            text = prompt.format(hypothesis)
+        else:
+            assert 'text' in data, f'text must be in the input {data.keys()}'
+            caption = self.pre_caption(data['text2'], self.max_src_length)
+            hypothesis = self.pre_caption(data['text'], self.max_src_length)
+            prompt = self.cfg.model.get(
+                'prompt', ' can image and text1 " {} " imply text2 " {} "?')
+            text = prompt.format(caption, hypothesis)
+        inputs = self.get_inputs(text)
+        if self.prompt_type == 'none':
+            decoder_prompt = self.bos_item
+        elif self.prompt_type == 'src':
+            decoder_prompt = inputs
+        elif self.prompt_type == 'prev_output':
+            decoder_prompt = inputs[:-1]
+        else:
+            raise NotImplementedError
+        sample = {
+            'source': inputs,
+            'patch_image': patch_image,
+            'patch_mask': torch.tensor([True]),
+            'decoder_prompt': decoder_prompt,
+        }
+        return sample
--- a/modelscope/preprocessors/ofa/visual_grounding.py
+++ b/modelscope/preprocessors/ofa/visual_grounding.py
@@ -0,0 +1,50 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict
+
+import torch
+from PIL import Image
+from torchvision import transforms
+
+from modelscope.preprocessors.image import load_image
+from .base import OfaBasePreprocessor
+
+
+class OfaVisualGroundingPreprocessor(OfaBasePreprocessor):
+
+    def __init__(self, cfg, model_dir):
+        """preprocess the data via the vocab.txt from the `model_dir` path
+
+        Args:
+            cfg(modelscope.utils.config.ConfigDict) : model config
+            model_dir (str): model path
+        """
+        super(OfaVisualGroundingPreprocessor, self).__init__(cfg, model_dir)
+        # Initialize transform
+        self.patch_resize_transform = transforms.Compose([
+            lambda image: image.convert('RGB'),
+            transforms.Resize((self.patch_image_size, self.patch_image_size),
+                              interpolation=Image.BICUBIC),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=self.mean, std=self.std),
+        ])
+
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        image = data['image'] if isinstance(
+            data['image'], Image.Image) else load_image(data['image'])
+        w, h = image.size
+        patch_image = self.patch_resize_transform(image)
+        w_resize_ratio = torch.tensor(self.patch_image_size / w)
+        h_resize_ratio = torch.tensor(self.patch_image_size / h)
+        src_caption = self.pre_caption(data['text'], self.max_src_length)
+        prompt = self.cfg.model.get(
+            'prompt', ' which region does the text " {} " describe?')
+        text = prompt.format(src_caption)
+        src_item = self.get_inputs(text)
+        sample = {
+            'source': src_item,
+            'patch_image': patch_image,
+            'patch_mask': torch.tensor([True]),
+            'w_resize_ratio': w_resize_ratio,
+            'h_resize_ratio': h_resize_ratio,
+        }
+        return sample
--- a/modelscope/preprocessors/ofa/visual_question_answering.py
+++ b/modelscope/preprocessors/ofa/visual_question_answering.py
@@ -0,0 +1,52 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict
+
+import torch
+from PIL import Image
+from torchvision import transforms
+
+from modelscope.preprocessors.image import load_image
+from .base import OfaBasePreprocessor
+
+
+class OfaVisualQuestionAnsweringPreprocessor(OfaBasePreprocessor):
+
+    def __init__(self, cfg, model_dir):
+        """preprocess the data via the vocab.txt from the `model_dir` path
+
+        Args:
+            cfg(modelscope.utils.config.ConfigDict) : model config
+            model_dir (str): model path
+        """
+        super(OfaVisualQuestionAnsweringPreprocessor,
+              self).__init__(cfg, model_dir)
+        # Initialize transform
+        self.patch_resize_transform = transforms.Compose([
+            lambda image: image.convert('RGB'),
+            transforms.Resize((self.patch_image_size, self.patch_image_size),
+                              interpolation=Image.BICUBIC),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=self.mean, std=self.std),
+        ])
+
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        image = data['image'] if isinstance(
+            data['image'], Image.Image) else load_image(data['image'])
+        patch_image = self.patch_resize_transform(image)
+        text = ' {}'.format(data['text'])
+        inputs = self.get_inputs(text)
+        if self.prompt_type == 'none':
+            decoder_prompt = self.bos_item
+        elif self.prompt_type == 'src':
+            decoder_prompt = inputs
+        elif self.prompt_type == 'prev_output':
+            decoder_prompt = inputs[:-1]
+        else:
+            raise NotImplementedError
+        sample = {
+            'source': inputs,
+            'patch_image': patch_image,
+            'patch_mask': torch.tensor([True]),
+            'decoder_prompt': decoder_prompt,
+        }
+        return sample
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -85,6 +85,7 @@ class MultiModalTasks(object):
    multi_modal_embedding = 'multi-modal-embedding'
    generative_multi_modal_embedding = 'generative-multi-modal-embedding'
    visual_question_answering = 'visual-question-answering'
+    visual_entailment = 'visual-entailment'
    video_multi_modal_embedding = 'video-multi-modal-embedding'


--- a/modelscope/utils/trie.py
+++ b/modelscope/utils/trie.py
@@ -0,0 +1,29 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from collections import defaultdict
+
+
+class TreeNode:
+
+    def __init__(self):
+        self.child = defaultdict(TreeNode)
+
+
+class Trie:
+
+    def __init__(self, eos):
+        self.root = TreeNode()
+        self.eos = eos
+
+    def insert(self, word):
+        cur = self.root
+        for c in word:
+            cur = cur.child[c]
+
+    def get_next_layer(self, word):
+        cur = self.root
+        for c in word:
+            cur = cur.child.get(c)
+            if cur is None:
+                return [self.eos]
+        return list(cur.child.keys())
--- a/tests/pipelines/test_image_captioning.py
+++ b/tests/pipelines/test_image_captioning.py
@@ -1,23 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-import unittest
-
-from modelscope.outputs import OutputKeys
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-from modelscope.utils.test_utils import test_level
-
-
-class ImageCaptionTest(unittest.TestCase):
-
-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
-    def test_run(self):
-        img_captioning = pipeline(
-            Tasks.image_captioning,
-            model='damo/ofa_image-caption_coco_distilled_en')
-        result = img_captioning('data/test/images/image_captioning.png')
-        print(result[OutputKeys.CAPTION])
-
-
-if __name__ == '__main__':
-    unittest.main()
--- a/tests/pipelines/test_ofa_tasks.py
+++ b/tests/pipelines/test_ofa_tasks.py
@@ -0,0 +1,179 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class OfaTasksTest(unittest.TestCase):
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_image_captioning_with_model(self):
+        model = Model.from_pretrained(
+            'damo/ofa_image-caption_coco_distilled_en')
+        img_captioning = pipeline(
+            task=Tasks.image_captioning,
+            model=model,
+        )
+        result = img_captioning(
+            {'image': 'data/test/images/image_captioning.png'})
+        print(result[OutputKeys.CAPTION])
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_image_captioning_with_name(self):
+        img_captioning = pipeline(
+            Tasks.image_captioning,
+            model='damo/ofa_image-caption_coco_distilled_en')
+        result = img_captioning(
+            {'image': 'data/test/images/image_captioning.png'})
+        print(result[OutputKeys.CAPTION])
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_image_classification_with_model(self):
+        model = Model.from_pretrained(
+            'damo/ofa_image-classification_imagenet_large_en')
+        ofa_pipe = pipeline(Tasks.image_classification, model=model)
+        image = 'data/test/images/image_classification.png'
+        input = {'image': image}
+        result = ofa_pipe(input)
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_image_classification_with_name(self):
+        ofa_pipe = pipeline(
+            Tasks.image_classification,
+            model='damo/ofa_image-classification_imagenet_large_en')
+        image = 'data/test/images/image_classification.png'
+        input = {'image': image}
+        result = ofa_pipe(input)
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_summarization_with_model(self):
+        model = Model.from_pretrained(
+            'damo/ofa_summarization_gigaword_large_en')
+        ofa_pipe = pipeline(Tasks.summarization, model=model)
+        text = 'five-time world champion michelle kwan withdrew' + \
+               'from the #### us figure skating championships on wednesday ,' + \
+               ' but will petition us skating officials for the chance to ' + \
+               'compete at the #### turin olympics .'
+        input = {'text': text}
+        result = ofa_pipe(input)
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_summarization_with_name(self):
+        ofa_pipe = pipeline(
+            Tasks.summarization,
+            model='damo/ofa_summarization_gigaword_large_en')
+        text = 'five-time world champion michelle kwan withdrew' + \
+               'from the #### us figure skating championships on wednesday ,' + \
+               ' but will petition us skating officials for the chance to ' +\
+               'compete at the #### turin olympics .'
+        input = {'text': text}
+        result = ofa_pipe(input)
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_text_classification_with_model(self):
+        model = Model.from_pretrained(
+            'damo/ofa_text-classification_mnli_large_en')
+        ofa_pipe = pipeline(Tasks.text_classification, model=model)
+        text = 'One of our number will carry out your instructions minutely.'
+        text2 = 'A member of my team will execute your orders with immense precision.'
+        input = {'text': text, 'text2': text2}
+        result = ofa_pipe(input)
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_text_classification_with_name(self):
+        ofa_pipe = pipeline(
+            Tasks.text_classification,
+            model='damo/ofa_text-classification_mnli_large_en')
+        text = 'One of our number will carry out your instructions minutely.'
+        text2 = 'A member of my team will execute your orders with immense precision.'
+        input = {'text': text, 'text2': text2}
+        result = ofa_pipe(input)
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_visual_entailment_with_model(self):
+        model = Model.from_pretrained(
+            'damo/ofa_visual-entailment_snli-ve_large_en')
+        ofa_pipe = pipeline(Tasks.visual_entailment, model=model)
+        image = 'data/test/images/dogs.jpg'
+        text = 'there are two birds.'
+        input = {'image': image, 'text': text}
+        result = ofa_pipe(input)
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_visual_entailment_with_name(self):
+        ofa_pipe = pipeline(
+            Tasks.visual_entailment,
+            model='damo/ofa_visual-entailment_snli-ve_large_en')
+        image = 'data/test/images/dogs.jpg'
+        text = 'there are two birds.'
+        input = {'image': image, 'text': text}
+        result = ofa_pipe(input)
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_visual_grounding_with_model(self):
+        model = Model.from_pretrained(
+            'damo/ofa_visual-grounding_refcoco_large_en')
+        ofa_pipe = pipeline(Tasks.visual_grounding, model=model)
+        image = 'data/test/images/visual_grounding.png'
+        text = 'a blue turtle-like pokemon with round head'
+        input = {'image': image, 'text': text}
+        result = ofa_pipe(input)
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_visual_grounding_with_name(self):
+        ofa_pipe = pipeline(
+            Tasks.visual_grounding,
+            model='damo/ofa_visual-grounding_refcoco_large_en')
+        image = 'data/test/images/visual_grounding.png'
+        text = 'a blue turtle-like pokemon with round head'
+        input = {'image': image, 'text': text}
+        result = ofa_pipe(input)
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_visual_question_answering_with_model(self):
+        from modelscope.preprocessors.multi_modal import OfaPreprocessor
+        model = Model.from_pretrained(
+            'damo/ofa_visual-question-answering_pretrain_large_en')
+        preprocessor = OfaPreprocessor(model_dir=model.model_dir)
+        ofa_pipe = pipeline(
+            Tasks.visual_question_answering,
+            model=model,
+            preprocessor=preprocessor)
+        image = 'data/test/images/visual_question_answering.png'
+        text = 'what is grown on the plant?'
+        input = {'image': image, 'text': text}
+        result = ofa_pipe(input)
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_visual_question_answering_with_name(self):
+        from modelscope.preprocessors.multi_modal import OfaPreprocessor
+        model = 'damo/ofa_visual-question-answering_pretrain_large_en'
+        preprocessor = OfaPreprocessor(model_dir=model)
+        ofa_pipe = pipeline(
+            Tasks.visual_question_answering,
+            model=model,
+            preprocessor=preprocessor)
+        image = 'data/test/images/visual_question_answering.png'
+        text = 'what is grown on the plant?'
+        input = {'image': image, 'text': text}
+        result = ofa_pipe(input)
+        print(result)
+
+
+if __name__ == '__main__':
+    unittest.main()