Add HiTeA model for VideoQA and Caption (12.30)

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/11201652
2026-05-18 05:05:00 +02:00 · 2022-12-29 08:06:34 +08:00
parent f58060b140
commit f7a7504782
18 changed files with 1823 additions and 13 deletions
--- a/data/test/videos/video_caption_and_qa_test.mp4
+++ b/data/test/videos/video_caption_and_qa_test.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7c822c66fcf04de28016b224ef372cb1c93b7f13f2cba4e11f53a37fec8e769e
+size 828272
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -120,6 +120,7 @@ class Models(object):
    multi_stage_diffusion = 'multi-stage-diffusion-text-to-image-synthesis'
    team = 'team-multi-modal-similarity'
    video_clip = 'video-clip-multi-modal-embedding'
+    hitea = 'hitea'

    # science models
    unifold = 'unifold'
@@ -322,6 +323,8 @@ class Pipelines(object):
    image_text_retrieval = 'image-text-retrieval'
    ofa_ocr_recognition = 'ofa-ocr-recognition'
    ofa_asr = 'ofa-asr'
+    video_captioning = 'video-captioning'
+    video_question_answering = 'video-question-answering'

    # science tasks
    protein_structure = 'unifold-protein-structure'
@@ -446,6 +449,7 @@ class Preprocessors(object):
    ofa_tasks_preprocessor = 'ofa-tasks-preprocessor'
    clip_preprocessor = 'clip-preprocessor'
    mplug_tasks_preprocessor = 'mplug-tasks-preprocessor'
+    hitea_tasks_preprocessor = 'hitea-tasks-preprocessor'

    # science preprocessor
    unifold_preprocessor = 'unifold-preprocessor'
--- a/modelscope/models/multi_modal/init.py
+++ b/modelscope/models/multi_modal/init.py
@@ -10,7 +10,7 @@ if TYPE_CHECKING:
    from .team import TEAMForMultiModalSimilarity
    from .diffusion import DiffusionForTextToImageSynthesis
    from .mmr import VideoCLIPForMultiModalEmbedding
-    from .mplug_for_all_tasks import MPlugForAllTasks
+    from .mplug_for_all_tasks import MPlugForAllTasks, HiTeAForAllTasks
    from .ofa_for_all_tasks import OfaForAllTasks
    from .ofa_for_text_to_image_synthesis_model import \
        OfaForTextToImageSynthesis
@@ -24,7 +24,7 @@ else:
        'gemm': ['GEMMForMultiModalEmbedding'],
        'team': ['TEAMForMultiModalSimilarity'],
        'mmr': ['VideoCLIPForMultiModalEmbedding'],
-        'mplug_for_all_tasks': ['MPlugForAllTasks'],
+        'mplug_for_all_tasks': ['MPlugForAllTasks', 'HiTeAForAllTasks'],
        'ofa_for_all_tasks': ['OfaForAllTasks'],
        'ofa_for_text_to_image_synthesis_model':
        ['OfaForTextToImageSynthesis'],
--- a/modelscope/models/multi_modal/mplug/init.py
+++ b/modelscope/models/multi_modal/mplug/init.py
@@ -13,5 +13,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from .configuration_mplug import MPlugConfig
-from .modeling_mplug import CONFIG_NAME, MPlug
+from .configuration_mplug import HiTeAConfig, MPlugConfig
+from .modeling_mplug import CONFIG_NAME, HiTeA, MPlug
--- a/modelscope/models/multi_modal/mplug/configuration_mplug.py
+++ b/modelscope/models/multi_modal/mplug/configuration_mplug.py
@@ -114,3 +114,67 @@ class MPlugConfig(PretrainedConfig):
        with open(yaml_file, 'r', encoding='utf-8') as reader:
            config_dict = yaml.load(reader, Loader=yaml.Loader)
        return cls(**config_dict)
+
+
+class HiTeAConfig(PretrainedConfig):
+    """Configuration container for the HiTeA video-language models.
+
+    Every keyword argument is stored unchanged as an attribute of the same
+    name; any keyword not listed in the signature is forwarded to
+    ``PretrainedConfig``. ``task`` selects which HiTeA model class
+    ``HiTeA.from_pretrained`` instantiates, and ``bert_config`` is a file
+    name resolved against the model directory by the model code.
+    """
+
+    model_type = 'hitea'
+
+    def __init__(
+            self,
+            task=Tasks.video_question_answering,
+            bert_config='config_bert.json',
+            image_res=224,
+            num_frames=16,
+            batch_size_train=32,
+            vision_width=768,
+            distill=True,
+            batch_size_test=64,
+            k_test=128,
+            alpha=0.4,
+            warm_up=True,
+            eos='[SEP]',
+            optimizer=None,
+            schedular=None,
+            min_length=1,
+            max_length=10,
+            beam_size=5,
+            text_encoder='bert-base-uncased',
+            text_decoder='bert-base-uncased',
+            # retrieval
+            queue_size=65536,
+            embed_dim=256,
+            temp=0.07,
+            **kwargs):
+
+        super().__init__(**kwargs)
+        self.task = task
+        self.bert_config = bert_config
+        self.image_res = image_res
+        self.num_frames = num_frames
+        self.batch_size_train = batch_size_train
+        self.vision_width = vision_width
+        self.distill = distill
+        self.batch_size_test = batch_size_test
+        self.k_test = k_test
+        self.alpha = alpha
+        self.warm_up = warm_up
+        self.eos = eos
+        self.optimizer = optimizer
+        # NOTE: the attribute name 'schedular' (sic) is part of the config
+        # schema and must stay spelled this way.
+        self.schedular = schedular
+        self.min_length = min_length
+        self.max_length = max_length
+        self.beam_size = beam_size
+        self.text_encoder = text_encoder
+        self.text_decoder = text_decoder
+        # retrieval
+        self.queue_size = queue_size
+        self.embed_dim = embed_dim
+        self.temp = temp
+
+    @classmethod
+    def from_yaml_file(cls, yaml_file: Union[str,
+                                             os.PathLike]) -> 'HiTeAConfig':
+        """Build a config from a YAML file of keyword arguments.
+
+        Args:
+            yaml_file: path of a YAML document whose top level is a mapping
+                of ``__init__`` keyword arguments.
+
+        Returns:
+            A config instance; an empty document yields the defaults.
+        """
+        with open(yaml_file, 'r', encoding='utf-8') as reader:
+            # NOTE(review): yaml.Loader can construct arbitrary Python
+            # objects. Only load trusted model directories, or confirm the
+            # shipped configs are plain mappings and switch to safe_load.
+            config_dict = yaml.load(reader, Loader=yaml.Loader)
+        # An empty YAML document parses to None, which cannot be unpacked.
+        return cls(**(config_dict or {}))
--- a/modelscope/models/multi_modal/mplug/modeling_mplug.py
+++ b/modelscope/models/multi_modal/mplug/modeling_mplug.py
@@ -40,7 +40,9 @@ from transformers.modeling_utils import (PreTrainedModel,
                                         prune_linear_layer)
 from transformers.utils import logging

-from modelscope.models.multi_modal.mplug.configuration_mplug import MPlugConfig
+from modelscope.models.multi_modal.mplug.configuration_mplug import (
+    HiTeAConfig, MPlugConfig)
+from modelscope.models.multi_modal.mplug.mvit import MViTv2, MViTv2_Base_config
 from modelscope.models.multi_modal.mplug.predictor import TextGenerator
 from modelscope.utils.constant import ModelFile

@@ -2483,3 +2485,322 @@ class MPlugForImageTextRetrieval(MPlug):
            scores = F.softmax(scores, dim=-1)

            return scores
+
+
+class HiTeA(PreTrainedModel):
+    """Shared base of the HiTeA video-language models.
+
+    Builds the tokenizer, the MViTv2 visual encoder, the BERT text encoder
+    and the fusion encoder. The text decoder and the beam generator are
+    created by the task subclasses; ``forward`` is abstract here.
+    """
+    config_class = HiTeAConfig
+
+    def __init__(self, config):
+        """Create the encoders described by ``config``.
+
+        Args:
+            config: a config object that also carries ``model_dir``, the
+                directory holding the vocabulary and BERT config files.
+        """
+        super().__init__(config)
+        self.config = config
+        self.tokenizer = BertTokenizer.from_pretrained(
+            os.path.join(config.model_dir, ModelFile.VOCAB_FILE))
+        # Must run before the encoders are built: it creates the
+        # config_encoder / config_fusion / config_decoder objects.
+        self.module_setting(config)
+        self.visual_encoder = MViTv2(
+            img_size=config.image_res,
+            config=MViTv2_Base_config,
+            num_frames=config.num_frames)
+        self.text_encoder = BertModel(
+            self.config_encoder, add_pooling_layer=False)
+        self.fusion_encoder = FusionModel(
+            self.config_fusion, add_pooling_layer=False)
+
+    @classmethod
+    def from_pretrained(cls, model_dir, load_checkpoint=True):
+        """Instantiate the task-specific model stored in ``model_dir``.
+
+        Args:
+            model_dir: directory with the YAML config, vocabulary, BERT
+                config and (optionally) the checkpoint file.
+            load_checkpoint: when True, load the weights from the
+                checkpoint file into the new model.
+
+        Returns:
+            The model class mapped to ``config.task``.
+
+        Raises:
+            KeyError: if ``config.task`` is not a supported HiTeA task.
+        """
+        from modelscope.utils.constant import Tasks
+
+        task_mapping = {
+            Tasks.video_question_answering: HiTeAForVideoQuestionAnswering,
+            Tasks.video_captioning: HiTeAForVideoCaption,
+        }
+        config = cls.config_class.from_yaml_file(
+            os.path.join(model_dir, CONFIG_NAME))
+        config.model_dir = model_dir
+        model = task_mapping[config.task](config)
+        if load_checkpoint:
+            checkpoint_path = os.path.join(model_dir,
+                                           ModelFile.TORCH_MODEL_BIN_FILE)
+            # NOTE(review): torch.load unpickles the file; only load
+            # checkpoints from trusted model directories.
+            checkpoint = torch.load(checkpoint_path, map_location='cpu')
+            # Unwrap state dicts nested under 'model' and/or 'module'.
+            if 'model' in checkpoint:
+                checkpoint = checkpoint['model']
+            if 'module' in checkpoint:
+                checkpoint = checkpoint['module']
+            # NOTE(review): this removes every occurrence of 'model.' in a
+            # key, not only a leading prefix - confirm no parameter name
+            # contains 'model.' in the middle.
+            checkpoint = {
+                k.replace('model.', ''): v
+                for k, v in checkpoint.items()
+            }
+
+            # strict=False: missing and unexpected keys are ignored.
+            model.load_state_dict(checkpoint, strict=False)
+        return model
+
+    def init_distill(self, config):
+        """Create the momentum (teacher) copies used for distillation.
+
+        Does nothing beyond setting ``self.distill`` when
+        ``config.distill`` is false. The caller must have created
+        ``self.text_decoder`` first, because it is paired with its
+        momentum copy here.
+        """
+        self.distill = config.distill
+        if self.distill:
+            self.visual_encoder_m = MViTv2(
+                img_size=config.image_res,
+                config=MViTv2_Base_config,
+                num_frames=config.num_frames)
+            self.text_encoder_m = BertModel(
+                self.config_encoder, add_pooling_layer=False)
+            self.fusion_encoder_m = FusionModel(
+                self.config_fusion, add_pooling_layer=False)
+            self.text_decoder_m = BertLMHeadModel(self.config_decoder)
+            # Every momentum module needs a pair entry. The fusion pair was
+            # missing, which left fusion_encoder_m at its own initial
+            # weights, trainable, and never momentum-updated.
+            self.model_pairs = [
+                [self.visual_encoder, self.visual_encoder_m],
+                [self.text_encoder, self.text_encoder_m],
+                [self.fusion_encoder, self.fusion_encoder_m],
+                [self.text_decoder, self.text_decoder_m],
+            ]
+            self.copy_params()
+            self.momentum = 0.995
+
+    def forward(self, *args, **kwargs):
+        """Abstract; implemented by the task subclasses."""
+        raise NotImplementedError
+
+    def module_setting(self, config):
+        """Load the BERT config three times and specialise each copy.
+
+        The encoder copy takes its depth from ``text_encoder_layers``; the
+        decoder copy enables cross attention and takes its depth from
+        ``text_decode_layers``; the fusion copy is used as loaded.
+        """
+        bert_config_path = os.path.join(config.model_dir, config.bert_config)
+        self.config_encoder = BertConfig.from_json_file(bert_config_path)
+        self.config_encoder.num_hidden_layers = self.config_encoder.text_encoder_layers
+        self.config_fusion = BertConfig.from_json_file(bert_config_path)
+        self.config_decoder = BertConfig.from_json_file(bert_config_path)
+        self.config_decoder.add_cross_attention = True
+        self.config_decoder.num_hidden_layers = self.config_decoder.text_decode_layers
+
+    @torch.no_grad()
+    def copy_params(self):
+        """Copy each online module's weights into its momentum copy and
+        exclude the copy from gradient updates."""
+        for model_pair in self.model_pairs:
+            for param, param_m in zip(model_pair[0].parameters(),
+                                      model_pair[1].parameters()):
+                param_m.data.copy_(param.data)  # initialize
+                param_m.requires_grad = False  # not update by gradient
+
+    @torch.no_grad()
+    def _momentum_update(self):
+        """Move each momentum parameter towards its online counterpart:
+        ``p_m = p_m * momentum + p * (1 - momentum)``."""
+        for model_pair in self.model_pairs:
+            for param, param_m in zip(model_pair[0].parameters(),
+                                      model_pair[1].parameters()):
+                param_m.data = param_m.data * self.momentum + param.data * (
+                    1. - self.momentum)
+
+    def generation(self, question_states, question_atts, out_size=1):
+        """Run the beam generator over the given encoder states.
+
+        Requires ``self.beam_generator``, which the subclasses create.
+
+        Returns:
+            The ``(topk_ids, topk_scores)`` pair from ``translate_batch``.
+        """
+        encoder_inputs = [question_states, question_atts]
+        topk_ids, topk_scores = self.beam_generator.translate_batch(
+            encoder_inputs, out_size=out_size)
+        return topk_ids, topk_scores
+
+    @staticmethod
+    def _tile(x, dim, n_tile):
+        """Repeat every slice of ``x`` along ``dim`` ``n_tile`` times,
+        keeping the copies of one slice adjacent (a, a, b, b, ...)."""
+        import numpy as np
+        init_dim = x.size(dim)
+        repeat_idx = [1] * x.dim()
+        repeat_idx[dim] = n_tile
+        # repeat() yields a, b, a, b, ...; the index below regroups the
+        # copies so that they become a, a, b, b, ...
+        x = x.repeat(*(repeat_idx))
+        order_index = torch.LongTensor(
+            np.concatenate(
+                [init_dim * np.arange(n_tile) + i for i in range(init_dim)]))
+        return torch.index_select(x, dim, order_index.to(x.device))
+
+
+class HiTeAForVideoQuestionAnswering(HiTeA):
+    """HiTeA model for video question answering.
+
+    Adds a BertLMHeadModel decoder and a beam generator on top of the
+    shared encoders, plus the momentum copies when distillation is on.
+    """
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.text_decoder = BertLMHeadModel(self.config_decoder)
+        self.beam_generator = TextGenerator(config, self.text_decoder)
+        # Must come after text_decoder exists: it is paired with its
+        # momentum copy inside init_distill.
+        self.init_distill(config)
+
+    def forward(self,
+                video,
+                question,
+                answer=None,
+                alpha=0,
+                k=None,
+                weights=None,
+                train=True):
+        """Compute the answer loss (train) or generate an answer (eval).
+
+        Args:
+            video: video tensor; cast to the model's parameter dtype.
+            question: object exposing ``input_ids`` and ``attention_mask``.
+            answer: object exposing ``input_ids`` and ``attention_mask``;
+                required when ``train`` is true.
+            alpha: accepted for interface compatibility; not used here.
+            k: per-question number of answers; defaults to one each.
+            weights: per-answer loss weights; defaults to 1.
+            train: select the training branch or the generation branch.
+
+        Returns:
+            The weighted loss summed and divided by the video batch size
+            when ``train`` is true, otherwise ``(topk_ids, topk_probs)``.
+        """
+        video = video.to(dtype=next(self.parameters()).dtype)
+        video_embeds = self.visual_encoder(video)
+        # All video positions are attended to (mask of ones).
+        video_atts = torch.ones(
+            video_embeds.size()[:-1], dtype=torch.long).to(video.device)
+
+        if train:
+            '''
+            k: number of answers for each question
+            weights: weight for each answer
+            '''
+            # Padding positions get label -100.
+            # presumably the decoder loss ignores -100 - TODO confirm.
+            answer_targets = answer.input_ids.masked_fill(
+                answer.input_ids == self.tokenizer.pad_token_id, -100)
+            text_output = self.text_encoder(
+                question.input_ids,
+                attention_mask=question.attention_mask,
+                return_dict=True)
+            text_embeds = text_output.last_hidden_state
+            fusion_output = self.fusion_encoder(
+                encoder_embeds=text_embeds,
+                attention_mask=question.attention_mask,
+                encoder_hidden_states=video_embeds,
+                encoder_attention_mask=video_atts,
+                return_dict=False)
+
+            video_output, question_output = fusion_output
+
+            # Decoder context = fused video states followed by fused
+            # question states, with the matching concatenated mask.
+            question_output = torch.cat([video_output, question_output], 1)
+            merge_text_attention = torch.cat(
+                [video_atts, question.attention_mask], 1)
+
+            if k is None:
+                k = [1] * question_output.shape[0]
+            # Replicate each question's context once per answer so that it
+            # lines up with the flattened answer batch.
+            question_states = []
+            question_atts = []
+            for b, n in enumerate(k):
+                question_states += [question_output[b]] * n
+                question_atts += [merge_text_attention[b]] * n
+            question_states = torch.stack(question_states, 0)
+            question_atts = torch.stack(question_atts, 0)
+
+            if self.distill:
+                with torch.no_grad():
+                    # Update the momentum copies first, then run them to
+                    # obtain the soft labels.
+                    self._momentum_update()
+                    video_embeds_m = self.visual_encoder_m(video)
+                    text_output_m = self.text_encoder_m(
+                        question.input_ids,
+                        attention_mask=question.attention_mask,
+                        return_dict=True)
+                    text_embeds_m = text_output_m.last_hidden_state
+                    fusion_output_m = self.fusion_encoder_m(
+                        encoder_embeds=text_embeds_m,
+                        attention_mask=question.attention_mask,
+                        encoder_hidden_states=video_embeds_m,
+                        encoder_attention_mask=video_atts,
+                        return_dict=False)
+
+                    image_output_m, question_output_m = fusion_output_m
+                    question_output_m = torch.cat(
+                        [image_output_m, question_output_m], 1)
+
+                    # Same per-answer replication as for the online states.
+                    question_states_m = []
+                    for b, n in enumerate(k):
+                        question_states_m += [question_output_m[b]] * n
+                    question_states_m = torch.stack(question_states_m, 0)
+
+                    logits_m = self.text_decoder_m(
+                        answer.input_ids,
+                        attention_mask=answer.attention_mask,
+                        encoder_hidden_states=question_states_m,
+                        encoder_attention_mask=question_atts,
+                        return_logits=True,
+                    )
+
+                # The softmax of the momentum logits is passed as soft
+                # labels next to the hard labels.
+                answer_output = self.text_decoder(
+                    answer.input_ids,
+                    attention_mask=answer.attention_mask,
+                    encoder_hidden_states=question_states,
+                    encoder_attention_mask=question_atts,
+                    labels=answer_targets,
+                    return_dict=True,
+                    soft_labels=F.softmax(logits_m, dim=-1),
+                    reduction='none',
+                )
+            else:
+                answer_output = self.text_decoder(
+                    answer.input_ids,
+                    attention_mask=answer.attention_mask,
+                    encoder_hidden_states=question_states,
+                    encoder_attention_mask=question_atts,
+                    labels=answer_targets,
+                    return_dict=True,
+                    reduction='none',
+                )
+            if weights is None:
+                weights = 1
+            loss = weights * answer_output.loss
+            # Normalise by the number of videos, not by the number of
+            # (replicated) answers.
+            loss = loss.sum() / video.size(0)
+
+            return loss
+
+        else:
+            # Generation: same encoding as above, then beam search over
+            # the concatenated video/question context.
+            text_output = self.text_encoder(
+                question.input_ids,
+                attention_mask=question.attention_mask,
+                return_dict=True)
+            text_embeds = text_output.last_hidden_state
+            fusion_output = self.fusion_encoder(
+                encoder_embeds=text_embeds,
+                attention_mask=question.attention_mask,
+                encoder_hidden_states=video_embeds,
+                encoder_attention_mask=video_atts,
+                return_dict=False)
+            video_output, question_output = fusion_output
+            question_output = torch.cat([video_output, question_output], 1)
+            merge_text_attention = torch.cat(
+                [video_atts, question.attention_mask], 1)
+            topk_ids, topk_probs = self.generation(question_output,
+                                                   merge_text_attention)
+            return topk_ids, topk_probs
+
+
+class HiTeAForVideoCaption(HiTeA):
+    """HiTeA model for video captioning.
+
+    Adds a BertPrefixModel decoder and a beam generator on top of the
+    shared encoders. No momentum copies are created for this task.
+    """
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.text_decoder = BertPrefixModel(self.config_decoder)
+        self.beam_generator = TextGenerator(config, self.text_decoder)
+
+    def beam_search(self,
+                    video,
+                    question,
+                    answer=None,
+                    train=True,
+                    out_size=5):
+        """Generate ``out_size`` candidates from the fused video/text states.
+
+        Args:
+            video: video tensor; cast to the model's parameter dtype.
+            question: object exposing ``input_ids`` and ``attention_mask``.
+            answer: accepted for interface compatibility; not used here.
+            train: accepted for interface compatibility; not used here.
+            out_size: number of candidates requested from the generator.
+
+        Returns:
+            The ``(topk_ids, topk_probs)`` pair from the beam generator.
+        """
+        # Same cast as in forward(); without it the scst path fed the
+        # visual encoder an uncast tensor when the model runs in another
+        # dtype (e.g. half precision).
+        video = video.to(dtype=next(self.parameters()).dtype)
+        video_embeds = self.visual_encoder(video)
+        video_atts = torch.ones(
+            video_embeds.size()[:-1], dtype=torch.long).to(video.device)
+        text_output = self.text_encoder(
+            question.input_ids,
+            attention_mask=question.attention_mask,
+            return_dict=True)
+        text_embeds = text_output.last_hidden_state
+        fusion_output = self.fusion_encoder(
+            encoder_embeds=text_embeds,
+            attention_mask=question.attention_mask,
+            encoder_hidden_states=video_embeds,
+            encoder_attention_mask=video_atts,
+            return_dict=False)
+        video_output, question_output = fusion_output
+        # Decoder context = fused video states followed by question states.
+        question_output = torch.cat([video_output, question_output], 1)
+        merge_text_attention = torch.cat([video_atts, question.attention_mask],
+                                         1)
+        topk_ids, topk_probs = self.generation(
+            question_output, merge_text_attention, out_size=out_size)
+        return topk_ids, topk_probs
+
+    def forward(self,
+                video,
+                question,
+                answer=None,
+                train=True,
+                out_size=5,
+                scst=False):
+        """Compute the caption loss (train) or generate a caption (eval).
+
+        Args:
+            video: video tensor; cast to the model's parameter dtype.
+            question: only used on the ``scst`` path, where it is passed
+                to ``beam_search``.
+            answer: object exposing ``input_ids`` and ``attention_mask``;
+                required when ``train`` is true.
+            train: select the loss branch or the generation branch.
+            out_size: number of candidates on the ``scst`` path.
+            scst: when true, skip everything else and return
+                ``beam_search`` candidates.
+
+        Returns:
+            The unreduced decoder loss when ``train`` is true, otherwise
+            ``(topk_ids, topk_probs)``.
+        """
+        if (scst):
+            return self.beam_search(
+                video, question, answer, train=True, out_size=out_size)
+        video = video.to(dtype=next(self.parameters()).dtype)
+        video_embeds = self.visual_encoder(video)
+        video_atts = torch.ones(
+            video_embeds.size()[:-1], dtype=torch.long).to(video.device)
+
+        if train:
+            # Padding positions get label -100.
+            # presumably the decoder loss ignores -100 - TODO confirm.
+            answer_targets = answer.input_ids.masked_fill(
+                answer.input_ids == self.tokenizer.pad_token_id, -100)
+            # The decoder attends to the raw video embeddings; the text
+            # and fusion encoders are not used on this path.
+            answer_output = self.text_decoder(
+                answer.input_ids,
+                attention_mask=answer.attention_mask,
+                encoder_hidden_states=video_embeds,
+                encoder_attention_mask=video_atts,
+                labels=answer_targets,
+                return_dict=True,
+                reduction='none')
+            loss = answer_output.loss
+
+            return loss
+        else:
+            topk_ids, topk_probs = self.generation(video_embeds, video_atts)
+            return topk_ids, topk_probs
--- a/modelscope/models/multi_modal/mplug/mvit.py
+++ b/modelscope/models/multi_modal/mplug/mvit.py
--- a/modelscope/models/multi_modal/mplug_for_all_tasks.py
+++ b/modelscope/models/multi_modal/mplug_for_all_tasks.py
@@ -11,7 +11,7 @@ from modelscope.outputs import OutputKeys
 from modelscope.utils.config import Config
 from modelscope.utils.constant import ModelFile, Tasks

-__all__ = ['MPlugForAllTasks']
+__all__ = ['MPlugForAllTasks', 'HiTeAForAllTasks']


@MODELS.register_module(
@@ -81,3 +81,69 @@ class MPlugForAllTasks(TorchModel):
        # evaluate
        topk_ids, _ = output
        return {'sequences': [list_tensor[0] for list_tensor in topk_ids]}
+
+
+@MODELS.register_module(
+    Tasks.video_question_answering, module_name=Models.hitea)
+@MODELS.register_module(Tasks.video_captioning, module_name=Models.hitea)
+class HiTeAForAllTasks(TorchModel):
+    """ModelScope wrapper exposing the HiTeA models for video question
+    answering and video captioning."""
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """initialize the hitea model from the `model_dir` path.
+        Args:
+            model_dir (str): the model path.
+        """
+
+        super().__init__(model_dir, *args, **kwargs)
+        from modelscope.models.multi_modal.mplug import HiTeA
+        self.model = HiTeA.from_pretrained(model_dir)
+        self.tokenizer = self.model.tokenizer
+
+    def _get_task(self) -> str:
+        """Return the task named in the model configuration file.
+
+        The file is parsed on the first call only; later calls reuse the
+        cached value instead of reading the file again on every forward.
+        """
+        task = getattr(self, '_cached_task', None)
+        if task is None:
+            task = Config.from_file(
+                osp.join(self.model_dir, ModelFile.CONFIGURATION)).task
+            self._cached_task = task
+        return task
+
+    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
+        """return the result by the model
+
+        Args:
+            input (Dict[str, Tensor]): the preprocessed data
+
+        Returns:
+            Dict[str, Tensor]: results
+                Inference (eval mode and 'question' in input):
+                    {'caption': str} for video captioning,
+                    {'text': str} otherwise.
+                Training:
+                    {OutputKeys.LOSS: loss}
+                Evaluation:
+                    {'sequences': [first candidate of each sample]}
+        """
+
+        # get task from config file
+        task = self._get_task()
+
+        # inference
+        if not self.training and 'question' in input:
+            output = self.model(input['video'], input['question'], train=False)
+            topk_ids, _ = output
+            # Only the first candidate of the first sample is decoded.
+            pred_string: str = \
+                self.tokenizer.decode(topk_ids[0][0], skip_special_tokens=True)
+            output_key = OutputKeys.CAPTION \
+                if task == Tasks.video_captioning else OutputKeys.TEXT
+            return {output_key: pred_string}
+
+        # train and evaluate
+        import addict
+        video = input['video']
+        answer = addict.Dict(
+            input_ids=input['answer_input_ids'],
+            attention_mask=input['answer_attention_mask'])
+        if 'index' not in input:
+            question = addict.Dict(
+                input_ids=input['question_input_ids'],
+                attention_mask=input['question_attention_mask'])
+            output = self.model(video, question, answer, train=self.training)
+        else:
+            # NOTE(review): this passes `answer` in the question position
+            # and `index` in the answer position; confirm that a HiTeA
+            # task actually supplies 'index'.
+            index = input['index']
+            output = self.model(video, answer, index, train=self.training)
+        if self.training:
+            return {OutputKeys.LOSS: output}
+
+        # evaluate
+        topk_ids, _ = output
+        return {'sequences': [list_tensor[0] for list_tensor in topk_ids]}
--- a/modelscope/outputs/outputs.py
+++ b/modelscope/outputs/outputs.py
@@ -711,6 +711,12 @@ TASK_OUTPUTS = {
    #   "caption": "this is an image caption text."
    # }
    Tasks.image_captioning: [OutputKeys.CAPTION],
+
+    # video caption result for single sample
+    # {
+    #   "caption": "this is a video caption text."
+    # }
+    Tasks.video_captioning: [OutputKeys.CAPTION],
    Tasks.ocr_recognition: [OutputKeys.TEXT],

    # visual grounding result for single sample
@@ -769,6 +775,10 @@ TASK_OUTPUTS = {
    # {"text": "this is a text answser. "}
    Tasks.visual_question_answering: [OutputKeys.TEXT],

+    # VideoQA result for a sample
+    # {"text": "this is a text answer. "}
+    Tasks.video_question_answering: [OutputKeys.TEXT],
+
    # auto_speech_recognition result for a single sample
    # {
    #    "text": "每天都要快乐喔"
--- a/modelscope/pipeline_inputs.py
+++ b/modelscope/pipeline_inputs.py
@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

+import cv2
 import numpy as np
 from PIL import Image

@@ -222,6 +223,9 @@ TASK_INPUTS = {
    Tasks.image_captioning: [InputType.IMAGE, {
        'image': InputType.IMAGE,
    }],
+    Tasks.video_captioning: [InputType.VIDEO, {
+        'video': InputType.VIDEO,
+    }],
    Tasks.visual_grounding: {
        'image': InputType.IMAGE,
        'text': InputType.TEXT
@@ -245,6 +249,10 @@ TASK_INPUTS = {
        'image': InputType.IMAGE,
        'text': InputType.TEXT
    },
+    Tasks.video_question_answering: {
+        'video': InputType.VIDEO,
+        'text': InputType.TEXT
+    },
    Tasks.visual_entailment: {
        'image': InputType.IMAGE,
        'text': InputType.TEXT,
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -80,6 +80,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
     'damo/nlp_bart_text-error-correction_chinese'),
    Tasks.image_captioning: (Pipelines.image_captioning,
                             'damo/ofa_image-caption_coco_large_en'),
+    Tasks.video_captioning:
+    (Pipelines.video_captioning,
+     'damo/multi-modal_hitea_video-captioning_base_en'),
    Tasks.image_portrait_stylization:
    (Pipelines.person_image_cartoon,
     'damo/cv_unet_person-image-cartoon_compound-models'),
@@ -114,6 +117,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
    Tasks.visual_question_answering:
    (Pipelines.visual_question_answering,
     'damo/mplug_visual-question-answering_coco_large_en'),
+    Tasks.video_question_answering:
+    (Pipelines.video_question_answering,
+     'damo/multi-modal_hitea_video-question-answering_base_en'),
    Tasks.video_embedding: (Pipelines.cmdssl_video_embedding,
                            'damo/cv_r2p1d_video_embedding'),
    Tasks.text_to_image_synthesis:
--- a/modelscope/pipelines/multi_modal/init.py
+++ b/modelscope/pipelines/multi_modal/init.py
@@ -14,7 +14,8 @@ if TYPE_CHECKING:
        VideoMultiModalEmbeddingPipeline
    from .visual_question_answering_pipeline import VisualQuestionAnsweringPipeline
    from .asr_pipeline import AutomaticSpeechRecognitionPipeline
-
+    from .video_captioning_pipeline import VideoCaptioningPipeline
+    from .video_question_answering_pipeline import VideoQuestionAnsweringPipeline
 else:
    _import_structure = {
        'image_captioning_pipeline': ['ImageCaptioningPipeline'],
@@ -29,6 +30,9 @@ else:
        'generative_multi_modal_embedding_pipeline':
        ['GEMMMultiModalEmbeddingPipeline'],
        'asr_pipeline': ['AutomaticSpeechRecognitionPipeline'],
+        'video_captioning_pipeline': ['VideoCaptioningPipeline'],
+        'video_question_answering_pipeline':
+        ['VideoQuestionAnsweringPipeline']
    }

    import sys
--- a/modelscope/pipelines/multi_modal/video_captioning_pipeline.py
+++ b/modelscope/pipelines/multi_modal/video_captioning_pipeline.py
@@ -0,0 +1,56 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict, Optional, Union
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.multi_modal import HiTeAForAllTasks
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Model, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import HiTeAPreprocessor, Preprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.video_captioning, module_name=Pipelines.video_captioning)
+class VideoCaptioningPipeline(Pipeline):
+    """Pipeline that runs a video captioning model in inference mode."""
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: Optional[Preprocessor] = None,
+                 **kwargs):
+        """Create the pipeline from `model` and an optional `preprocessor`.
+
+        Args:
+            model: model id on modelscope hub, or a model instance.
+            preprocessor: preprocessor to use; when omitted and the model
+                is a HiTeAForAllTasks, a HiTeAPreprocessor is built from
+                the model directory.
+        """
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+        self.model.eval()
+        use_default = preprocessor is None and isinstance(
+            self.model, HiTeAForAllTasks)
+        if use_default:
+            self.preprocessor = HiTeAPreprocessor(self.model.model_dir)
+
+    def _batch(self, data):
+        """Merge a list of preprocessed samples into one batch.
+
+        For HiTeA models the 'video' tensors and every 'question' field
+        are concatenated with torch.cat, and 'train' is taken from the
+        first sample. Other models are delegated to the parent class.
+        """
+        if not isinstance(self.model, HiTeAForAllTasks):
+            return super()._collate_batch(data)
+
+        from transformers.tokenization_utils_base import BatchEncoding
+        first_sample = data[0]
+        batch_data = dict(train=first_sample['train'])
+        batch_data['video'] = torch.cat([sample['video'] for sample in data])
+        merged_question = {
+            field: torch.cat([sample['question'][field] for sample in data])
+            for field in first_sample['question'].keys()
+        }
+        batch_data['question'] = BatchEncoding(merged_question)
+        return batch_data
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        """Run the parent forward pass with gradient tracking disabled."""
+        with torch.no_grad():
+            outputs = super().forward(inputs, **forward_params)
+        return outputs
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        """Return the model output unchanged."""
+        return inputs
--- a/modelscope/pipelines/multi_modal/video_question_answering_pipeline.py
+++ b/modelscope/pipelines/multi_modal/video_question_answering_pipeline.py
@@ -0,0 +1,54 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict, Optional, Union
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.models.multi_modal import HiTeAForAllTasks
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline, Tensor
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import HiTeAPreprocessor, Preprocessor
+from modelscope.utils.constant import Tasks
+
+__all__ = ['VideoQuestionAnsweringPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.video_question_answering,
+    module_name=Pipelines.video_question_answering)
+class VideoQuestionAnsweringPipeline(Pipeline):
+    """Pipeline that runs a video question answering model in inference
+    mode."""
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: Optional[Preprocessor] = None,
+                 **kwargs):
+        """Create the pipeline from `model` and an optional `preprocessor`.
+
+        Args:
+            model: a model instance, or a model id / path.
+            preprocessor: preprocessor to use; when omitted and the model
+                is a HiTeAForAllTasks, a HiTeAPreprocessor is built from
+                the model directory.
+        """
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+        use_default = preprocessor is None and isinstance(
+            self.model, HiTeAForAllTasks)
+        if use_default:
+            self.preprocessor = HiTeAPreprocessor(self.model.model_dir)
+        self.model.eval()
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        """Run the parent forward pass with gradient tracking disabled."""
+        with torch.no_grad():
+            outputs = super().forward(inputs, **forward_params)
+        return outputs
+
+    def postprocess(self, inputs: Dict[str, Tensor],
+                    **postprocess_params) -> Dict[str, str]:
+        """Return the prediction results unchanged.
+
+        Args:
+            inputs (Dict[str, Any]): the output of `forward`
+
+        Returns:
+            Dict[str, str]: the prediction results
+        """
+        return inputs
--- a/modelscope/preprocessors/init.py
+++ b/modelscope/preprocessors/init.py
@@ -15,7 +15,8 @@ if TYPE_CHECKING:
                        ImageDenoisePreprocessor)
    from .kws import WavToLists
    from .tts import KanttsDataPreprocessor
-    from .multi_modal import (OfaPreprocessor, MPlugPreprocessor)
+    from .multi_modal import (OfaPreprocessor, MPlugPreprocessor,
+                              HiTeAPreprocessor)
    from .nlp import (
        DocumentSegmentationTransformersPreprocessor,
        FaqQuestionAnsweringTransformersPreprocessor,
@@ -52,7 +53,8 @@ else:
        ],
        'kws': ['WavToLists'],
        'tts': ['KanttsDataPreprocessor'],
-        'multi_modal': ['OfaPreprocessor', 'MPlugPreprocessor'],
+        'multi_modal':
+        ['OfaPreprocessor', 'MPlugPreprocessor', 'HiTeAPreprocessor'],
        'nlp': [
            'DocumentSegmentationTransformersPreprocessor',
            'FaqQuestionAnsweringTransformersPreprocessor',
--- a/modelscope/preprocessors/multi_modal.py
+++ b/modelscope/preprocessors/multi_modal.py
@@ -3,7 +3,9 @@ import os.path as osp
 from io import BytesIO
 from typing import Any, Dict, List, Tuple, Union

+import decord
 import json
+import numpy as np
 import torch
 from PIL import Image
 from timm.data import create_transform
@@ -12,6 +14,8 @@ from torchvision.transforms import Compose, Normalize, Resize, ToTensor
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.metainfo import Preprocessors
 from modelscope.pipelines.base import Input
+from modelscope.pipelines.cv.cmdssl_video_embedding_pipeline import (
+    VCenterCrop, VCompose, VNormalize, VRescale, VToTensor)
 from modelscope.preprocessors import load_image
 from modelscope.utils.config import Config
 from modelscope.utils.constant import (Fields, Invoke, ModeKeys, ModelFile,
@@ -22,10 +26,7 @@ from .ofa import *  # noqa
 from .ofa.utils.collate import collate_fn
 from .ofa.utils.constant import OFA_TASK_KEY_MAPPING

-__all__ = [
-    'OfaPreprocessor',
-    'MPlugPreprocessor',
-]
+__all__ = ['OfaPreprocessor', 'MPlugPreprocessor', 'HiTeAPreprocessor']


@PREPROCESSORS.register_module(
@@ -387,3 +388,141 @@ class MPlugPreprocessor(Preprocessor):
            if self.cfg.task == Tasks.image_text_retrieval:
                output['index'] = index
            return output
+
+
+@PREPROCESSORS.register_module(
+    Fields.multi_modal, module_name=Preprocessors.hitea_tasks_preprocessor)
+class HiTeAPreprocessor(Preprocessor):
+
+    def __init__(self,
+                 model_dir: str,
+                 mode: str = ModeKeys.INFERENCE,
+                 tokenizer_max_length: int = 25,
+                 *args,
+                 **kwargs):
+        super().__init__(*args, **kwargs)
+        self.model_dir = model_dir
+        self.mode = mode
+        self.tokenizer_max_length = tokenizer_max_length
+
+        self._tokenizer = None
+        self._patch_resize_transform = None
+        self._num_frames = None
+        self._video_map = {}
+
+    @property
+    def tokenizer(self):
+        from transformers import BertTokenizer
+
+        if self._tokenizer is None:
+            self._tokenizer = BertTokenizer.from_pretrained(self.model_dir)
+        return self._tokenizer
+
+    @property
+    def patch_resize_transform(self):
+        if self._patch_resize_transform is None:
+            from torchvision import transforms
+            from modelscope.models.multi_modal.mplug import CONFIG_NAME, HiTeAConfig
+
+            config = HiTeAConfig.from_yaml_file(
+                osp.join(self.model_dir, CONFIG_NAME))
+
+            mean = (0.48145466, 0.4578275, 0.40821073)
+            std = (0.26862954, 0.26130258, 0.27577711)
+
+            self._patch_resize_transform = transforms.Compose([
+                transforms.Resize((config.image_res, config.image_res),
+                                  interpolation=Image.BICUBIC),
+                transforms.ToTensor(),
+                transforms.Normalize(mean=mean, std=std),
+            ])
+        return self._patch_resize_transform
+
+    @property
+    def num_frames(self):
+        if self._num_frames is None:
+            from torchvision import transforms
+            from modelscope.models.multi_modal.mplug import CONFIG_NAME, HiTeAConfig
+
+            config = HiTeAConfig.from_yaml_file(
+                osp.join(self.model_dir, CONFIG_NAME))
+
+            self._num_frames = config.num_frames
+        return self._num_frames
+
+    def video_open(self, path: str) -> Tuple[decord.VideoReader, int]:
+        if path not in self._video_map:
+            index = len(self._video_map)
+            vr = decord.VideoReader(path, ctx=decord.cpu(0))
+            self._video_map[path] = (vr, index)
+        return self._video_map[path]
+
+    def sample_frames(self, num_frames: int, vlen: int) -> List[int]:
+        acc_samples = min(num_frames, vlen)
+        # split the video into `acc_samples` intervals, and sample from each interval.
+        intervals = np.linspace(
+            start=0, stop=vlen, num=acc_samples + 1).astype(int)
+        ranges = []
+        for idx, interv in enumerate(intervals[:-1]):
+            ranges.append((interv, intervals[idx + 1] - 1))
+
+        frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
+
+        if len(frame_indices) < num_frames:  # padded with last frame
+            padded_frame_indices = [frame_indices[-1]] * num_frames
+            padded_frame_indices[:len(frame_indices)] = frame_indices
+            frame_indices = padded_frame_indices
+        return frame_indices
+
+    def __call__(
+        self, data: Union[decord.VideoReader, tuple,
+                          Dict[str, Any]]) -> Dict[str, Any]:
+        self.cfg = Config.from_file(
+            osp.join(self.model_dir, ModelFile.CONFIGURATION))
+
+        if isinstance(data, (decord.VideoReader, str)):
+            video = data
+        elif isinstance(data, tuple):
+            video = data[0]
+        else:
+            video = data['video']
+        index = 0
+        if isinstance(video, str):
+            video, index = self.video_open(video)
+        frame_indices = self.sample_frames(self.num_frames, len(video))
+        video.seek(0)
+        video = torch.from_numpy(video.get_batch(frame_indices).asnumpy())
+        video = [
+            self.patch_resize_transform(Image.fromarray(f))
+            for f in video.numpy()
+        ]
+        video = torch.stack(video, dim=0)
+        question = '' if self.cfg.task == Tasks.video_captioning \
+            else data[1 if isinstance(data, tuple)
+                      else ('text' if 'text' in data else 'question')]
+        question = self.tokenizer(
+            question.lower(),
+            padding='max_length',
+            truncation=True,
+            max_length=self.tokenizer_max_length,
+            return_tensors='pt')
+
+        if self.mode == ModeKeys.INFERENCE:
+            video = torch.stack([video], dim=0)
+            return {'video': video, 'question': question}
+        else:
+            answer = data['answer']
+            answer = self.tokenizer(
+                answer,
+                padding='max_length',
+                truncation=True,
+                max_length=self.tokenizer_max_length,
+                return_tensors='pt')
+            output = {
+                'video': video,
+                'question_input_ids': question.input_ids.squeeze(),
+                'question_attention_mask': question.attention_mask.squeeze(),
+                'answer_input_ids': answer.input_ids.squeeze(),
+                'answer_attention_mask': answer.attention_mask.squeeze(),
+            }
+            return output
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -166,6 +166,8 @@ class MultiModalTasks(object):
    visual_entailment = 'visual-entailment'
    video_multi_modal_embedding = 'video-multi-modal-embedding'
    image_text_retrieval = 'image-text-retrieval'
+    video_captioning = 'video-captioning'
+    video_question_answering = 'video-question-answering'


 class ScienceTasks(object):
--- a/tests/pipelines/test_hitea_tasks.py
+++ b/tests/pipelines/test_hitea_tasks.py
@@ -0,0 +1,64 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class HiTeATasksTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_video_captioning_with_model(self):
+        model = Model.from_pretrained(
+            'damo/multi-modal_hitea_video-captioning_base_en')
+        pipeline_caption = pipeline(
+            task=Tasks.video_captioning,
+            model=model,
+        )
+        video = 'data/test/videos/video_caption_and_qa_test.mp4'
+        result = pipeline_caption(video)
+        print(result[OutputKeys.CAPTION])
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_video_captioning_with_name(self):
+        model = 'damo/multi-modal_hitea_video-captioning_base_en'
+        pipeline_caption = pipeline(
+            Tasks.video_captioning,
+            model=model,
+        )
+        video = 'data/test/videos/video_caption_and_qa_test.mp4'
+        result = pipeline_caption(video)
+        print(result[OutputKeys.CAPTION])
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_video_question_answering_with_model(self):
+        model = Model.from_pretrained(
+            'damo/multi-modal_hitea_video-question-answering_base_en')
+        pipeline_vqa = pipeline(Tasks.video_question_answering, model=model)
+        video = 'data/test/videos/video_caption_and_qa_test.mp4'
+        text = 'How many people are there?'
+        input = {'video': video, 'text': text}
+        result = pipeline_vqa(input)
+        print(result)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_video_question_answering_with_name(self):
+        model = 'damo/multi-modal_hitea_video-question-answering_base_en'
+        pipeline_vqa = pipeline(Tasks.video_question_answering, model=model)
+        video = 'data/test/videos/video_caption_and_qa_test.mp4'
+        text = 'Who teaches a girl how to paint eggs?'
+        input = {'video': video, 'text': text}
+        result = pipeline_vqa(input)
+        print(result)
+
+    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+# Run the tests in this module when the file is executed as a script.
+if __name__ == '__main__':
+    unittest.main()