Add HiTeA model for VideoQA and Caption (12.30)

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/11201652
This commit is contained in:
yeqinghao.yqh
2022-12-29 08:06:34 +08:00
committed by yingda.chen
parent f58060b140
commit f7a7504782
18 changed files with 1823 additions and 13 deletions

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7c822c66fcf04de28016b224ef372cb1c93b7f13f2cba4e11f53a37fec8e769e
size 828272

View File

@@ -120,6 +120,7 @@ class Models(object):
multi_stage_diffusion = 'multi-stage-diffusion-text-to-image-synthesis'
team = 'team-multi-modal-similarity'
video_clip = 'video-clip-multi-modal-embedding'
hitea = 'hitea'
# science models
unifold = 'unifold'
@@ -322,6 +323,8 @@ class Pipelines(object):
image_text_retrieval = 'image-text-retrieval'
ofa_ocr_recognition = 'ofa-ocr-recognition'
ofa_asr = 'ofa-asr'
video_captioning = 'video-captioning'
video_question_answering = 'video-question-answering'
# science tasks
protein_structure = 'unifold-protein-structure'
@@ -446,6 +449,7 @@ class Preprocessors(object):
ofa_tasks_preprocessor = 'ofa-tasks-preprocessor'
clip_preprocessor = 'clip-preprocessor'
mplug_tasks_preprocessor = 'mplug-tasks-preprocessor'
hitea_tasks_preprocessor = 'hitea-tasks-preprocessor'
# science preprocessor
unifold_preprocessor = 'unifold-preprocessor'

View File

@@ -10,7 +10,7 @@ if TYPE_CHECKING:
from .team import TEAMForMultiModalSimilarity
from .diffusion import DiffusionForTextToImageSynthesis
from .mmr import VideoCLIPForMultiModalEmbedding
from .mplug_for_all_tasks import MPlugForAllTasks
from .mplug_for_all_tasks import MPlugForAllTasks, HiTeAForAllTasks
from .ofa_for_all_tasks import OfaForAllTasks
from .ofa_for_text_to_image_synthesis_model import \
OfaForTextToImageSynthesis
@@ -24,7 +24,7 @@ else:
'gemm': ['GEMMForMultiModalEmbedding'],
'team': ['TEAMForMultiModalSimilarity'],
'mmr': ['VideoCLIPForMultiModalEmbedding'],
'mplug_for_all_tasks': ['MPlugForAllTasks'],
'mplug_for_all_tasks': ['MPlugForAllTasks', 'HiTeAForAllTasks'],
'ofa_for_all_tasks': ['OfaForAllTasks'],
'ofa_for_text_to_image_synthesis_model':
['OfaForTextToImageSynthesis'],

View File

@@ -13,5 +13,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .configuration_mplug import MPlugConfig
from .modeling_mplug import CONFIG_NAME, MPlug
from .configuration_mplug import HiTeAConfig, MPlugConfig
from .modeling_mplug import CONFIG_NAME, HiTeA, MPlug

View File

@@ -114,3 +114,67 @@ class MPlugConfig(PretrainedConfig):
with open(yaml_file, 'r', encoding='utf-8') as reader:
config_dict = yaml.load(reader, Loader=yaml.Loader)
return cls(**config_dict)
class HiTeAConfig(PretrainedConfig):
    """Configuration container for the HiTeA models.

    Every constructor argument is stored unchanged as an attribute of the
    same name; extra keyword arguments are forwarded to ``PretrainedConfig``.
    """

    model_type = 'hitea'

    def __init__(
            self,
            task=Tasks.video_question_answering,
            bert_config='config_bert.json',
            image_res=224,
            num_frames=16,
            batch_size_train=32,
            vision_width=768,
            distill=True,
            batch_size_test=64,
            k_test=128,
            alpha=0.4,
            warm_up=True,
            eos='[SEP]',
            optimizer=None,
            schedular=None,
            min_length=1,
            max_length=10,
            beam_size=5,
            text_encoder='bert-base-uncased',
            text_decoder='bert-base-uncased',
            # retrieval
            queue_size=65536,
            embed_dim=256,
            temp=0.07,
            **kwargs):
        super().__init__(**kwargs)
        self.task = task
        self.bert_config = bert_config
        self.image_res = image_res
        self.num_frames = num_frames
        self.batch_size_train = batch_size_train
        self.vision_width = vision_width
        self.distill = distill
        self.batch_size_test = batch_size_test
        self.k_test = k_test
        self.alpha = alpha
        self.warm_up = warm_up
        self.eos = eos
        self.optimizer = optimizer
        self.schedular = schedular
        self.min_length = min_length
        self.max_length = max_length
        self.beam_size = beam_size
        self.text_encoder = text_encoder
        self.text_decoder = text_decoder
        # retrieval
        self.queue_size = queue_size
        self.embed_dim = embed_dim
        self.temp = temp

    @classmethod
    def from_yaml_file(cls, yaml_file: Union[str,
                                             os.PathLike]) -> 'HiTeAConfig':
        """Build a config from a YAML file of constructor keyword arguments.

        Args:
            yaml_file: path of the YAML file to read.

        Returns:
            A new instance of ``cls``; an empty file yields the defaults.
        """
        # NOTE(review): yaml.Loader can construct arbitrary Python objects.
        # Only load files from trusted model directories, or switch to
        # yaml.SafeLoader once shipped configs are confirmed to be plain
        # mappings.
        with open(yaml_file, 'r', encoding='utf-8') as reader:
            config_dict = yaml.load(reader, Loader=yaml.Loader)
        # An empty YAML document parses to None; fall back to the defaults.
        return cls(**(config_dict or {}))

View File

@@ -40,7 +40,9 @@ from transformers.modeling_utils import (PreTrainedModel,
prune_linear_layer)
from transformers.utils import logging
from modelscope.models.multi_modal.mplug.configuration_mplug import MPlugConfig
from modelscope.models.multi_modal.mplug.configuration_mplug import (
HiTeAConfig, MPlugConfig)
from modelscope.models.multi_modal.mplug.mvit import MViTv2, MViTv2_Base_config
from modelscope.models.multi_modal.mplug.predictor import TextGenerator
from modelscope.utils.constant import ModelFile
@@ -2483,3 +2485,322 @@ class MPlugForImageTextRetrieval(MPlug):
scores = F.softmax(scores, dim=-1)
return scores
class HiTeA(PreTrainedModel):
    """Shared base of the HiTeA task models.

    Creates the tokenizer, the MViTv2 visual encoder, the BERT text encoder
    and the fusion encoder. Subclasses are expected to add ``text_decoder``
    and ``beam_generator`` and to implement ``forward``.
    """
    config_class = HiTeAConfig

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.tokenizer = BertTokenizer.from_pretrained(
            os.path.join(config.model_dir, ModelFile.VOCAB_FILE))
        # Populates config_encoder / config_fusion / config_decoder.
        self.module_setting(config)
        self.visual_encoder = MViTv2(
            img_size=config.image_res,
            config=MViTv2_Base_config,
            num_frames=config.num_frames)
        self.text_encoder = BertModel(
            self.config_encoder, add_pooling_layer=False)
        self.fusion_encoder = FusionModel(
            self.config_fusion, add_pooling_layer=False)

    @classmethod
    def from_pretrained(cls, model_dir, load_checkpoint=True):
        """Instantiate the task-specific subclass described in ``model_dir``.

        Args:
            model_dir: directory holding the YAML config, the vocabulary and
                (optionally) the checkpoint.
            load_checkpoint: when True, weights are loaded from
                ``ModelFile.TORCH_MODEL_BIN_FILE`` with ``strict=False``.

        Returns:
            The model selected by ``config.task``.

        Raises:
            KeyError: if ``config.task`` is not a supported HiTeA task.
        """
        from modelscope.utils.constant import Tasks
        task_mapping = {
            Tasks.video_question_answering: HiTeAForVideoQuestionAnswering,
            Tasks.video_captioning: HiTeAForVideoCaption,
        }
        config = cls.config_class.from_yaml_file(
            os.path.join(model_dir, CONFIG_NAME))
        config.model_dir = model_dir
        model = task_mapping[config.task](config)
        if load_checkpoint:
            checkpoint_path = os.path.join(model_dir,
                                           ModelFile.TORCH_MODEL_BIN_FILE)
            checkpoint = torch.load(checkpoint_path, map_location='cpu')
            # Unwrap the state dict from common training-wrapper layouts.
            if 'model' in checkpoint:
                checkpoint = checkpoint['model']
            if 'module' in checkpoint:
                checkpoint = checkpoint['module']
            # NOTE(review): this removes every occurrence of 'model.' in a
            # key, not only a leading prefix -- confirm no parameter name
            # legitimately contains that substring.
            checkpoint = {
                k.replace('model.', ''): v
                for k, v in checkpoint.items()
            }
            model.load_state_dict(checkpoint, strict=False)
        return model

    def init_distill(self, config):
        """Create the momentum ("_m") copies when ``config.distill`` is set.

        Must be called after ``self.text_decoder`` exists, because the
        decoder is part of ``model_pairs``.
        """
        self.distill = config.distill
        if self.distill:
            self.visual_encoder_m = MViTv2(
                img_size=config.image_res,
                config=MViTv2_Base_config,
                num_frames=config.num_frames)
            self.text_encoder_m = BertModel(
                self.config_encoder, add_pooling_layer=False)
            self.fusion_encoder_m = FusionModel(
                self.config_fusion, add_pooling_layer=False)
            self.text_decoder_m = BertLMHeadModel(self.config_decoder)
            # NOTE(review): fusion_encoder / fusion_encoder_m is not listed
            # here, so fusion_encoder_m is neither initialised from nor
            # momentum-updated towards fusion_encoder -- confirm whether
            # that is intended.
            self.model_pairs = [
                [self.visual_encoder, self.visual_encoder_m],
                [self.text_encoder, self.text_encoder_m],
                [self.text_decoder, self.text_decoder_m],
            ]
            self.copy_params()
            self.momentum = 0.995

    def forward(self, *args, **kwargs):
        """Task-specific subclasses implement the forward pass."""
        raise NotImplementedError

    def module_setting(self, config):
        """Load the BERT config three times, once per text sub-module.

        The encoder and decoder depths are taken from the
        ``text_encoder_layers`` and ``text_decode_layers`` entries of the
        JSON file; the decoder additionally enables cross attention.
        """
        bert_config_path = os.path.join(config.model_dir, config.bert_config)
        self.config_encoder = BertConfig.from_json_file(bert_config_path)
        self.config_encoder.num_hidden_layers = \
            self.config_encoder.text_encoder_layers
        self.config_fusion = BertConfig.from_json_file(bert_config_path)
        self.config_decoder = BertConfig.from_json_file(bert_config_path)
        self.config_decoder.add_cross_attention = True
        self.config_decoder.num_hidden_layers = \
            self.config_decoder.text_decode_layers

    @torch.no_grad()
    def copy_params(self):
        """Copy weights of each online module into its momentum copy."""
        for model_pair in self.model_pairs:
            for param, param_m in zip(model_pair[0].parameters(),
                                      model_pair[1].parameters()):
                param_m.data.copy_(param.data)  # initialize
                param_m.requires_grad = False  # not update by gradient

    @torch.no_grad()
    def _momentum_update(self):
        """Move each momentum copy towards its online module (EMA step)."""
        for model_pair in self.model_pairs:
            for param, param_m in zip(model_pair[0].parameters(),
                                      model_pair[1].parameters()):
                param_m.data = param_m.data * self.momentum + param.data * (
                    1. - self.momentum)

    def generation(self, question_states, question_atts, out_size=1):
        """Run ``self.beam_generator`` on the encoded inputs.

        Returns:
            The ``(topk_ids, topk_scores)`` pair produced by
            ``translate_batch``.
        """
        encoder_inputs = [question_states, question_atts]
        topk_ids, topk_scores = self.beam_generator.translate_batch(
            encoder_inputs, out_size=out_size)
        return topk_ids, topk_scores

    @staticmethod
    def _tile(x, dim, n_tile):
        """Repeat every slice of ``x`` along ``dim`` ``n_tile`` times.

        Repeats of one slice stay adjacent, e.g. ``[a, b]`` with
        ``n_tile=2`` becomes ``[a, a, b, b]``.
        """
        # Equivalent to the former repeat + index_select construction, but
        # without building the index on the host with numpy.
        return torch.repeat_interleave(x, n_tile, dim=dim)
class HiTeAForVideoQuestionAnswering(HiTeA):
    """HiTeA variant that decodes answers with a BERT LM-head decoder."""

    def __init__(self, config):
        super().__init__(config)
        self.text_decoder = BertLMHeadModel(self.config_decoder)
        self.beam_generator = TextGenerator(config, self.text_decoder)
        self.init_distill(config)

    @staticmethod
    def _encode_question(text_encoder, fusion_encoder, question,
                         video_embeds, video_atts):
        """Encode the question, fuse it with the video, concatenate both."""
        text_embeds = text_encoder(
            question.input_ids,
            attention_mask=question.attention_mask,
            return_dict=True).last_hidden_state
        video_output, question_output = fusion_encoder(
            encoder_embeds=text_embeds,
            attention_mask=question.attention_mask,
            encoder_hidden_states=video_embeds,
            encoder_attention_mask=video_atts,
            return_dict=False)
        return torch.cat([video_output, question_output], 1)

    def forward(self,
                video,
                question,
                answer=None,
                alpha=0,
                k=None,
                weights=None,
                train=True):
        """Compute the answer loss (train) or generate answers (eval).

        Args:
            video: video tensor, cast to the dtype of the model parameters.
            question: object exposing ``input_ids`` and ``attention_mask``.
            answer: same structure as ``question``; read only when training.
            alpha: not read by this implementation.
            k: number of answers for each question; defaults to one each.
            weights: weight for each answer; defaults to 1.
            train: selects the loss branch (True) or generation (False).

        Returns:
            The scalar loss when ``train`` is True, otherwise the
            ``(topk_ids, topk_probs)`` pair from ``generation``.
        """
        video = video.to(dtype=next(self.parameters()).dtype)
        video_embeds = self.visual_encoder(video)
        video_atts = torch.ones(
            video_embeds.size()[:-1], dtype=torch.long).to(video.device)

        if not train:
            fused = self._encode_question(self.text_encoder,
                                          self.fusion_encoder, question,
                                          video_embeds, video_atts)
            fused_atts = torch.cat([video_atts, question.attention_mask], 1)
            topk_ids, topk_probs = self.generation(fused, fused_atts)
            return topk_ids, topk_probs

        # Padding positions are excluded from the loss via the -100 label.
        answer_targets = answer.input_ids.masked_fill(
            answer.input_ids == self.tokenizer.pad_token_id, -100)

        fused = self._encode_question(self.text_encoder, self.fusion_encoder,
                                      question, video_embeds, video_atts)
        fused_atts = torch.cat([video_atts, question.attention_mask], 1)

        if k is None:
            k = [1] * fused.shape[0]

        # Repeat sample b once per answer it owns.
        question_states = torch.stack(
            [fused[b] for b, n in enumerate(k) for _ in range(n)], 0)
        question_atts = torch.stack(
            [fused_atts[b] for b, n in enumerate(k) for _ in range(n)], 0)

        decoder_kwargs = dict(
            attention_mask=answer.attention_mask,
            encoder_hidden_states=question_states,
            encoder_attention_mask=question_atts,
            labels=answer_targets,
            return_dict=True,
            reduction='none',
        )

        if self.distill:
            with torch.no_grad():
                self._momentum_update()
                video_embeds_m = self.visual_encoder_m(video)
                fused_m = self._encode_question(self.text_encoder_m,
                                                self.fusion_encoder_m,
                                                question, video_embeds_m,
                                                video_atts)
                question_states_m = torch.stack(
                    [fused_m[b] for b, n in enumerate(k) for _ in range(n)],
                    0)
                logits_m = self.text_decoder_m(
                    answer.input_ids,
                    attention_mask=answer.attention_mask,
                    encoder_hidden_states=question_states_m,
                    encoder_attention_mask=question_atts,
                    return_logits=True,
                )
            # The momentum decoder's distribution serves as soft labels.
            decoder_kwargs['soft_labels'] = F.softmax(logits_m, dim=-1)

        answer_output = self.text_decoder(answer.input_ids, **decoder_kwargs)

        if weights is None:
            weights = 1
        weighted = weights * answer_output.loss
        return weighted.sum() / video.size(0)
class HiTeAForVideoCaption(HiTeA):
    """HiTeA variant that generates captions with a BERT prefix decoder."""

    def __init__(self, config):
        super().__init__(config)
        self.text_decoder = BertPrefixModel(self.config_decoder)
        self.beam_generator = TextGenerator(config, self.text_decoder)

    def beam_search(self,
                    video,
                    question,
                    answer=None,
                    train=True,
                    out_size=5):
        """Generate ``out_size`` candidates from fused video/text features.

        Args:
            video: video tensor, cast to the dtype of the model parameters.
            question: object exposing ``input_ids`` and ``attention_mask``.
            answer: accepted for signature compatibility; not read.
            train: accepted for signature compatibility; not read.
            out_size: forwarded to ``generation``.

        Returns:
            The ``(topk_ids, topk_probs)`` pair from ``generation``.
        """
        # forward() delegates here before doing its own cast, so cast here
        # too; otherwise half-precision models receive a mismatched input.
        video = video.to(dtype=next(self.parameters()).dtype)
        video_embeds = self.visual_encoder(video)
        video_atts = torch.ones(
            video_embeds.size()[:-1], dtype=torch.long).to(video.device)
        text_output = self.text_encoder(
            question.input_ids,
            attention_mask=question.attention_mask,
            return_dict=True)
        text_embeds = text_output.last_hidden_state
        fusion_output = self.fusion_encoder(
            encoder_embeds=text_embeds,
            attention_mask=question.attention_mask,
            encoder_hidden_states=video_embeds,
            encoder_attention_mask=video_atts,
            return_dict=False)
        video_output, question_output = fusion_output
        question_output = torch.cat([video_output, question_output], 1)
        merge_text_attention = torch.cat(
            [video_atts, question.attention_mask], 1)
        topk_ids, topk_probs = self.generation(
            question_output, merge_text_attention, out_size=out_size)
        return topk_ids, topk_probs

    def forward(self,
                video,
                question,
                answer=None,
                train=True,
                out_size=5,
                scst=False):
        """Compute the caption loss (train) or generate a caption (eval).

        Args:
            video: video tensor.
            question: only read on the ``scst`` path, where it is passed to
                ``beam_search``.
            answer: object exposing ``input_ids`` and ``attention_mask``;
                read only when training.
            train: selects the loss branch (True) or generation (False).
            out_size: number of candidates on the ``scst`` path.
            scst: when True, delegate to ``beam_search`` and return its
                result.

        Returns:
            The un-reduced decoder loss when training, otherwise the
            ``(topk_ids, topk_probs)`` pair.
        """
        if scst:
            return self.beam_search(
                video, question, answer, train=True, out_size=out_size)
        video = video.to(dtype=next(self.parameters()).dtype)
        video_embeds = self.visual_encoder(video)
        video_atts = torch.ones(
            video_embeds.size()[:-1], dtype=torch.long).to(video.device)
        if train:
            # Padding positions are excluded from the loss via -100.
            answer_targets = answer.input_ids.masked_fill(
                answer.input_ids == self.tokenizer.pad_token_id, -100)
            answer_output = self.text_decoder(
                answer.input_ids,
                attention_mask=answer.attention_mask,
                encoder_hidden_states=video_embeds,
                encoder_attention_mask=video_atts,
                labels=answer_targets,
                return_dict=True,
                reduction='none')
            loss = answer_output.loss
            return loss
        else:
            topk_ids, topk_probs = self.generation(video_embeds, video_atts)
            return topk_ids, topk_probs

File diff suppressed because it is too large Load Diff

View File

@@ -11,7 +11,7 @@ from modelscope.outputs import OutputKeys
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
__all__ = ['MPlugForAllTasks']
__all__ = ['MPlugForAllTasks', 'HiTeAForAllTasks']
@MODELS.register_module(
@@ -81,3 +81,69 @@ class MPlugForAllTasks(TorchModel):
# evaluate
topk_ids, _ = output
return {'sequences': [list_tensor[0] for list_tensor in topk_ids]}
@MODELS.register_module(
    Tasks.video_question_answering, module_name=Models.hitea)
@MODELS.register_module(Tasks.video_captioning, module_name=Models.hitea)
class HiTeAForAllTasks(TorchModel):
    """ModelScope wrapper around the task-specific HiTeA models."""

    def __init__(self, model_dir: str, *args, **kwargs):
        """initialize the hitea model from the `model_dir` path.

        Args:
            model_dir (str): the model path.
        """
        super().__init__(model_dir, *args, **kwargs)
        from modelscope.models.multi_modal.mplug import HiTeA
        self.model = HiTeA.from_pretrained(model_dir)
        self.tokenizer = self.model.tokenizer
        # Task name from the configuration file, read lazily on the first
        # inference call instead of on every forward pass.
        self._task = None

    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
        """return the result by the model

        Args:
            input (Dict[str, Tensor]): the preprocessed data

        Returns:
            Dict[str, Tensor]: results, one of
                inference: ``{OutputKeys.CAPTION: str}`` for video
                    captioning, ``{OutputKeys.TEXT: str}`` otherwise
                training: ``{OutputKeys.LOSS: loss}``
                evaluation: ``{'sequences': [...]}``
        """
        # inference
        if not self.training and 'question' in input:
            output = self.model(input['video'], input['question'], train=False)
            topk_ids, _ = output
            pred_string: str = \
                self.tokenizer.decode(topk_ids[0][0], skip_special_tokens=True)
            if self._task is None:
                # get task from config file
                self._task = Config.from_file(
                    osp.join(self.model_dir, ModelFile.CONFIGURATION)).task
            output_key = OutputKeys.CAPTION \
                if self._task == Tasks.video_captioning else OutputKeys.TEXT
            return {output_key: pred_string}

        # train and evaluate
        import addict
        video = input['video']
        answer = addict.Dict(
            input_ids=input['answer_input_ids'],
            attention_mask=input['answer_attention_mask'])
        if 'index' not in input:
            question = addict.Dict(
                input_ids=input['question_input_ids'],
                attention_mask=input['question_attention_mask'])
            output = self.model(video, question, answer, train=self.training)
        else:
            # NOTE(review): this passes `answer` in the question slot and
            # `index` in the answer slot of the HiTeA forward signatures;
            # it looks inherited from a retrieval-style model -- confirm it
            # is reachable and intended.
            index = input['index']
            output = self.model(video, answer, index, train=self.training)
        if self.training:
            return {OutputKeys.LOSS: output}

        # evaluate
        topk_ids, _ = output
        return {'sequences': [list_tensor[0] for list_tensor in topk_ids]}

View File

@@ -711,6 +711,12 @@ TASK_OUTPUTS = {
# "caption": "this is an image caption text."
# }
Tasks.image_captioning: [OutputKeys.CAPTION],
# video caption result for single sample
# {
# "caption": "this is a video caption text."
# }
Tasks.video_captioning: [OutputKeys.CAPTION],
Tasks.ocr_recognition: [OutputKeys.TEXT],
# visual grounding result for single sample
@@ -769,6 +775,10 @@ TASK_OUTPUTS = {
# {"text": "this is a text answer. "}
Tasks.visual_question_answering: [OutputKeys.TEXT],
# VideoQA result for a sample
# {"text": "this is a text answer. "}
Tasks.video_question_answering: [OutputKeys.TEXT],
# auto_speech_recognition result for a single sample
# {
# "text": "每天都要快乐喔"

View File

@@ -1,5 +1,6 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import cv2
import numpy as np
from PIL import Image
@@ -222,6 +223,9 @@ TASK_INPUTS = {
Tasks.image_captioning: [InputType.IMAGE, {
'image': InputType.IMAGE,
}],
Tasks.video_captioning: [InputType.VIDEO, {
'video': InputType.VIDEO,
}],
Tasks.visual_grounding: {
'image': InputType.IMAGE,
'text': InputType.TEXT
@@ -245,6 +249,10 @@ TASK_INPUTS = {
'image': InputType.IMAGE,
'text': InputType.TEXT
},
Tasks.video_question_answering: {
'video': InputType.VIDEO,
'text': InputType.TEXT
},
Tasks.visual_entailment: {
'image': InputType.IMAGE,
'text': InputType.TEXT,

View File

@@ -80,6 +80,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
'damo/nlp_bart_text-error-correction_chinese'),
Tasks.image_captioning: (Pipelines.image_captioning,
'damo/ofa_image-caption_coco_large_en'),
Tasks.video_captioning:
(Pipelines.video_captioning,
'damo/multi-modal_hitea_video-captioning_base_en'),
Tasks.image_portrait_stylization:
(Pipelines.person_image_cartoon,
'damo/cv_unet_person-image-cartoon_compound-models'),
@@ -114,6 +117,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
Tasks.visual_question_answering:
(Pipelines.visual_question_answering,
'damo/mplug_visual-question-answering_coco_large_en'),
Tasks.video_question_answering:
(Pipelines.video_question_answering,
'damo/multi-modal_hitea_video-question-answering_base_en'),
Tasks.video_embedding: (Pipelines.cmdssl_video_embedding,
'damo/cv_r2p1d_video_embedding'),
Tasks.text_to_image_synthesis:

View File

@@ -14,7 +14,8 @@ if TYPE_CHECKING:
VideoMultiModalEmbeddingPipeline
from .visual_question_answering_pipeline import VisualQuestionAnsweringPipeline
from .asr_pipeline import AutomaticSpeechRecognitionPipeline
from .video_captioning_pipeline import VideoCaptioningPipeline
from .video_question_answering_pipeline import VideoQuestionAnsweringPipeline
else:
_import_structure = {
'image_captioning_pipeline': ['ImageCaptioningPipeline'],
@@ -29,6 +30,9 @@ else:
'generative_multi_modal_embedding_pipeline':
['GEMMMultiModalEmbeddingPipeline'],
'asr_pipeline': ['AutomaticSpeechRecognitionPipeline'],
'video_captioning_pipeline': ['VideoCaptioningPipeline'],
'video_question_answering_pipeline':
['VideoQuestionAnsweringPipeline']
}
import sys

View File

@@ -0,0 +1,56 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import Any, Dict, Optional, Union
import torch
from modelscope.metainfo import Pipelines
from modelscope.models.multi_modal import HiTeAForAllTasks
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Model, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import HiTeAPreprocessor, Preprocessor
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
logger = get_logger()
@PIPELINES.register_module(
    Tasks.video_captioning, module_name=Pipelines.video_captioning)
class VideoCaptioningPipeline(Pipeline):
    """Pipeline that produces a caption for a video."""

    def __init__(self,
                 model: Union[Model, str],
                 preprocessor: Optional[Preprocessor] = None,
                 **kwargs):
        """
        use `model` and `preprocessor` to create a video captioning pipeline for prediction

        Args:
            model: a model instance or a model id on modelscope hub.
            preprocessor: optional preprocessor; when omitted and the model
                is a `HiTeAForAllTasks`, a `HiTeAPreprocessor` is created
                from the model directory.
        """
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
        self.model.eval()
        if preprocessor is None:
            if isinstance(self.model, HiTeAForAllTasks):
                self.preprocessor = HiTeAPreprocessor(self.model.model_dir)

    def _batch(self, data):
        """Merge per-sample preprocessor outputs into one batch.

        Args:
            data: list of per-sample dicts with ``'video'`` and
                ``'question'`` entries.

        Returns:
            A dict with the concatenated ``'video'`` tensor and a
            ``BatchEncoding`` under ``'question'``.
        """
        if isinstance(self.model, HiTeAForAllTasks):
            from transformers.tokenization_utils_base import BatchEncoding
            batch_data = {}
            # The inference output of HiTeAPreprocessor carries no 'train'
            # entry, so forward it only when a sample provides one instead
            # of failing with KeyError.
            if 'train' in data[0]:
                batch_data['train'] = data[0]['train']
            batch_data['video'] = torch.cat([d['video'] for d in data])
            question = {}
            for k in data[0]['question'].keys():
                question[k] = torch.cat([d['question'][k] for d in data])
            batch_data['question'] = BatchEncoding(question)
            return batch_data
        else:
            return super()._collate_batch(data)

    def forward(self, inputs: Dict[str, Any],
                **forward_params) -> Dict[str, Any]:
        """Run the base-class forward with gradients disabled."""
        with torch.no_grad():
            return super().forward(inputs, **forward_params)

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Return the model output unchanged."""
        return inputs

View File

@@ -0,0 +1,54 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import Any, Dict, Optional, Union
import torch
from modelscope.metainfo import Pipelines
from modelscope.models import Model
from modelscope.models.multi_modal import HiTeAForAllTasks
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Pipeline, Tensor
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import HiTeAPreprocessor, Preprocessor
from modelscope.utils.constant import Tasks
__all__ = ['VideoQuestionAnsweringPipeline']
@PIPELINES.register_module(
    Tasks.video_question_answering,
    module_name=Pipelines.video_question_answering)
class VideoQuestionAnsweringPipeline(Pipeline):
    """Pipeline that answers a text question about a video."""

    def __init__(self,
                 model: Union[Model, str],
                 preprocessor: Optional[Preprocessor] = None,
                 **kwargs):
        """Create a video question answering pipeline for prediction.

        Args:
            model: a model instance or a model id.
            preprocessor: optional preprocessor instance; when omitted and
                the model is a `HiTeAForAllTasks`, a `HiTeAPreprocessor` is
                built from the model directory.
        """
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
        needs_default = preprocessor is None and isinstance(
            self.model, HiTeAForAllTasks)
        if needs_default:
            self.preprocessor = HiTeAPreprocessor(self.model.model_dir)
        self.model.eval()

    def forward(self, inputs: Dict[str, Any],
                **forward_params) -> Dict[str, Any]:
        """Delegate to the base-class forward with gradients disabled."""
        with torch.no_grad():
            result = super().forward(inputs, **forward_params)
        return result

    def postprocess(self, inputs: Dict[str, Tensor],
                    **postprocess_params) -> Dict[str, str]:
        """Pass the prediction results through unchanged.

        Args:
            inputs (Dict[str, Any]): the output of ``forward``.

        Returns:
            Dict[str, str]: the prediction results
        """
        return inputs

View File

@@ -15,7 +15,8 @@ if TYPE_CHECKING:
ImageDenoisePreprocessor)
from .kws import WavToLists
from .tts import KanttsDataPreprocessor
from .multi_modal import (OfaPreprocessor, MPlugPreprocessor)
from .multi_modal import (OfaPreprocessor, MPlugPreprocessor,
HiTeAPreprocessor)
from .nlp import (
DocumentSegmentationTransformersPreprocessor,
FaqQuestionAnsweringTransformersPreprocessor,
@@ -52,7 +53,8 @@ else:
],
'kws': ['WavToLists'],
'tts': ['KanttsDataPreprocessor'],
'multi_modal': ['OfaPreprocessor', 'MPlugPreprocessor'],
'multi_modal':
['OfaPreprocessor', 'MPlugPreprocessor', 'HiTeAPreprocessor'],
'nlp': [
'DocumentSegmentationTransformersPreprocessor',
'FaqQuestionAnsweringTransformersPreprocessor',

View File

@@ -3,7 +3,9 @@ import os.path as osp
from io import BytesIO
from typing import Any, Dict, List, Tuple, Union
import decord
import json
import numpy as np
import torch
from PIL import Image
from timm.data import create_transform
@@ -12,6 +14,8 @@ from torchvision.transforms import Compose, Normalize, Resize, ToTensor
from modelscope.hub.snapshot_download import snapshot_download
from modelscope.metainfo import Preprocessors
from modelscope.pipelines.base import Input
from modelscope.pipelines.cv.cmdssl_video_embedding_pipeline import (
VCenterCrop, VCompose, VNormalize, VRescale, VToTensor)
from modelscope.preprocessors import load_image
from modelscope.utils.config import Config
from modelscope.utils.constant import (Fields, Invoke, ModeKeys, ModelFile,
@@ -22,10 +26,7 @@ from .ofa import * # noqa
from .ofa.utils.collate import collate_fn
from .ofa.utils.constant import OFA_TASK_KEY_MAPPING
__all__ = [
'OfaPreprocessor',
'MPlugPreprocessor',
]
__all__ = ['OfaPreprocessor', 'MPlugPreprocessor', 'HiTeAPreprocessor']
@PREPROCESSORS.register_module(
@@ -387,3 +388,141 @@ class MPlugPreprocessor(Preprocessor):
if self.cfg.task == Tasks.image_text_retrieval:
output['index'] = index
return output
@PREPROCESSORS.register_module(
    Fields.multi_modal, module_name=Preprocessors.hitea_tasks_preprocessor)
class HiTeAPreprocessor(Preprocessor):
    """Turn a video (and optional question/answer text) into model inputs."""

    def __init__(self,
                 model_dir: str,
                 mode: str = ModeKeys.INFERENCE,
                 tokenizer_max_length: int = 25,
                 *args,
                 **kwargs):
        """
        Args:
            model_dir: directory holding the tokenizer and config files.
            mode: `ModeKeys.INFERENCE` returns batched inference inputs;
                any other value also tokenizes ``data['answer']``.
            tokenizer_max_length: pad/truncate length for tokenized text.
        """
        super().__init__(*args, **kwargs)
        self.model_dir = model_dir
        self.mode = mode
        self.tokenizer_max_length = tokenizer_max_length
        # Lazily created helpers, see the properties below.
        self._tokenizer = None
        self._patch_resize_transform = None
        self._num_frames = None
        # path -> (VideoReader, insertion index); entries are never evicted.
        self._video_map = {}
        # Parsed configuration file, loaded on the first call.
        self._task_cfg = None

    @property
    def tokenizer(self):
        """BERT tokenizer loaded from ``model_dir`` on first access."""
        from transformers import BertTokenizer

        if self._tokenizer is None:
            self._tokenizer = BertTokenizer.from_pretrained(self.model_dir)
        return self._tokenizer

    @property
    def patch_resize_transform(self):
        """Per-frame resize / to-tensor / normalize transform (cached)."""
        if self._patch_resize_transform is None:
            from torchvision import transforms
            from modelscope.models.multi_modal.mplug import CONFIG_NAME, HiTeAConfig

            config = HiTeAConfig.from_yaml_file(
                osp.join(self.model_dir, CONFIG_NAME))

            mean = (0.48145466, 0.4578275, 0.40821073)
            std = (0.26862954, 0.26130258, 0.27577711)

            self._patch_resize_transform = transforms.Compose([
                transforms.Resize((config.image_res, config.image_res),
                                  interpolation=Image.BICUBIC),
                transforms.ToTensor(),
                transforms.Normalize(mean=mean, std=std),
            ])
        return self._patch_resize_transform

    @property
    def num_frames(self):
        """Number of frames per video, read from the model config (cached)."""
        if self._num_frames is None:
            from modelscope.models.multi_modal.mplug import CONFIG_NAME, HiTeAConfig

            config = HiTeAConfig.from_yaml_file(
                osp.join(self.model_dir, CONFIG_NAME))

            self._num_frames = config.num_frames
        return self._num_frames

    def video_open(self, path: str) -> Tuple[decord.VideoReader, int]:
        """Open ``path`` with decord, reusing a previously opened reader.

        Returns:
            The reader and the index at which it was first registered.
        """
        if path not in self._video_map:
            index = len(self._video_map)
            vr = decord.VideoReader(path, ctx=decord.cpu(0))
            self._video_map[path] = (vr, index)
        return self._video_map[path]

    def sample_frames(self, num_frames: int, vlen: int) -> List[int]:
        """Pick ``num_frames`` frame indices spread over ``vlen`` frames.

        The video is split into equal intervals and the middle frame of each
        is taken; when the video is shorter than ``num_frames`` the last
        sampled index is repeated.

        Raises:
            ValueError: if frames are requested from an empty video.
        """
        acc_samples = min(num_frames, vlen)
        # split the video into `acc_samples` intervals, and sample from each interval.
        intervals = np.linspace(
            start=0, stop=vlen, num=acc_samples + 1).astype(int)
        ranges = []
        for idx, interv in enumerate(intervals[:-1]):
            ranges.append((interv, intervals[idx + 1] - 1))

        frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
        if len(frame_indices) < num_frames:  # padded with last frame
            if not frame_indices:
                # Nothing to repeat: fail with a clear message rather than
                # an IndexError on frame_indices[-1].
                raise ValueError(
                    'cannot sample frames from an empty video')
            padded_frame_indices = [frame_indices[-1]] * num_frames
            padded_frame_indices[:len(frame_indices)] = frame_indices
            frame_indices = padded_frame_indices
        return frame_indices

    def __call__(
            self, data: Union[decord.VideoReader, tuple,
                              Dict[str, Any]]) -> Dict[str, Any]:
        """Preprocess one sample.

        Args:
            data: a video (path or `decord.VideoReader`), a tuple whose
                first item is the video and second the question, or a dict
                with ``'video'`` and ``'text'``/``'question'`` (plus
                ``'answer'`` outside inference mode).

        Returns:
            In inference mode ``{'video', 'question'}`` with a leading batch
            dimension on the video; otherwise the video plus squeezed
            question/answer ``input_ids`` and ``attention_mask`` tensors.
        """
        # Parse the configuration file once and reuse it on later calls.
        if self._task_cfg is None:
            self._task_cfg = Config.from_file(
                osp.join(self.model_dir, ModelFile.CONFIGURATION))
        self.cfg = self._task_cfg

        if isinstance(data, (decord.VideoReader, str)):
            video = data
        elif isinstance(data, tuple):
            video = data[0]
        else:
            video = data['video']
        if isinstance(video, str):
            video, _ = self.video_open(video)
        frame_indices = self.sample_frames(self.num_frames, len(video))
        # Cached readers may have been advanced by an earlier call.
        video.seek(0)
        frames = video.get_batch(frame_indices).asnumpy()
        video = torch.stack(
            [self.patch_resize_transform(Image.fromarray(f)) for f in frames],
            dim=0)

        if self.cfg.task == Tasks.video_captioning:
            question = ''
        elif isinstance(data, tuple):
            question = data[1]
        else:
            question = data['text' if 'text' in data else 'question']
        question = self.tokenizer(
            question.lower(),
            padding='max_length',
            truncation=True,
            max_length=self.tokenizer_max_length,
            return_tensors='pt')

        if self.mode == ModeKeys.INFERENCE:
            video = torch.stack([video], dim=0)
            return {'video': video, 'question': question}
        else:
            answer = data['answer']
            answer = self.tokenizer(
                answer,
                padding='max_length',
                truncation=True,
                max_length=self.tokenizer_max_length,
                return_tensors='pt')
            output = {
                'video': video,
                'question_input_ids': question.input_ids.squeeze(),
                'question_attention_mask': question.attention_mask.squeeze(),
                'answer_input_ids': answer.input_ids.squeeze(),
                'answer_attention_mask': answer.attention_mask.squeeze(),
            }
            return output

View File

@@ -166,6 +166,8 @@ class MultiModalTasks(object):
visual_entailment = 'visual-entailment'
video_multi_modal_embedding = 'video-multi-modal-embedding'
image_text_retrieval = 'image-text-retrieval'
video_captioning = 'video-captioning'
video_question_answering = 'video-question-answering'
class ScienceTasks(object):

View File

@@ -0,0 +1,64 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import unittest
from modelscope.models import Model
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.demo_utils import DemoCompatibilityCheck
from modelscope.utils.test_utils import test_level
class HiTeATasksTest(unittest.TestCase, DemoCompatibilityCheck):
    """Smoke tests for the HiTeA video captioning and video QA pipelines."""

    # Shared fixtures for the tests below.
    _CAPTION_MODEL_ID = 'damo/multi-modal_hitea_video-captioning_base_en'
    _VQA_MODEL_ID = 'damo/multi-modal_hitea_video-question-answering_base_en'
    _TEST_VIDEO = 'data/test/videos/video_caption_and_qa_test.mp4'

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_with_video_captioning_with_model(self):
        # Build the pipeline from an already loaded model instance.
        loaded_model = Model.from_pretrained(self._CAPTION_MODEL_ID)
        captioner = pipeline(
            task=Tasks.video_captioning,
            model=loaded_model,
        )
        caption_result = captioner(self._TEST_VIDEO)
        print(caption_result[OutputKeys.CAPTION])

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_video_captioning_with_name(self):
        # Build the pipeline from the model id only.
        captioner = pipeline(
            Tasks.video_captioning,
            model=self._CAPTION_MODEL_ID,
        )
        caption_result = captioner(self._TEST_VIDEO)
        print(caption_result[OutputKeys.CAPTION])

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_with_video_question_answering_with_model(self):
        loaded_model = Model.from_pretrained(self._VQA_MODEL_ID)
        answerer = pipeline(Tasks.video_question_answering, model=loaded_model)
        vqa_input = {
            'video': self._TEST_VIDEO,
            'text': 'How many people are there?',
        }
        print(answerer(vqa_input))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_video_question_answering_with_name(self):
        answerer = pipeline(
            Tasks.video_question_answering, model=self._VQA_MODEL_ID)
        vqa_input = {
            'video': self._TEST_VIDEO,
            'text': 'Who teaches a girl how to paint eggs?',
        }
        print(answerer(vqa_input))

    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
    def test_demo_compatibility(self):
        self.compatibility_check()


if __name__ == '__main__':
    unittest.main()