mirror of
https://github.com/modelscope/modelscope.git
synced 2026-05-18 05:05:00 +02:00
Add HiTeA model for VideoQA and Caption (12.30)
Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/11201652
This commit is contained in:
committed by
yingda.chen
parent
f58060b140
commit
f7a7504782
3
data/test/videos/video_caption_and_qa_test.mp4
Normal file
3
data/test/videos/video_caption_and_qa_test.mp4
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:7c822c66fcf04de28016b224ef372cb1c93b7f13f2cba4e11f53a37fec8e769e
|
||||
size 828272
|
||||
@@ -120,6 +120,7 @@ class Models(object):
|
||||
multi_stage_diffusion = 'multi-stage-diffusion-text-to-image-synthesis'
|
||||
team = 'team-multi-modal-similarity'
|
||||
video_clip = 'video-clip-multi-modal-embedding'
|
||||
hitea = 'hitea'
|
||||
|
||||
# science models
|
||||
unifold = 'unifold'
|
||||
@@ -322,6 +323,8 @@ class Pipelines(object):
|
||||
image_text_retrieval = 'image-text-retrieval'
|
||||
ofa_ocr_recognition = 'ofa-ocr-recognition'
|
||||
ofa_asr = 'ofa-asr'
|
||||
video_captioning = 'video-captioning'
|
||||
video_question_answering = 'video-question-answering'
|
||||
|
||||
# science tasks
|
||||
protein_structure = 'unifold-protein-structure'
|
||||
@@ -446,6 +449,7 @@ class Preprocessors(object):
|
||||
ofa_tasks_preprocessor = 'ofa-tasks-preprocessor'
|
||||
clip_preprocessor = 'clip-preprocessor'
|
||||
mplug_tasks_preprocessor = 'mplug-tasks-preprocessor'
|
||||
hitea_tasks_preprocessor = 'hitea-tasks-preprocessor'
|
||||
|
||||
# science preprocessor
|
||||
unifold_preprocessor = 'unifold-preprocessor'
|
||||
|
||||
@@ -10,7 +10,7 @@ if TYPE_CHECKING:
|
||||
from .team import TEAMForMultiModalSimilarity
|
||||
from .diffusion import DiffusionForTextToImageSynthesis
|
||||
from .mmr import VideoCLIPForMultiModalEmbedding
|
||||
from .mplug_for_all_tasks import MPlugForAllTasks
|
||||
from .mplug_for_all_tasks import MPlugForAllTasks, HiTeAForAllTasks
|
||||
from .ofa_for_all_tasks import OfaForAllTasks
|
||||
from .ofa_for_text_to_image_synthesis_model import \
|
||||
OfaForTextToImageSynthesis
|
||||
@@ -24,7 +24,7 @@ else:
|
||||
'gemm': ['GEMMForMultiModalEmbedding'],
|
||||
'team': ['TEAMForMultiModalSimilarity'],
|
||||
'mmr': ['VideoCLIPForMultiModalEmbedding'],
|
||||
'mplug_for_all_tasks': ['MPlugForAllTasks'],
|
||||
'mplug_for_all_tasks': ['MPlugForAllTasks', 'HiTeAForAllTasks'],
|
||||
'ofa_for_all_tasks': ['OfaForAllTasks'],
|
||||
'ofa_for_text_to_image_synthesis_model':
|
||||
['OfaForTextToImageSynthesis'],
|
||||
|
||||
@@ -13,5 +13,5 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .configuration_mplug import MPlugConfig
|
||||
from .modeling_mplug import CONFIG_NAME, MPlug
|
||||
from .configuration_mplug import HiTeAConfig, MPlugConfig
|
||||
from .modeling_mplug import CONFIG_NAME, HiTeA, MPlug
|
||||
|
||||
@@ -114,3 +114,67 @@ class MPlugConfig(PretrainedConfig):
|
||||
with open(yaml_file, 'r', encoding='utf-8') as reader:
|
||||
config_dict = yaml.load(reader, Loader=yaml.Loader)
|
||||
return cls(**config_dict)
|
||||
|
||||
|
||||
class HiTeAConfig(PretrainedConfig):
    """Hyper-parameter container for the HiTeA video-language models.

    Every named constructor argument is stored verbatim as an attribute of
    the same name; any remaining keyword arguments are forwarded to
    ``PretrainedConfig``.
    """

    model_type = 'hitea'

    def __init__(
            self,
            task=Tasks.video_question_answering,
            bert_config='config_bert.json',
            image_res=224,
            num_frames=16,
            batch_size_train=32,
            vision_width=768,
            distill=True,
            batch_size_test=64,
            k_test=128,
            alpha=0.4,
            warm_up=True,
            eos='[SEP]',
            optimizer=None,
            schedular=None,
            min_length=1,
            max_length=10,
            beam_size=5,
            text_encoder='bert-base-uncased',
            text_decoder='bert-base-uncased',
            # retrieval
            queue_size=65536,
            embed_dim=256,
            temp=0.07,
            **kwargs):
        """Store the given hyper-parameters on the config instance.

        Args:
            task: task identifier; defaults to video question answering.
            bert_config: file name of the BERT JSON config.
            image_res: input resolution.
            num_frames: number of frames per video.
            distill: whether momentum-distillation modules are used.
            **kwargs: forwarded unchanged to ``PretrainedConfig.__init__``.

        The remaining arguments are plain values kept as attributes; their
        consumers are not visible in this class.
        """
        super().__init__(**kwargs)
        self.task = task
        self.bert_config = bert_config
        self.image_res = image_res
        self.num_frames = num_frames
        self.batch_size_train = batch_size_train
        self.vision_width = vision_width
        self.distill = distill
        self.batch_size_test = batch_size_test
        self.k_test = k_test
        self.alpha = alpha
        self.warm_up = warm_up
        self.eos = eos
        # NOTE: ``schedular`` (sic) is kept as spelled because it is a
        # public keyword / attribute name that config files may use.
        self.optimizer = optimizer
        self.schedular = schedular
        self.min_length = min_length
        self.max_length = max_length
        self.beam_size = beam_size
        self.text_encoder = text_encoder
        self.text_decoder = text_decoder
        # retrieval
        self.queue_size = queue_size
        self.embed_dim = embed_dim
        self.temp = temp

    @classmethod
    def from_yaml_file(
            cls, yaml_file: Union[str, os.PathLike]) -> 'HiTeAConfig':
        """Build a config from a YAML file of constructor keyword arguments.

        Args:
            yaml_file: path of the YAML file to read (opened as UTF-8).

        Returns:
            A new config instance. An empty YAML document yields a config
            with all default values.
        """
        with open(yaml_file, 'r', encoding='utf-8') as reader:
            # NOTE(security): ``yaml.Loader`` can construct arbitrary Python
            # objects. Only load config files from trusted model
            # directories; consider ``yaml.safe_load`` if the configs never
            # rely on Python-specific tags.
            config_dict = yaml.load(reader, Loader=yaml.Loader)
        # An empty document parses to ``None``, which cannot be unpacked.
        return cls(**(config_dict or {}))
|
||||
|
||||
@@ -40,7 +40,9 @@ from transformers.modeling_utils import (PreTrainedModel,
|
||||
prune_linear_layer)
|
||||
from transformers.utils import logging
|
||||
|
||||
from modelscope.models.multi_modal.mplug.configuration_mplug import MPlugConfig
|
||||
from modelscope.models.multi_modal.mplug.configuration_mplug import (
|
||||
HiTeAConfig, MPlugConfig)
|
||||
from modelscope.models.multi_modal.mplug.mvit import MViTv2, MViTv2_Base_config
|
||||
from modelscope.models.multi_modal.mplug.predictor import TextGenerator
|
||||
from modelscope.utils.constant import ModelFile
|
||||
|
||||
@@ -2483,3 +2485,322 @@ class MPlugForImageTextRetrieval(MPlug):
|
||||
scores = F.softmax(scores, dim=-1)
|
||||
|
||||
return scores
|
||||
|
||||
|
||||
class HiTeA(PreTrainedModel):
    """Base class shared by the task-specific HiTeA models.

    Holds the tokenizer, the MViTv2 visual encoder, the BERT text encoder
    and the fusion encoder. Subclasses add a text decoder and a
    ``beam_generator`` and implement ``forward``.
    """

    config_class = HiTeAConfig

    def __init__(self, config):
        """Build the tokenizer and the shared encoders.

        Args:
            config: a config exposing ``model_dir``, ``bert_config``,
                ``image_res`` and ``num_frames``.
        """
        super().__init__(config)
        self.config = config
        self.tokenizer = BertTokenizer.from_pretrained(
            os.path.join(config.model_dir, ModelFile.VOCAB_FILE))
        # Creates config_encoder / config_fusion / config_decoder used below.
        self.module_setting(config)
        self.visual_encoder = MViTv2(
            img_size=config.image_res,
            config=MViTv2_Base_config,
            num_frames=config.num_frames)
        self.text_encoder = BertModel(
            self.config_encoder, add_pooling_layer=False)
        self.fusion_encoder = FusionModel(
            self.config_fusion, add_pooling_layer=False)

    @classmethod
    def from_pretrained(cls, model_dir, load_checkpoint=True):
        """Instantiate the task-specific model described by ``model_dir``.

        Args:
            model_dir: directory holding the YAML config and, when
                ``load_checkpoint`` is true, the torch checkpoint file.
            load_checkpoint: whether to load weights after construction.

        Returns:
            The model instance selected by ``config.task``.

        Raises:
            KeyError: if ``config.task`` is not a supported task.
        """
        from modelscope.utils.constant import Tasks

        task_mapping = {
            Tasks.video_question_answering: HiTeAForVideoQuestionAnswering,
            Tasks.video_captioning: HiTeAForVideoCaption,
        }
        config = cls.config_class.from_yaml_file(
            os.path.join(model_dir, CONFIG_NAME))
        config.model_dir = model_dir
        try:
            model_cls = task_mapping[config.task]
        except KeyError:
            # Same exception type as a plain lookup, but with a usable hint.
            raise KeyError(
                f'Unsupported HiTeA task {config.task!r}; expected one of '
                f'{list(task_mapping)}') from None
        model = model_cls(config)
        if load_checkpoint:
            checkpoint_path = os.path.join(model_dir,
                                           ModelFile.TORCH_MODEL_BIN_FILE)
            # NOTE(security): torch.load unpickles the file; only load
            # checkpoints from trusted model directories.
            checkpoint = torch.load(checkpoint_path, map_location='cpu')
            # Unwrap the nesting some training wrappers add.
            if 'model' in checkpoint:
                checkpoint = checkpoint['model']
            if 'module' in checkpoint:
                checkpoint = checkpoint['module']
            # NOTE(review): this strips every 'model.' occurrence in a key,
            # not only a leading prefix - presumably intended for wrapper
            # prefixes; confirm against the released checkpoints.
            checkpoint = {
                k.replace('model.', ''): v
                for k, v in checkpoint.items()
            }

            # strict=False: missing / unexpected keys are tolerated.
            model.load_state_dict(checkpoint, strict=False)
        return model

    def init_distill(self, config):
        """Create the momentum copies of the modules when distilling.

        Expects ``self.text_decoder`` to exist already; it is paired with
        its momentum copy below.
        """
        self.distill = config.distill
        if self.distill:
            self.visual_encoder_m = MViTv2(
                img_size=config.image_res,
                config=MViTv2_Base_config,
                num_frames=config.num_frames)
            self.text_encoder_m = BertModel(
                self.config_encoder, add_pooling_layer=False)
            self.fusion_encoder_m = FusionModel(
                self.config_fusion, add_pooling_layer=False)
            self.text_decoder_m = BertLMHeadModel(self.config_decoder)
            # NOTE(review): fusion_encoder / fusion_encoder_m is not listed
            # here, so the momentum fusion encoder is neither initialised by
            # copy_params() nor updated by _momentum_update() - confirm this
            # is intended.
            self.model_pairs = [
                [self.visual_encoder, self.visual_encoder_m],
                [self.text_encoder, self.text_encoder_m],
                [self.text_decoder, self.text_decoder_m],
            ]
            self.copy_params()
            self.momentum = 0.995

    def forward(self, *args, **kwargs):
        """Abstract; implemented by the task-specific subclasses."""
        raise NotImplementedError

    def module_setting(self, config):
        """Load the BERT JSON config three times, once per sub-module.

        The encoder and decoder layer counts are taken from the
        ``text_encoder_layers`` / ``text_decode_layers`` entries of that
        JSON file; the decoder additionally enables cross attention.
        """
        bert_config_path = os.path.join(config.model_dir, config.bert_config)
        self.config_encoder = BertConfig.from_json_file(bert_config_path)
        self.config_encoder.num_hidden_layers = \
            self.config_encoder.text_encoder_layers
        self.config_fusion = BertConfig.from_json_file(bert_config_path)
        self.config_decoder = BertConfig.from_json_file(bert_config_path)
        self.config_decoder.add_cross_attention = True
        self.config_decoder.num_hidden_layers = \
            self.config_decoder.text_decode_layers

    @torch.no_grad()
    def copy_params(self):
        """Copy each online module's parameters into its momentum copy."""
        for model_pair in self.model_pairs:
            for param, param_m in zip(model_pair[0].parameters(),
                                      model_pair[1].parameters()):
                param_m.data.copy_(param.data)  # initialize
                param_m.requires_grad = False  # not update by gradient

    @torch.no_grad()
    def _momentum_update(self):
        """Move each momentum parameter towards its online counterpart.

        ``p_m = p_m * momentum + p * (1 - momentum)``
        """
        for model_pair in self.model_pairs:
            for param, param_m in zip(model_pair[0].parameters(),
                                      model_pair[1].parameters()):
                param_m.data = param_m.data * self.momentum + param.data * (
                    1. - self.momentum)

    def generation(self, question_states, question_atts, out_size=1):
        """Run ``self.beam_generator`` on the encoder states.

        Returns:
            The ``(topk_ids, topk_scores)`` pair produced by
            ``beam_generator.translate_batch``.
        """
        encoder_inputs = [question_states, question_atts]
        topk_ids, topk_scores = self.beam_generator.translate_batch(
            encoder_inputs, out_size=out_size)
        return topk_ids, topk_scores

    @staticmethod
    def _tile(x, dim, n_tile):
        """Repeat every slice of ``x`` along ``dim`` ``n_tile`` times.

        Copies of the same slice are adjacent in the output, e.g. along
        ``dim`` the order is ``x0, x0, x1, x1, ...`` for ``n_tile=2``.
        """
        # repeat_interleave yields the same element order as the former
        # repeat + index_select, stays on x's device and also handles an
        # empty dimension (np.concatenate raised on an empty list).
        return torch.repeat_interleave(x, n_tile, dim=dim)
|
||||
|
||||
|
||||
class HiTeAForVideoQuestionAnswering(HiTeA):
    """HiTeA with a BERT LM-head decoder for video question answering."""

    def __init__(self, config):
        super().__init__(config)
        self.text_decoder = BertLMHeadModel(self.config_decoder)
        self.beam_generator = TextGenerator(config, self.text_decoder)
        # Must run after text_decoder exists: it is paired with its
        # momentum copy inside init_distill.
        self.init_distill(config)

    def _fuse_question(self, video_embeds, video_atts, question):
        """Encode the question and fuse it with the video embeddings.

        Returns:
            ``(states, mask)`` where both are the video part concatenated
            with the question part along dimension 1.
        """
        text_embeds = self.text_encoder(
            question.input_ids,
            attention_mask=question.attention_mask,
            return_dict=True).last_hidden_state
        video_output, question_output = self.fusion_encoder(
            encoder_embeds=text_embeds,
            attention_mask=question.attention_mask,
            encoder_hidden_states=video_embeds,
            encoder_attention_mask=video_atts,
            return_dict=False)
        fused_states = torch.cat([video_output, question_output], 1)
        fused_atts = torch.cat([video_atts, question.attention_mask], 1)
        return fused_states, fused_atts

    def forward(self,
                video,
                question,
                answer=None,
                alpha=0,
                k=None,
                weights=None,
                train=True):
        """Compute the training loss or generate answers.

        Args:
            video: video tensor, cast to the model's parameter dtype.
            question: object exposing ``input_ids`` and ``attention_mask``.
            answer: same interface as ``question``; read when ``train``.
            alpha: accepted but not used by this method.
            k: number of answers for each question (defaults to one each).
            weights: weight for each answer (defaults to 1).
            train: when true return the loss, otherwise generate.

        Returns:
            The scalar loss when ``train`` is true, otherwise the
            ``(topk_ids, topk_probs)`` pair from ``self.generation``.
        """
        video = video.to(dtype=next(self.parameters()).dtype)
        video_embeds = self.visual_encoder(video)
        video_atts = torch.ones(
            video_embeds.size()[:-1], dtype=torch.long).to(video.device)

        if not train:
            fused_states, fused_atts = self._fuse_question(
                video_embeds, video_atts, question)
            return self.generation(fused_states, fused_atts)

        # Padding positions are replaced by -100 in the label tensor.
        pad_positions = answer.input_ids == self.tokenizer.pad_token_id
        answer_targets = answer.input_ids.masked_fill(pad_positions, -100)

        fused_states, fused_atts = self._fuse_question(
            video_embeds, video_atts, question)

        if k is None:
            k = [1] * fused_states.shape[0]
        # Repeat sample b's encoder output k[b] times, one per answer.
        question_states = torch.stack(
            [fused_states[b] for b, n in enumerate(k) for _ in range(n)], 0)
        question_atts = torch.stack(
            [fused_atts[b] for b, n in enumerate(k) for _ in range(n)], 0)

        decoder_kwargs = dict(
            attention_mask=answer.attention_mask,
            encoder_hidden_states=question_states,
            encoder_attention_mask=question_atts,
            labels=answer_targets,
            return_dict=True,
            reduction='none',
        )

        if self.distill:
            with torch.no_grad():
                self._momentum_update()
                video_embeds_m = self.visual_encoder_m(video)
                text_embeds_m = self.text_encoder_m(
                    question.input_ids,
                    attention_mask=question.attention_mask,
                    return_dict=True).last_hidden_state
                video_output_m, question_output_m = self.fusion_encoder_m(
                    encoder_embeds=text_embeds_m,
                    attention_mask=question.attention_mask,
                    encoder_hidden_states=video_embeds_m,
                    encoder_attention_mask=video_atts,
                    return_dict=False)
                fused_states_m = torch.cat(
                    [video_output_m, question_output_m], 1)

                question_states_m = torch.stack([
                    fused_states_m[b] for b, n in enumerate(k)
                    for _ in range(n)
                ], 0)

                logits_m = self.text_decoder_m(
                    answer.input_ids,
                    attention_mask=answer.attention_mask,
                    encoder_hidden_states=question_states_m,
                    encoder_attention_mask=question_atts,
                    return_logits=True,
                )
            # The momentum model's distribution is passed as soft labels.
            decoder_kwargs['soft_labels'] = F.softmax(logits_m, dim=-1)

        answer_output = self.text_decoder(answer.input_ids, **decoder_kwargs)

        scale = 1 if weights is None else weights
        loss = scale * answer_output.loss
        # Normalised by the number of videos in the batch.
        return loss.sum() / video.size(0)
|
||||
|
||||
|
||||
class HiTeAForVideoCaption(HiTeA):
    """HiTeA with a ``BertPrefixModel`` decoder for video captioning."""

    def __init__(self, config):
        super().__init__(config)
        self.text_decoder = BertPrefixModel(self.config_decoder)
        self.beam_generator = TextGenerator(config, self.text_decoder)

    def beam_search(self,
                    video,
                    question,
                    answer=None,
                    train=True,
                    out_size=5):
        """Generate ``out_size`` candidates from fused video/text states.

        Args:
            video: video tensor, cast to the model's parameter dtype.
            question: object exposing ``input_ids`` and ``attention_mask``.
            answer: accepted for signature compatibility; not used.
            train: accepted for signature compatibility; not used.
            out_size: forwarded to ``self.generation``.

        Returns:
            The ``(topk_ids, topk_probs)`` pair from ``self.generation``.
        """
        # Same cast as in forward(); without it a half-precision model
        # receives a tensor whose dtype does not match its weights.
        video = video.to(dtype=next(self.parameters()).dtype)
        video_embeds = self.visual_encoder(video)
        video_atts = torch.ones(
            video_embeds.size()[:-1], dtype=torch.long).to(video.device)
        text_output = self.text_encoder(
            question.input_ids,
            attention_mask=question.attention_mask,
            return_dict=True)
        text_embeds = text_output.last_hidden_state
        fusion_output = self.fusion_encoder(
            encoder_embeds=text_embeds,
            attention_mask=question.attention_mask,
            encoder_hidden_states=video_embeds,
            encoder_attention_mask=video_atts,
            return_dict=False)
        video_output, question_output = fusion_output
        # Video part first, then question part, along dimension 1.
        question_output = torch.cat([video_output, question_output], 1)
        merge_text_attention = torch.cat([video_atts, question.attention_mask],
                                         1)
        topk_ids, topk_probs = self.generation(
            question_output, merge_text_attention, out_size=out_size)
        return topk_ids, topk_probs

    def forward(self,
                video,
                question,
                answer=None,
                train=True,
                out_size=5,
                scst=False):
        """Compute the captioning loss or generate a caption.

        Args:
            video: video tensor, cast to the model's parameter dtype.
            question: only used on the ``scst`` path.
            answer: object exposing ``input_ids`` and ``attention_mask``;
                read when ``train`` is true.
            train: when true return the loss, otherwise generate.
            out_size: number of candidates on the ``scst`` path.
            scst: when true, delegate to ``beam_search`` and return its
                result regardless of ``train``.

        Returns:
            ``answer_output.loss`` (computed with ``reduction='none'``) when
            training, otherwise ``(topk_ids, topk_probs)``.
        """
        if scst:
            return self.beam_search(
                video, question, answer, train=True, out_size=out_size)
        video = video.to(dtype=next(self.parameters()).dtype)
        video_embeds = self.visual_encoder(video)
        video_atts = torch.ones(
            video_embeds.size()[:-1], dtype=torch.long).to(video.device)

        if train:
            # Padding positions are replaced by -100 in the label tensor.
            answer_targets = answer.input_ids.masked_fill(
                answer.input_ids == self.tokenizer.pad_token_id, -100)
            answer_output = self.text_decoder(
                answer.input_ids,
                attention_mask=answer.attention_mask,
                encoder_hidden_states=video_embeds,
                encoder_attention_mask=video_atts,
                labels=answer_targets,
                return_dict=True,
                reduction='none')
            loss = answer_output.loss

            return loss
        else:
            # The decoder attends to the raw video embeddings here; the
            # text / fusion encoders are not involved on this path.
            topk_ids, topk_probs = self.generation(video_embeds, video_atts)
            return topk_ids, topk_probs
|
||||
|
||||
1007
modelscope/models/multi_modal/mplug/mvit.py
Normal file
1007
modelscope/models/multi_modal/mplug/mvit.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -11,7 +11,7 @@ from modelscope.outputs import OutputKeys
|
||||
from modelscope.utils.config import Config
|
||||
from modelscope.utils.constant import ModelFile, Tasks
|
||||
|
||||
__all__ = ['MPlugForAllTasks']
|
||||
__all__ = ['MPlugForAllTasks', 'HiTeAForAllTasks']
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
@@ -81,3 +81,69 @@ class MPlugForAllTasks(TorchModel):
|
||||
# evaluate
|
||||
topk_ids, _ = output
|
||||
return {'sequences': [list_tensor[0] for list_tensor in topk_ids]}
|
||||
|
||||
|
||||
@MODELS.register_module(
    Tasks.video_question_answering, module_name=Models.hitea)
@MODELS.register_module(Tasks.video_captioning, module_name=Models.hitea)
class HiTeAForAllTasks(TorchModel):
    """ModelScope wrapper around the task-specific HiTeA models."""

    def __init__(self, model_dir: str, *args, **kwargs):
        """initialize the hitea model from the `model_dir` path.
        Args:
            model_dir (str): the model path.
        """

        super().__init__(model_dir, *args, **kwargs)
        # Imported here rather than at module level, as in the original.
        from modelscope.models.multi_modal.mplug import HiTeA
        self.model = HiTeA.from_pretrained(model_dir)
        self.tokenizer = self.model.tokenizer

    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
        """return the result by the model

        Args:
            input (Dict[str, Tensor]): the preprocessed data

        Returns:
            Dict[str, Tensor]: results
                Inference (model in eval mode and 'question' in input):
                {
                    OutputKeys.CAPTION or OutputKeys.TEXT: 'decoded text',
                }
                Training:
                {
                    OutputKeys.LOSS: <model output>,
                }
                Evaluation:
                {
                    'sequences': [first entry of each item in topk_ids],
                }
        """

        # inference
        if not self.training and 'question' in input:
            # The task is only needed to pick the output key, so the
            # configuration file is read on this path only instead of on
            # every training / evaluation step.
            task = Config.from_file(
                osp.join(self.model_dir, ModelFile.CONFIGURATION)).task
            output = self.model(input['video'], input['question'], train=False)
            topk_ids, _ = output
            # Only the first candidate of the first sample is decoded.
            pred_string: str = \
                self.tokenizer.decode(topk_ids[0][0], skip_special_tokens=True)
            output_key = OutputKeys.CAPTION \
                if task == Tasks.video_captioning else OutputKeys.TEXT
            return {output_key: pred_string}

        # train and evaluate
        import addict
        video = input['video']
        answer = addict.Dict(
            input_ids=input['answer_input_ids'],
            attention_mask=input['answer_attention_mask'])
        if 'index' not in input:
            question = addict.Dict(
                input_ids=input['question_input_ids'],
                attention_mask=input['question_attention_mask'])
            output = self.model(video, question, answer, train=self.training)
        else:
            # NOTE(review): this passes `answer` and `index` positionally;
            # confirm the wrapped model's forward expects them in this
            # order for inputs that carry an 'index'.
            index = input['index']
            output = self.model(video, answer, index, train=self.training)
        if self.training:
            return {OutputKeys.LOSS: output}

        # evaluate
        topk_ids, _ = output
        return {'sequences': [list_tensor[0] for list_tensor in topk_ids]}
|
||||
|
||||
@@ -711,6 +711,12 @@ TASK_OUTPUTS = {
|
||||
# "caption": "this is an image caption text."
|
||||
# }
|
||||
Tasks.image_captioning: [OutputKeys.CAPTION],
|
||||
|
||||
# video caption result for single sample
|
||||
# {
|
||||
#   "caption": "this is a video caption text."
|
||||
# }
|
||||
Tasks.video_captioning: [OutputKeys.CAPTION],
|
||||
Tasks.ocr_recognition: [OutputKeys.TEXT],
|
||||
|
||||
# visual grounding result for single sample
|
||||
@@ -769,6 +775,10 @@ TASK_OUTPUTS = {
|
||||
# {"text": "this is a text answser. "}
|
||||
Tasks.visual_question_answering: [OutputKeys.TEXT],
|
||||
|
||||
# VideoQA result for a sample
|
||||
# {"text": "this is a text answer. "}
|
||||
Tasks.video_question_answering: [OutputKeys.TEXT],
|
||||
|
||||
# auto_speech_recognition result for a single sample
|
||||
# {
|
||||
# "text": "每天都要快乐喔"
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
@@ -222,6 +223,9 @@ TASK_INPUTS = {
|
||||
Tasks.image_captioning: [InputType.IMAGE, {
|
||||
'image': InputType.IMAGE,
|
||||
}],
|
||||
Tasks.video_captioning: [InputType.VIDEO, {
|
||||
'video': InputType.VIDEO,
|
||||
}],
|
||||
Tasks.visual_grounding: {
|
||||
'image': InputType.IMAGE,
|
||||
'text': InputType.TEXT
|
||||
@@ -245,6 +249,10 @@ TASK_INPUTS = {
|
||||
'image': InputType.IMAGE,
|
||||
'text': InputType.TEXT
|
||||
},
|
||||
Tasks.video_question_answering: {
|
||||
'video': InputType.VIDEO,
|
||||
'text': InputType.TEXT
|
||||
},
|
||||
Tasks.visual_entailment: {
|
||||
'image': InputType.IMAGE,
|
||||
'text': InputType.TEXT,
|
||||
|
||||
@@ -80,6 +80,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
|
||||
'damo/nlp_bart_text-error-correction_chinese'),
|
||||
Tasks.image_captioning: (Pipelines.image_captioning,
|
||||
'damo/ofa_image-caption_coco_large_en'),
|
||||
Tasks.video_captioning:
|
||||
(Pipelines.video_captioning,
|
||||
'damo/multi-modal_hitea_video-captioning_base_en'),
|
||||
Tasks.image_portrait_stylization:
|
||||
(Pipelines.person_image_cartoon,
|
||||
'damo/cv_unet_person-image-cartoon_compound-models'),
|
||||
@@ -114,6 +117,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
|
||||
Tasks.visual_question_answering:
|
||||
(Pipelines.visual_question_answering,
|
||||
'damo/mplug_visual-question-answering_coco_large_en'),
|
||||
Tasks.video_question_answering:
|
||||
(Pipelines.video_question_answering,
|
||||
'damo/multi-modal_hitea_video-question-answering_base_en'),
|
||||
Tasks.video_embedding: (Pipelines.cmdssl_video_embedding,
|
||||
'damo/cv_r2p1d_video_embedding'),
|
||||
Tasks.text_to_image_synthesis:
|
||||
|
||||
@@ -14,7 +14,8 @@ if TYPE_CHECKING:
|
||||
VideoMultiModalEmbeddingPipeline
|
||||
from .visual_question_answering_pipeline import VisualQuestionAnsweringPipeline
|
||||
from .asr_pipeline import AutomaticSpeechRecognitionPipeline
|
||||
|
||||
from .video_captioning_pipeline import VideoCaptioningPipeline
|
||||
from .video_question_answering_pipeline import VideoQuestionAnsweringPipeline
|
||||
else:
|
||||
_import_structure = {
|
||||
'image_captioning_pipeline': ['ImageCaptioningPipeline'],
|
||||
@@ -29,6 +30,9 @@ else:
|
||||
'generative_multi_modal_embedding_pipeline':
|
||||
['GEMMMultiModalEmbeddingPipeline'],
|
||||
'asr_pipeline': ['AutomaticSpeechRecognitionPipeline'],
|
||||
'video_captioning_pipeline': ['VideoCaptioningPipeline'],
|
||||
'video_question_answering_pipeline':
|
||||
['VideoQuestionAnsweringPipeline']
|
||||
}
|
||||
|
||||
import sys
|
||||
|
||||
@@ -0,0 +1,56 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from typing import Any, Dict, Optional, Union
|
||||
|
||||
import torch
|
||||
|
||||
from modelscope.metainfo import Pipelines
|
||||
from modelscope.models.multi_modal import HiTeAForAllTasks
|
||||
from modelscope.outputs import OutputKeys
|
||||
from modelscope.pipelines.base import Model, Pipeline
|
||||
from modelscope.pipelines.builder import PIPELINES
|
||||
from modelscope.preprocessors import HiTeAPreprocessor, Preprocessor
|
||||
from modelscope.utils.constant import Tasks
|
||||
from modelscope.utils.logger import get_logger
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
|
||||
@PIPELINES.register_module(
    Tasks.video_captioning, module_name=Pipelines.video_captioning)
class VideoCaptioningPipeline(Pipeline):
    """Pipeline that produces a caption for a video."""

    def __init__(self,
                 model: Union[Model, str],
                 preprocessor: Optional[Preprocessor] = None,
                 **kwargs):
        """
        Create a video captioning pipeline from `model` and `preprocessor`.

        Args:
            model: model id on modelscope hub.
            preprocessor: optional preprocessor; when omitted and the model
                is a `HiTeAForAllTasks`, a `HiTeAPreprocessor` is built from
                the model directory.
        """
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
        self.model.eval()
        needs_default = preprocessor is None
        if needs_default and isinstance(self.model, HiTeAForAllTasks):
            self.preprocessor = HiTeAPreprocessor(self.model.model_dir)

    def _batch(self, data):
        """Merge a list of preprocessed samples into one batch dict."""
        if not isinstance(self.model, HiTeAForAllTasks):
            return super()._collate_batch(data)

        from transformers.tokenization_utils_base import BatchEncoding

        first = data[0]
        # 'train' is taken from the first sample; tensors are concatenated.
        batch_data = {
            'train': first['train'],
            'video': torch.cat([sample['video'] for sample in data]),
        }
        merged_question = {
            key: torch.cat([sample['question'][key] for sample in data])
            for key in first['question'].keys()
        }
        batch_data['question'] = BatchEncoding(merged_question)
        return batch_data

    def forward(self, inputs: Dict[str, Any],
                **forward_params) -> Dict[str, Any]:
        """Run the parent forward pass with gradients disabled."""
        with torch.no_grad():
            result = super().forward(inputs, **forward_params)
        return result

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Return the model output unchanged."""
        return inputs
|
||||
@@ -0,0 +1,54 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from typing import Any, Dict, Optional, Union
|
||||
|
||||
import torch
|
||||
|
||||
from modelscope.metainfo import Pipelines
|
||||
from modelscope.models import Model
|
||||
from modelscope.models.multi_modal import HiTeAForAllTasks
|
||||
from modelscope.outputs import OutputKeys
|
||||
from modelscope.pipelines.base import Pipeline, Tensor
|
||||
from modelscope.pipelines.builder import PIPELINES
|
||||
from modelscope.preprocessors import HiTeAPreprocessor, Preprocessor
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
__all__ = ['VideoQuestionAnsweringPipeline']
|
||||
|
||||
|
||||
@PIPELINES.register_module(
    Tasks.video_question_answering,
    module_name=Pipelines.video_question_answering)
class VideoQuestionAnsweringPipeline(Pipeline):
    """Pipeline that answers a text question about a video."""

    def __init__(self,
                 model: Union[Model, str],
                 preprocessor: Optional[Preprocessor] = None,
                 **kwargs):
        """Create a video question answering pipeline for prediction.

        Args:
            model (HiTeAForVideoQuestionAnswering): a model instance
            preprocessor (HiTeAForVideoQuestionAnsweringPreprocessor): a
                preprocessor instance; when omitted and the model is a
                `HiTeAForAllTasks`, a `HiTeAPreprocessor` is built from the
                model directory.
        """
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
        needs_default = preprocessor is None
        if needs_default and isinstance(self.model, HiTeAForAllTasks):
            self.preprocessor = HiTeAPreprocessor(self.model.model_dir)
        self.model.eval()

    def forward(self, inputs: Dict[str, Any],
                **forward_params) -> Dict[str, Any]:
        """Run the parent forward pass with gradients disabled."""
        with torch.no_grad():
            result = super().forward(inputs, **forward_params)
        return result

    def postprocess(self, inputs: Dict[str, Tensor],
                    **postprocess_params) -> Dict[str, str]:
        """process the prediction results

        Args:
            inputs (Dict[str, Any]): the output of `forward`

        Returns:
            Dict[str, str]: the prediction results, returned unchanged
        """
        return inputs
|
||||
@@ -15,7 +15,8 @@ if TYPE_CHECKING:
|
||||
ImageDenoisePreprocessor)
|
||||
from .kws import WavToLists
|
||||
from .tts import KanttsDataPreprocessor
|
||||
from .multi_modal import (OfaPreprocessor, MPlugPreprocessor)
|
||||
from .multi_modal import (OfaPreprocessor, MPlugPreprocessor,
|
||||
HiTeAPreprocessor)
|
||||
from .nlp import (
|
||||
DocumentSegmentationTransformersPreprocessor,
|
||||
FaqQuestionAnsweringTransformersPreprocessor,
|
||||
@@ -52,7 +53,8 @@ else:
|
||||
],
|
||||
'kws': ['WavToLists'],
|
||||
'tts': ['KanttsDataPreprocessor'],
|
||||
'multi_modal': ['OfaPreprocessor', 'MPlugPreprocessor'],
|
||||
'multi_modal':
|
||||
['OfaPreprocessor', 'MPlugPreprocessor', 'HiTeAPreprocessor'],
|
||||
'nlp': [
|
||||
'DocumentSegmentationTransformersPreprocessor',
|
||||
'FaqQuestionAnsweringTransformersPreprocessor',
|
||||
|
||||
@@ -3,7 +3,9 @@ import os.path as osp
|
||||
from io import BytesIO
|
||||
from typing import Any, Dict, List, Tuple, Union
|
||||
|
||||
import decord
|
||||
import json
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from timm.data import create_transform
|
||||
@@ -12,6 +14,8 @@ from torchvision.transforms import Compose, Normalize, Resize, ToTensor
|
||||
from modelscope.hub.snapshot_download import snapshot_download
|
||||
from modelscope.metainfo import Preprocessors
|
||||
from modelscope.pipelines.base import Input
|
||||
from modelscope.pipelines.cv.cmdssl_video_embedding_pipeline import (
|
||||
VCenterCrop, VCompose, VNormalize, VRescale, VToTensor)
|
||||
from modelscope.preprocessors import load_image
|
||||
from modelscope.utils.config import Config
|
||||
from modelscope.utils.constant import (Fields, Invoke, ModeKeys, ModelFile,
|
||||
@@ -22,10 +26,7 @@ from .ofa import * # noqa
|
||||
from .ofa.utils.collate import collate_fn
|
||||
from .ofa.utils.constant import OFA_TASK_KEY_MAPPING
|
||||
|
||||
__all__ = [
|
||||
'OfaPreprocessor',
|
||||
'MPlugPreprocessor',
|
||||
]
|
||||
__all__ = ['OfaPreprocessor', 'MPlugPreprocessor', 'HiTeAPreprocessor']
|
||||
|
||||
|
||||
@PREPROCESSORS.register_module(
|
||||
@@ -387,3 +388,141 @@ class MPlugPreprocessor(Preprocessor):
|
||||
if self.cfg.task == Tasks.image_text_retrieval:
|
||||
output['index'] = index
|
||||
return output
|
||||
|
||||
|
||||
@PREPROCESSORS.register_module(
|
||||
Fields.multi_modal, module_name=Preprocessors.hitea_tasks_preprocessor)
|
||||
class HiTeAPreprocessor(Preprocessor):
|
||||
|
||||
def __init__(self,
             model_dir: str,
             mode: str = ModeKeys.INFERENCE,
             tokenizer_max_length: int = 25,
             *args,
             **kwargs):
    """Record the settings; heavy resources are created lazily.

    Args:
        model_dir: directory of the model files.
        mode: preprocessing mode, inference by default.
        tokenizer_max_length: stored as given for later tokenization.
        *args, **kwargs: forwarded to the parent preprocessor.
    """
    super().__init__(*args, **kwargs)

    # Lazily filled caches (see the corresponding properties / methods).
    self._video_map = {}
    self._num_frames = None
    self._patch_resize_transform = None
    self._tokenizer = None

    # User supplied settings.
    self.tokenizer_max_length = tokenizer_max_length
    self.mode = mode
    self.model_dir = model_dir
|
||||
|
||||
@property
def tokenizer(self):
    """BERT tokenizer loaded from ``model_dir`` on first access."""
    from transformers import BertTokenizer

    cached = self._tokenizer
    if cached is None:
        cached = BertTokenizer.from_pretrained(self.model_dir)
        self._tokenizer = cached
    return self._tokenizer
|
||||
|
||||
@property
def patch_resize_transform(self):
    """Frame transform (resize, to-tensor, normalize), built on first use.

    The target resolution is ``image_res`` from the model's YAML config.
    """
    if self._patch_resize_transform is not None:
        return self._patch_resize_transform

    from torchvision import transforms
    from modelscope.models.multi_modal.mplug import CONFIG_NAME, HiTeAConfig

    config_path = osp.join(self.model_dir, CONFIG_NAME)
    config = HiTeAConfig.from_yaml_file(config_path)

    # Per-channel normalization constants.
    mean = (0.48145466, 0.4578275, 0.40821073)
    std = (0.26862954, 0.26130258, 0.27577711)

    target_size = (config.image_res, config.image_res)
    steps = [
        transforms.Resize(target_size, interpolation=Image.BICUBIC),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ]
    self._patch_resize_transform = transforms.Compose(steps)
    return self._patch_resize_transform
|
||||
|
||||
@property
|
||||
def num_frames(self):
|
||||
if self._num_frames is None:
|
||||
from torchvision import transforms
|
||||
from modelscope.models.multi_modal.mplug import CONFIG_NAME, HiTeAConfig
|
||||
|
||||
config = HiTeAConfig.from_yaml_file(
|
||||
osp.join(self.model_dir, CONFIG_NAME))
|
||||
|
||||
self._num_frames = config.num_frames
|
||||
return self._num_frames
|
||||
|
||||
def video_open(self, path: str) -> Tuple[decord.VideoReader, int]:
|
||||
if path not in self._video_map:
|
||||
index = len(self._video_map)
|
||||
vr = decord.VideoReader(path, ctx=decord.cpu(0))
|
||||
self._video_map[path] = (vr, index)
|
||||
return self._video_map[path]
|
||||
|
||||
def sample_frames(self, num_frames: int, vlen: int) -> List[int]:
|
||||
acc_samples = min(num_frames, vlen)
|
||||
# split the video into `acc_samples` intervals, and sample from each interval.
|
||||
intervals = np.linspace(
|
||||
start=0, stop=vlen, num=acc_samples + 1).astype(int)
|
||||
ranges = []
|
||||
for idx, interv in enumerate(intervals[:-1]):
|
||||
ranges.append((interv, intervals[idx + 1] - 1))
|
||||
|
||||
frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
|
||||
|
||||
if len(frame_indices) < num_frames: # padded with last frame
|
||||
padded_frame_indices = [frame_indices[-1]] * num_frames
|
||||
padded_frame_indices[:len(frame_indices)] = frame_indices
|
||||
frame_indices = padded_frame_indices
|
||||
return frame_indices
|
||||
|
||||
def __call__(
|
||||
self, data: Union[decord.VideoReader, tuple,
|
||||
Dict[str, Any]]) -> Dict[str, Any]:
|
||||
self.cfg = Config.from_file(
|
||||
osp.join(self.model_dir, ModelFile.CONFIGURATION))
|
||||
|
||||
if isinstance(data, (decord.VideoReader, str)):
|
||||
video = data
|
||||
elif isinstance(data, tuple):
|
||||
video = data[0]
|
||||
else:
|
||||
video = data['video']
|
||||
index = 0
|
||||
if isinstance(video, str):
|
||||
video, index = self.video_open(video)
|
||||
frame_indices = self.sample_frames(self.num_frames, len(video))
|
||||
video.seek(0)
|
||||
video = torch.from_numpy(video.get_batch(frame_indices).asnumpy())
|
||||
video = [
|
||||
self.patch_resize_transform(Image.fromarray(f))
|
||||
for f in video.numpy()
|
||||
]
|
||||
video = torch.stack(video, dim=0)
|
||||
question = '' if self.cfg.task == Tasks.video_captioning \
|
||||
else data[1 if isinstance(data, tuple)
|
||||
else ('text' if 'text' in data else 'question')]
|
||||
question = self.tokenizer(
|
||||
question.lower(),
|
||||
padding='max_length',
|
||||
truncation=True,
|
||||
max_length=self.tokenizer_max_length,
|
||||
return_tensors='pt')
|
||||
|
||||
if self.mode == ModeKeys.INFERENCE:
|
||||
video = torch.stack([video], dim=0)
|
||||
return {'video': video, 'question': question}
|
||||
else:
|
||||
answer = data['answer']
|
||||
answer = self.tokenizer(
|
||||
answer,
|
||||
padding='max_length',
|
||||
truncation=True,
|
||||
max_length=self.tokenizer_max_length,
|
||||
return_tensors='pt')
|
||||
output = {
|
||||
'video': video,
|
||||
'question_input_ids': question.input_ids.squeeze(),
|
||||
'question_attention_mask': question.attention_mask.squeeze(),
|
||||
'answer_input_ids': answer.input_ids.squeeze(),
|
||||
'answer_attention_mask': answer.attention_mask.squeeze(),
|
||||
}
|
||||
return output
|
||||
|
||||
@@ -166,6 +166,8 @@ class MultiModalTasks(object):
|
||||
visual_entailment = 'visual-entailment'
|
||||
video_multi_modal_embedding = 'video-multi-modal-embedding'
|
||||
image_text_retrieval = 'image-text-retrieval'
|
||||
video_captioning = 'video-captioning'
|
||||
video_question_answering = 'video-question-answering'
|
||||
|
||||
|
||||
class ScienceTasks(object):
|
||||
|
||||
64
tests/pipelines/test_hitea_tasks.py
Normal file
64
tests/pipelines/test_hitea_tasks.py
Normal file
@@ -0,0 +1,64 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
import unittest
|
||||
|
||||
from modelscope.models import Model
|
||||
from modelscope.outputs import OutputKeys
|
||||
from modelscope.pipelines import pipeline
|
||||
from modelscope.utils.constant import Tasks
|
||||
from modelscope.utils.demo_utils import DemoCompatibilityCheck
|
||||
from modelscope.utils.test_utils import test_level
|
||||
|
||||
|
||||
class HiTeATasksTest(unittest.TestCase, DemoCompatibilityCheck):
    # Smoke tests for the HiTeA video captioning and video question
    # answering pipelines; each test only prints the pipeline output.

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_with_video_captioning_with_model(self):
        # Build the pipeline from an already instantiated model object.
        hitea_model = Model.from_pretrained(
            'damo/multi-modal_hitea_video-captioning_base_en')
        captioner = pipeline(
            task=Tasks.video_captioning,
            model=hitea_model,
        )
        clip_path = 'data/test/videos/video_caption_and_qa_test.mp4'
        outputs = captioner(clip_path)
        print(outputs[OutputKeys.CAPTION])

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_video_captioning_with_name(self):
        # Build the pipeline from the model id alone.
        model_id = 'damo/multi-modal_hitea_video-captioning_base_en'
        captioner = pipeline(
            Tasks.video_captioning,
            model=model_id,
        )
        clip_path = 'data/test/videos/video_caption_and_qa_test.mp4'
        outputs = captioner(clip_path)
        print(outputs[OutputKeys.CAPTION])

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_with_video_question_answering_with_model(self):
        # Build the pipeline from an already instantiated model object.
        hitea_model = Model.from_pretrained(
            'damo/multi-modal_hitea_video-question-answering_base_en')
        answerer = pipeline(Tasks.video_question_answering, model=hitea_model)
        payload = {
            'video': 'data/test/videos/video_caption_and_qa_test.mp4',
            'text': 'How many people are there?',
        }
        outputs = answerer(payload)
        print(outputs)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_video_question_answering_with_name(self):
        # Build the pipeline from the model id alone.
        model_id = 'damo/multi-modal_hitea_video-question-answering_base_en'
        answerer = pipeline(Tasks.video_question_answering, model=model_id)
        payload = {
            'video': 'data/test/videos/video_caption_and_qa_test.mp4',
            'text': 'Who teaches a girl how to paint eggs?',
        }
        outputs = answerer(payload)
        print(outputs)

    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
    def test_demo_compatibility(self):
        self.compatibility_check()
|
||||
|
||||
|
||||
# Run the test cases when this module is executed as a script.
if __name__ == '__main__':
    unittest.main()
|
||||
Reference in New Issue
Block a user