diff --git a/data/test b/data/test index 91b37f8d..8d062525 160000 --- a/data/test +++ b/data/test @@ -1 +1 @@ -Subproject commit 91b37f8d6251089aa878520f389fbefc2ce9ffc4 +Subproject commit 8d0625256b88bdf41655563049a4a68ec1025638 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index efce05b0..2e3fa39d 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -202,6 +202,7 @@ class Models(object): hitea = 'hitea' soonet = 'soonet' efficient_diffusion_tuning = 'efficient-diffusion-tuning' + mplug_owl = 'mplug-owl' clip_interrogator = 'clip-interrogator' # science models @@ -512,6 +513,7 @@ class Pipelines(object): gridvlp_multi_modal_embedding = 'gridvlp-multi-modal-embedding' soonet_video_temporal_grounding = 'soonet-video-temporal-grounding' efficient_diffusion_tuning = 'efficient-diffusion-tuning' + multimodal_dialogue = 'multimodal-dialogue' # science tasks protein_structure = 'unifold-protein-structure' @@ -1030,6 +1032,7 @@ class Preprocessors(object): vldoc_preprocessor = 'vldoc-preprocessor' hitea_tasks_preprocessor = 'hitea-tasks-preprocessor' diffusion_image_generation_preprocessor = 'diffusion-image-generation-preprocessor' + mplug_owl_preprocessor = 'mplug-owl-preprocessor' image_captioning_clip_interrogator_preprocessor = 'image-captioning-clip-interrogator-preprocessor' # science preprocessor diff --git a/modelscope/models/multi_modal/__init__.py b/modelscope/models/multi_modal/__init__.py index cfcc0361..9fa34baf 100644 --- a/modelscope/models/multi_modal/__init__.py +++ b/modelscope/models/multi_modal/__init__.py @@ -20,6 +20,7 @@ if TYPE_CHECKING: from .vldoc import VLDocForDocVLEmbedding from .video_synthesis import TextToVideoSynthesis from .efficient_diffusion_tuning import EfficientStableDiffusion + from .mplug_owl import MplugOwlForConditionalGeneration from .clip_interrogator import CLIP_Interrogator else: @@ -39,6 +40,7 @@ else: 'vldoc': ['VLDocForDocVLEmbedding'], 'video_synthesis': ['TextToVideoSynthesis'], 
'efficient_diffusion_tuning': ['EfficientStableDiffusion'], + 'mplug_owl': ['MplugOwlForConditionalGeneration'], 'clip_interrogator': ['CLIP_Interrogator'], } diff --git a/modelscope/models/multi_modal/mplug_owl/__init__.py b/modelscope/models/multi_modal/mplug_owl/__init__.py new file mode 100644 index 00000000..76ccfb5a --- /dev/null +++ b/modelscope/models/multi_modal/mplug_owl/__init__.py @@ -0,0 +1,18 @@ +# Copyright 2021-2023 The Alibaba DAMO mPLUG Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .configuration_mplug_owl import (MplugOwlConfig, MplugOwlVisionConfig, + MplugOwlVisualAbstractorConfig) +from .modeling_mplug_owl import MplugOwlForConditionalGeneration diff --git a/modelscope/models/multi_modal/mplug_owl/configuration_mplug_owl.py b/modelscope/models/multi_modal/mplug_owl/configuration_mplug_owl.py new file mode 100644 index 00000000..6e32238a --- /dev/null +++ b/modelscope/models/multi_modal/mplug_owl/configuration_mplug_owl.py @@ -0,0 +1,257 @@ +# Copyright 2021-2023 The Alibaba DAMO mPLUG Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" MPLUG OWL model configuration """ +import copy +import os +from typing import Union + +from transformers import PretrainedConfig +from transformers.models.auto import CONFIG_MAPPING +from transformers.utils import logging + +from modelscope.utils.constant import Tasks + +logger = logging.get_logger() + + +class MplugOwlVisionConfig(PretrainedConfig): + r""" + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 32): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-5): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
+ initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float`, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + ```""" + + model_type = 'mplug_owl_vision_model' + + def __init__( + self, + hidden_size=1024, + intermediate_size=4096, + projection_dim=768, + num_hidden_layers=24, + num_attention_heads=16, + num_channels=3, + image_size=224, + patch_size=14, + hidden_act='quick_gelu', + layer_norm_eps=1e-6, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + use_flash_attn=False, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.use_flash_attn = use_flash_attn + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, + os.PathLike], + **kwargs) -> 'PretrainedConfig': + config_dict, kwargs = cls.get_config_dict( + pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from MplugOwlConfig + if config_dict.get('model_type') == 'mplug_owl': + config_dict = config_dict['vision_config'] + + if 'model_type' in config_dict and hasattr( + cls, + 'model_type') and config_dict['model_type'] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f'{cls.model_type}. 
This is not supported for all configurations of models and can yield errors.' + ) + + return cls.from_dict(config_dict, **kwargs) + + +class MplugOwlVisualAbstractorConfig(PretrainedConfig): + + model_type = 'MPlugOwlVisualAbstractor' + + def __init__( + self, + hidden_size=1024, + num_hidden_layers=6, + num_attention_heads=16, + intermediate_size=4096, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + layer_norm_eps=1e-6, + encoder_hidden_size=1024, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.encoder_hidden_size = encoder_hidden_size + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, + os.PathLike], + **kwargs) -> 'PretrainedConfig': + config_dict, kwargs = cls.get_config_dict( + pretrained_model_name_or_path, **kwargs) + + # get the qformer config dict if we are loading from MplugOwlConfig + if config_dict.get('model_type') == 'mplug_owl': + config_dict = config_dict['abstractor_config'] + + if 'model_type' in config_dict and hasattr( + cls, + 'model_type') and config_dict['model_type'] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.' + ) + + return cls.from_dict(config_dict, **kwargs) + + +class MplugOwlConfig(PretrainedConfig): + r""" + Args: + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`MplugOwlVisionConfig`]. + qformer_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`MplugOwlVisualAbstractorConfig`]. 
+ text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize any [`PretrainedConfig`]. + num_query_tokens (`int`, *optional*, defaults to 32): + The number of query tokens passed through the Transformer. + + kwargs (*optional*): + Dictionary of keyword arguments. + """ + + model_type = 'mplug_owl' + is_composition = True + + def __init__(self, + task=Tasks.multimodal_dialogue, + vision_config=None, + visual_abstractor_config=None, + text_config=None, + num_query_tokens=64, + **kwargs): + + super().__init__(**kwargs) + self.task = task + if vision_config is None: + vision_config = MplugOwlVisionConfig().to_dict() + logger.info('vision_config is None.') + + if visual_abstractor_config is None: + visual_abstractor_config = {} + logger.info('abstractor_config is None. ') + + if text_config is None: + # we use LLAMA 7b by default + from transformers.models.llama.configuration_llama import \ + LlamaConfig + text_config = LlamaConfig(pad_token_id=2).to_dict() + logger.info('text_config is None.') + + self.vision_config = MplugOwlVisionConfig(**vision_config) + self.visual_abstractor_config = MplugOwlVisualAbstractorConfig( + **visual_abstractor_config) + text_model_type = text_config[ + 'model_type'] if 'model_type' in text_config else 'llama' + self.text_config = CONFIG_MAPPING[text_model_type](**text_config) + + self.tie_word_embeddings = self.text_config.tie_word_embeddings + + self.num_query_tokens = num_query_tokens + self.initializer_factor = 1.0 + self.initializer_range = 0.02 + + @classmethod + def from_vision_abstractor_text_configs( + cls, + vision_config: MplugOwlVisionConfig, + visual_abstractor_config: MplugOwlVisualAbstractorConfig, + text_config: PretrainedConfig, + **kwargs, + ): + r""" + Returns: + [`MplugOwlConfig`]: An instance of a configuration object + """ + + return cls( + vision_config=vision_config.to_dict(), + visual_abstractor_config=visual_abstractor_config.to_dict(), + text_config=text_config.to_dict(), + 
**kwargs, + ) + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output['vision_config'] = self.vision_config.to_dict() + tmp = self.visual_abstractor_config.to_dict() + output['visual_abstractor_config'] = tmp + output['text_config'] = self.text_config.to_dict() + output['model_type'] = self.__class__.model_type + return output diff --git a/modelscope/models/multi_modal/mplug_owl/modeling_mplug_owl.py b/modelscope/models/multi_modal/mplug_owl/modeling_mplug_owl.py new file mode 100644 index 00000000..21a29185 --- /dev/null +++ b/modelscope/models/multi_modal/mplug_owl/modeling_mplug_owl.py @@ -0,0 +1,1551 @@ +# Copyright 2021-2023 The Alibaba DAMO mPLUG Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch MPLUG OWL model. 
""" + +import copy +import logging +import math +import os +import os.path as osp +import random +from dataclasses import dataclass +from io import BytesIO +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint +import transformers +from torch.nn import CrossEntropyLoss +from transformers.activations import ACT2FN +from transformers.modeling_outputs import ( + BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPooling, BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions) +from transformers.modeling_utils import (PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer) +from transformers.models.auto import AutoModelForCausalLM +from transformers.utils import ModelOutput + +from modelscope.metainfo import Models +from modelscope.models import TorchModel +from modelscope.models.base import Tensor +from modelscope.models.builder import MODELS +from modelscope.models.multi_modal.mplug_owl.configuration_mplug_owl import ( + MplugOwlConfig, MplugOwlVisionConfig, MplugOwlVisualAbstractorConfig) +from modelscope.outputs import OutputKeys +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks + +__all__ = ['MplugOwlForConditionalGeneration'] + + +@dataclass +class MplugOwlForConditionalGenerationModelOutput(ModelOutput): + """ + Class defining the outputs of [`MPlugOwlForConditionalGeneration`]. + + Args: + loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Language modeling loss from the language model. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head of the language model. + vision_outputs (`BaseModelOutputWithPooling`): + Outputs of the vision encoder. 
+ + language_model_outputs (`CausalLMOutputWithPast`): + Outputs of the language model. + """ + + loss: Optional[Tuple[torch.FloatTensor]] = None + logits: Optional[Tuple[torch.FloatTensor]] = None + vision_outputs: Optional[torch.FloatTensor] = None + language_model_outputs: Optional[Tuple[torch.FloatTensor]] = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ['vision_outputs', 'language_model_outputs' + ] else getattr(self, k).to_tuple() + for k in self.keys()) + + +def get_ltor_masks_and_position_ids_from_embeddings(data): + """Build masks and position id for left to right model.""" + + # Extract batch size and sequence length. + micro_batch_size, seq_length = data.size()[:2] + + # Attention mask (lower triangular). + att_mask_batch = 1 + attention_mask = torch.tril( + torch.ones((att_mask_batch, seq_length, seq_length), + device=data.device)).view(att_mask_batch, 1, seq_length, + seq_length) + + # Loss mask. + loss_mask = torch.ones( + data.size()[:2], dtype=torch.float, device=data.device) + + # Position ids. 
+ position_ids = torch.arange( + seq_length, dtype=torch.long, device=data.device) + position_ids = position_ids.unsqueeze(0).expand_as(data[..., 0]) + + # Convert attention mask to binary: + attention_mask = (attention_mask < 0.5) + + return attention_mask, loss_mask, position_ids + + +class MplugOwlVisionEmbeddings(nn.Module): + + def __init__(self, config: MplugOwlVisionConfig): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.cls_token = nn.Parameter(torch.randn(1, 1, self.hidden_size)) + + self.patch_embed = nn.Conv2d( + in_channels=3, + out_channels=self.hidden_size, + kernel_size=self.patch_size, + stride=self.patch_size, + bias=False) + + self.num_patches = (self.image_size // self.patch_size)**2 + + self.position_embedding = nn.Parameter( + torch.randn(1, self.num_patches + 1, self.hidden_size)) + + self.pre_layernorm = LayerNormFp32( + self.hidden_size, eps=config.layer_norm_eps) + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + batch_size = pixel_values.size(0) + image_embeds = self.patch_embed(pixel_values) + image_embeds = image_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.cls_token.expand(batch_size, 1, + -1).to(image_embeds.dtype) + embeddings = torch.cat([class_embeds, image_embeds], dim=1) + embeddings = embeddings + \ + self.position_embedding[:, : embeddings.size(1)].to( + image_embeds.dtype) + embeddings = self.pre_layernorm(embeddings) + return embeddings + + +class LayerNormFp32(nn.LayerNorm): + """Subclass torch's LayerNorm to handle fp16 (by casting to float32 and back).""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def forward(self, x: torch.Tensor): + output = torch.nn.functional.layer_norm( + x.float(), + self.normalized_shape, + self.weight.float() if self.weight is not None else None, + self.bias.float() if self.bias is not None else None, + 
self.eps, + ) + return output.type_as(x) + + +class MplugOwlVisionAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + if self.head_dim * self.num_heads != self.hidden_size: + raise ValueError( + f'hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`:' + f' {self.num_heads}).') + self.scale = self.head_dim**-0.5 + self.dropout = nn.Dropout(config.attention_dropout) + + self.query_key_value = nn.Linear(self.hidden_size, + 3 * self.hidden_size) + self.dense = nn.Linear(self.hidden_size, self.hidden_size) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, + self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], + Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, seq_len, embed_dim = hidden_states.size() + + mixed_qkv = self.query_key_value(hidden_states) + + mixed_qkv = mixed_qkv.reshape(bsz, seq_len, self.num_heads, 3, + embed_dim // self.num_heads).permute( + 3, 0, 2, 1, 4) # [3, b, np, sq, hn] + query_states, key_states, value_states = ( + mixed_qkv[0], + mixed_qkv[1], + mixed_qkv[2], + ) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_states, + key_states.transpose(-1, -2)) + + attention_scores = attention_scores * self.scale + + # Normalize the attention scores to probabilities. 
+ attention_probs = torch.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, + value_states).permute(0, 2, 1, 3) + + new_context_layer_shape = context_layer.size()[:-2] + ( + self.hidden_size, ) + context_layer = context_layer.reshape(new_context_layer_shape) + + output = self.dense(context_layer) + + outputs = (output, attention_probs) if output_attentions else (output, + None) + + return outputs + + +class QuickGELU(nn.Module): + + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class MplugOwlMLP(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = QuickGELU() + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class MplugOwlVisionEncoderLayer(nn.Module): + + def __init__(self, config: MplugOwlVisionConfig): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = MplugOwlVisionAttention(config) + self.input_layernorm = LayerNormFp32( + self.hidden_size, eps=config.layer_norm_eps) + self.mlp = MplugOwlMLP(config) + self.post_attention_layernorm = LayerNormFp32( + self.hidden_size, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states 
(`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + head_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + + hidden_states = hidden_states + residual + + outputs = (hidden_states, ) + + if output_attentions: + outputs += (attn_weights, ) + + return outputs + + +class MplugOwlPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = MplugOwlConfig + base_model_prefix = 'mplug_owl' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [ + r'position_ids', + r'language_model.encoder.embed_tokens.weight', + r'language_model.decoder.embed_tokens.weight', + r'language_model.lm_head.weight', + ] + _no_split_modules = ['MplugOwlAttention'] + _keep_in_fp32_modules = ['wo'] + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_range + if isinstance(module, nn.Conv2d) or isinstance( + module, nn.Embedding) or isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=factor) + if hasattr(module, 'bias') and module.bias is not None: + module.bias.data.zero_() + + if isinstance(module, MplugOwlVisionEmbeddings): + if hasattr(self.config, 'vision_config'): + factor = self.config.vision_config.initializer_range + nn.init.trunc_normal_( + module.position_embedding, mean=0.0, std=factor) + nn.init.trunc_normal_(module.cls_token, mean=0.0, std=factor) + + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Parameter): + nn.init.trunc_normal_(module.data, mean=0.0, std=factor) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, MplugOwlVisionEncoder): + module.gradient_checkpointing = value + + +MPLUG_OWL_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. 
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`MplugOwlConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +MPLUG_OWL_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`MplugOwlPreprocessor`]. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +MPLUG_OWL_TEXT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values` + is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`). + + To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5 + Training](./t5#training). + decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +MPLUG_OWL_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`MplugOwlPreprocessor`]. + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be + provided to serve as text prompt, which the language model can continue. + + Indices can be obtained using [`MplugOwlPreprocessor`]. 
See [`MplugOwlPreprocessor.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary of the language model. Only relevant in case an + encoder-decoder language model (like T5) is used. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. [What are decoder input IDs?](../glossary#decoder-input-ids) + + decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + Only relevant in case an encoder-decoder language model (like T5) is used. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +class MplugOwlVisionEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`MplugOwlVisionEncoderLayer`]. + + Args: + config (`MplugOwlVisionConfig`): + The corresponding vision configuration for the `MplugOwlEncoder`. 
+ """ + + def __init__(self, config: MplugOwlVisionConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([ + MplugOwlVisionEncoderLayer(config) + for _ in range(config.num_hidden_layers) + ]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Embedded representation of the inputs. Should be float, not int tokens. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states, ) + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1], ) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states, ) + + if not return_dict: + return tuple( + v for v in [hidden_states, encoder_states, all_attentions] + if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=encoder_states, + attentions=all_attentions) + + +class MplugOwlVisionModel(MplugOwlPreTrainedModel): + main_input_name = 'pixel_values' + config_class = MplugOwlVisionConfig + + def __init__(self, config: MplugOwlVisionConfig): + super().__init__(config) + self.config = config + self.hidden_size = config.hidden_size + + self.embeddings = MplugOwlVisionEmbeddings(config) + self.encoder = MplugOwlVisionEncoder(config) + self.post_layernorm = LayerNormFp32( + self.hidden_size, eps=config.layer_norm_eps) + + self.post_init() + + def forward( 
+ self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError('You have to specify pixel_values') + + hidden_states = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def get_input_embeddings(self): + return self.embeddings + + +class MplugOwlVisualAbstractorMLP(nn.Module): + + def __init__(self, config: MplugOwlVisualAbstractorConfig): + super().__init__() + self.config = config + in_features = config.hidden_size + hidden_features = config.intermediate_size + hidden_features = int(2 * hidden_features / 3) + multiple_of = 256 + hidden_features = multiple_of * \ + ((hidden_features + multiple_of - 1) // multiple_of) + self.act = nn.SiLU() + + self.w1 = nn.Linear(in_features, hidden_features) + self.w2 = nn.Linear(hidden_features, in_features) + self.w3 = 
nn.Linear(in_features, hidden_features) + self.ffn_ln = LayerNormFp32(hidden_features, eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.act( + self.w1(hidden_states)) * self.w3(hidden_states) + hidden_states = self.ffn_ln(hidden_states) + hidden_states = self.w2(hidden_states) + return hidden_states + + +class MplugOwlVisualAbstractorMultiHeadAttention(nn.Module): + + def __init__(self, config: MplugOwlVisualAbstractorConfig): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + 'The hidden size (%d) is not a multiple of the number of attention heads (%d)' + % (config.hidden_size, config.num_attention_heads)) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size + / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # If this is instantiated as a 
cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores( + self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / \ + math.sqrt(self.attention_head_size) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + if self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, + attention_probs) if output_attentions else (context_layer, ) + + outputs = outputs + (past_key_value, ) + return outputs + + +class MplugOwlVisualAbstractorCrossOutput(nn.Module): + + def __init__(self, config: MplugOwlVisualAbstractorConfig): + super().__init__() + dim = config.hidden_size + self.out_proj = nn.Linear(dim, dim, bias=True) + self.norm2 = LayerNormFp32(dim) + self.mlp = MplugOwlVisualAbstractorMLP(config) + + def forward(self, hidden_states: torch.Tensor, + input_tensor: torch.Tensor) -> torch.Tensor: + input_tensor = input_tensor + self.out_proj(hidden_states) + input_tensor = input_tensor + self.mlp(self.norm2(input_tensor)) + return input_tensor + + +class MplugOwlVisualAbstractorAttention(nn.Module): + + def __init__(self, config: MplugOwlVisualAbstractorConfig): + super().__init__() + self.attention = MplugOwlVisualAbstractorMultiHeadAttention(config) + self.output = MplugOwlVisualAbstractorCrossOutput(config) + self.pruned_heads = set() + self.norm1 = LayerNormFp32(config.hidden_size) + self.normk = LayerNormFp32(config.hidden_size) + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, + self.attention.attention_head_size, self.pruned_heads) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = 
prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer( + self.output.out_proj, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - \ + len(heads) + self.attention.all_head_size = self.attention.attention_head_size * \ + self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # HACK we apply norm on q and k + hidden_states = self.norm1(hidden_states) + encoder_hidden_states = self.normk(encoder_hidden_states) + encoder_hidden_states = torch.cat( + [hidden_states, encoder_hidden_states], dim=1) + encoder_attention_mask = torch.cat( + [attention_mask, encoder_attention_mask], dim=-1) + self_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + # add attentions if we output them + outputs = (attention_output, ) + self_outputs[1:] + return outputs + + +class MplugOwlVisualAbstractorLayer(nn.Module): + + def __init__(self, config, layer_idx): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + + self.layer_idx = layer_idx + + self.crossattention = MplugOwlVisualAbstractorAttention(config) + self.has_cross_attention = True + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + 
output_attentions=False, + ): + if encoder_hidden_states is None: + raise ValueError( + 'encoder_hidden_states must be given for cross-attention layers' + ) + cross_attention_outputs = self.crossattention( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + query_attention_output = cross_attention_outputs[0] + + outputs = (query_attention_output, ) + return outputs + + +class MplugOwlVisualAbstractorEncoder(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + self.layers = nn.ModuleList([ + MplugOwlVisualAbstractorLayer(config, layer_idx) + for layer_idx in range(config.num_hidden_layers) + ]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layers[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[ + i] if past_key_values is not None else None + + if getattr(self.config, 'gradient_checkpointing', + False) and self.training: + + def create_custom_forward(module): + + def custom_forward(*inputs): + return module(*inputs, past_key_value, + output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + 
output_attentions,
+                )
+
+            hidden_states = layer_outputs[0]
+
+        return BaseModelOutput(last_hidden_state=hidden_states, )
+
+
+class MplugOwlVisualAbstractorModel(MplugOwlPreTrainedModel):
+
+    def __init__(self, config: MplugOwlVisualAbstractorConfig,
+                 language_hidden_size):
+        super().__init__(config)
+        self.config = config
+
+        self.encoder = MplugOwlVisualAbstractorEncoder(config)
+        self.visual_fc = torch.nn.Linear(config.hidden_size,
+                                         language_hidden_size)
+        self.vit_eos = torch.nn.Parameter(
+            torch.randn(1, 1, language_hidden_size))
+        self.post_init()
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    def get_extended_attention_mask(
+        self,
+        attention_mask: torch.Tensor,
+        input_shape: Tuple[int],
+        device: torch.device,
+    ) -> torch.Tensor:
+        """
+        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
+
+        Arguments:
+            attention_mask (`torch.Tensor`):
+                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
+            input_shape (`Tuple[int]`):
+                The shape of the input to the model.
+            device (`torch.device`):
+                The device of the input to the model.
+
+        Returns:
+            `torch.Tensor` The extended attention mask, with the same dtype as `attention_mask.dtype`.
+        """
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads. 
+ if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + 'Wrong shape for input_ids (shape {}) or attention_mask (shape {})' + .format(input_shape, attention_mask.shape)) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to( + dtype=self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + query_embeds, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors: + shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and + value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are + used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape + `(batch_size, sequence_length)`. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + embedding_output = query_embeds + input_shape = embedding_output.size()[:-1] + batch_size, seq_length = input_shape + device = embedding_output.device + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ if attention_mask is None: + attention_mask = torch.ones( + (query_embeds.shape[0], query_embeds.shape[1]), + dtype=torch.long, + device=query_embeds.device) + extended_attention_mask = self.get_extended_attention_mask( + attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if type(encoder_hidden_states) == list: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[ + 0].size() + else: + ( + encoder_batch_size, + encoder_sequence_length, + _, + ) = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, + encoder_sequence_length) + + if type(encoder_attention_mask) == list: + encoder_extended_attention_mask = [ + self.invert_attention_mask(mask) + for mask in encoder_attention_mask + ] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, + self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + 
output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        pooled_output = sequence_output[:, 0, :]
+
+        sequence_output = self.visual_fc(sequence_output)
+        eos_repeat = self.vit_eos.repeat(sequence_output.shape[0], 1, 1)
+        sequence_output = torch.cat([sequence_output, eos_repeat], dim=1)
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+        )
+
+
+class MplugOwlModel(MplugOwlPreTrainedModel):
+    r"""The mPLUG-Owl model is a multi-modal conversation model that supports various modalities as input.
+    mPLUG-Owl consists of a visual encoder, a visual abstractor module and a language decoder model, which enables
+    both image and text input.
+    This model is implemented based on mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality.
+    `Paper <https://arxiv.org/abs/2304.14178>`_.
+    """
+    config_class = MplugOwlConfig
+    main_input_name = 'pixel_values'
+
+    def __init__(self, config: MplugOwlConfig):
+        super().__init__(config)
+
+        self.vision_model = MplugOwlVisionModel(config.vision_config)
+
+        self.query_tokens = nn.Parameter(
+            torch.zeros(1, config.num_query_tokens,
+                        config.visual_abstractor_config.hidden_size))
+        self.abstractor = MplugOwlVisualAbstractorModel(
+            config.visual_abstractor_config, config.text_config.hidden_size)
+
+        # if config.use_decoder_only_language_model:
+        language_model = AutoModelForCausalLM.from_config(config.text_config)
+        self.language_model = language_model
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+
+    def set_output_embeddings(self, new_embeddings):
+        self.language_model.set_output_embeddings(new_embeddings)
+
+    def get_output_embeddings(self) -> nn.Module:
+        return 
self.language_model.get_output_embeddings() + + def get_encoder(self): + return self.language_model.get_encoder() + + def get_decoder(self): + return self.language_model.get_decoder() + + def _tie_weights(self): + if not self.config.use_decoder_only_language_model: + self.language_model.encoder.embed_tokens = self.language_model.shared + self.language_model.decoder.embed_tokens = self.language_model.shared + + def get_text_features( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.Tensor] = None, + decoder_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.use_decoder_only_language_model: + text_outputs = self.language_model( + input_ids=input_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + else: + inputs_embeds = self.language_model.get_input_embeddings()( + input_ids) + + text_outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + ) + + return text_outputs + + def get_image_features( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: 
Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + return vision_outputs + + +def get_media_indices(my_list): + if isinstance(my_list, torch.Tensor): + my_list = my_list.cpu().tolist() + result = [] + for i in range(len(my_list)): + if i == 0 and my_list[i] < 0: + result.append(i) + elif my_list[i] != my_list[i - 1] and my_list[i] < 0: + result.append(i) + return result + + +class MplugOwlForConditionalGenerationHF(MplugOwlPreTrainedModel): + config_class = MplugOwlConfig + main_input_name = 'pixel_values' + + def __init__(self, config: MplugOwlConfig, **kwargs): + super().__init__(config) + + self.vision_model = MplugOwlVisionModel(config.vision_config) + + self.query_tokens = nn.Parameter( + torch.zeros(1, config.num_query_tokens, + config.visual_abstractor_config.hidden_size)) + self.abstractor = MplugOwlVisualAbstractorModel( + config.visual_abstractor_config, config.text_config.hidden_size) + + # if config.use_decoder_only_language_model: + language_model = AutoModelForCausalLM.from_config(config.text_config) + self.language_model = language_model + + # Initialize weights and apply final processing + self.post_init() + self.main_input_name = 'input_ids' + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + 
+ def get_output_embeddings(self) -> nn.Module: + return self.language_model.get_output_embeddings() + + def get_encoder(self): + return self.language_model.get_encoder() + + def get_decoder(self): + return self.language_model.get_decoder() + + def _tie_weights(self): + if not self.config.use_decoder_only_language_model: + self.language_model.encoder.embed_tokens = self.language_model.shared + self.language_model.decoder.embed_tokens = self.language_model.shared + + def _preprocess_accelerate(self): + r""" + Some pre-processing hacks to make the model `accelerate` compatible. Check + https://github.com/huggingface/transformers/pull/21707 for more details. + """ + hf_device_map = self.hf_device_map + + if len( + hf_device_map + ) > 1 and 'language_model' not in hf_device_map and torch.cuda.device_count( + ) > 1: + # warn users about unexpected behavior when using multi-GPU + mPLUG-Owl + `accelerate`. + logger.warning( + 'The `language_model` is not in the `hf_device_map` dictionary and you are running your script' + ' in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`.' + ' Please pass a `device_map` that contains `language_model` to remove this warning.' 
+ ' Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for' + ' more details on creating a `device_map` for large models.', ) + + if hasattr(self.language_model, '_hf_hook'): + self.language_model._hf_hook.io_same_device = True # For `generate` compatibility + + def forward( + self, + pixel_values: torch.FloatTensor, + input_ids: torch.FloatTensor, + num_images, + non_padding_mask: Optional[torch.LongTensor] = None, + non_media_mask: Optional[torch.LongTensor] = None, + prompt_mask: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, MplugOwlForConditionalGenerationModelOutput]: + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # get text embedding + text_tokens_ = input_ids + batch_size = input_ids.shape[0] + + media_token_indices = [ + # [:-1] since we would not use the last token for embedding + get_media_indices(text_tokens_[i][:-1]) for i in range(batch_size) + ] + text_tokens_[text_tokens_ < 0] = 1 # Not used + text_embeds = self.get_input_embeddings()( + text_tokens_) # Temporally Embedding + + if pixel_values is not None: + pixel_values = pixel_values.half() + image_embeds = self.vision_model( + pixel_values, return_dict=True).last_hidden_state + + image_attention_mask = torch.ones( + image_embeds.size()[:-1], + dtype=torch.long, + device=image_embeds.device) + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, + -1) + + query_features = self.abstractor( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + )['last_hidden_state'] + img_seq_length = 
    @torch.no_grad()
    def generate(
        self,
        pixel_values: torch.FloatTensor,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        **generate_kwargs,
    ) -> torch.LongTensor:
        """
        Overrides `generate` function to be able to use the model as a conditional generator.

        Args:
            pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width)):
                Input images to be processed.
            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt for the generation. Media
                placeholder positions carry negative token ids; they are
                located via `get_media_indices` and replaced below by the
                visual embeddings.
            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices

        Returns:
            captions (list): A list of strings of length batch_size * num_captions.
        """

        if input_ids is not None:
            batch_size = input_ids.size(0)
            # Per-sample positions of the media placeholder tokens.
            media_token_indices = [
                get_media_indices(input_ids[i]) for i in range(batch_size)
            ]
            num_images_per_sample = [len(x) for x in media_token_indices]
            input_ids = input_ids.clone()
            # Negative ids are only markers; zero them so the text
            # embedding lookup below does not index out of range.
            input_ids[input_ids < 0] = 0  # Not used

        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids).long().to(
                input_ids.device)

        if hasattr(self, 'hf_device_map'):
            # preprocess for `accelerate`
            self._preprocess_accelerate()
        batch_size = input_ids.shape[0]
        # get text embedding
        inputs_embeds = self.get_input_embeddings()(input_ids)
        # get visual embedding
        if pixel_values is not None:
            # NOTE(review): unconditional half() assumes fp16-capable
            # hardware — confirm CPU inference is not a supported path.
            pixel_values = pixel_values.half()
            pixel_values = pixel_values.to(input_ids.device)
            with torch.no_grad():
                image_embeds = self.vision_model(
                    pixel_values, return_dict=True).last_hidden_state
                image_attention_mask = torch.ones(
                    image_embeds.size()[:-1],
                    dtype=torch.long,
                    device=image_embeds.device)
                # Learnable queries of the visual abstractor condense the
                # vision features into a fixed-length visual prefix.
                query_tokens = self.query_tokens.expand(
                    image_embeds.shape[0], -1, -1)
                query_outputs = self.abstractor(
                    query_embeds=query_tokens,
                    encoder_hidden_states=image_embeds,
                    encoder_attention_mask=image_attention_mask,
                    return_dict=True,
                )
                query_output = query_outputs['last_hidden_state']
                image_embeds = query_output
                # Number of embedding positions each image occupies in the
                # spliced sequence (must match the placeholder run length
                # the preprocessor inserted).
                img_seq_length = image_embeds.shape[1]

            # ===================
            # Get actual input embeddings
            # ===================
            # Splice visual embeddings into the text embeddings at each
            # media placeholder, rebuilding the attention mask in lockstep.
            text_chunk_embeds = []
            text_chunk_attns = []
            img_idx = 0

            for b in range(batch_size):
                start = 0
                result = []
                result_attn = []
                for i, pos in enumerate(media_token_indices[b]):
                    if pos > start:
                        # Text chunk preceding this image.
                        result.append(inputs_embeds[b, start:pos])
                        result_attn.append(attention_mask[b, start:pos])
                    # The image's visual embedding block.
                    result.append(image_embeds[img_idx + i])
                    # NOTE(review): these ones default to float while the
                    # attention_mask slices are long — relies on torch.cat
                    # dtype promotion; confirm on the minimum supported
                    # torch version.
                    result_attn.append(
                        torch.ones(
                            image_embeds[img_idx + i].shape[0],
                            device=inputs_embeds.device))
                    start = pos + img_seq_length
                if start < inputs_embeds.shape[1]:
                    # Trailing text after the last image.
                    result.append(inputs_embeds[b, start:])
                    result_attn.append(attention_mask[b, start:])

                img_idx += num_images_per_sample[b]
                text_chunk_embeds.append(torch.cat(result, dim=0))
                text_chunk_attns.append(torch.cat(result_attn, dim=0))
            inputs_embeds = torch.stack(text_chunk_embeds, dim=0)
            attention_mask = torch.stack(text_chunk_attns, dim=0)

        # Delegate to the language model's standard generation loop with
        # the pre-spliced embeddings (no input_ids are passed).
        outputs = self.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            **generate_kwargs,
        )

        return outputs


@MODELS.register_module(
    Tasks.multimodal_dialogue, module_name=Models.mplug_owl)
class MplugOwlForConditionalGeneration(TorchModel):
    """ModelScope wrapper exposing the HF mPLUG-Owl model as a TorchModel."""

    def __init__(self, model_dir: str, *args, **kwargs):
        """initialize the mPLUG-Owl model from the `model_dir` path.
        Args:
            model_dir (str): the model path.
        """

        super().__init__(model_dir, *args, **kwargs)
        # Weights are loaded in half precision to fit the 7B model.
        self.model = MplugOwlForConditionalGenerationHF.from_pretrained(
            model_dir,
            torch_dtype=torch.half,
        )

    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
        """Run generation; `input` holds `pixel_values` / `input_ids` etc.

        Returns the generated token ids from the wrapped HF model.
        """
        output = self.model.generate(**input)
        return output
"} + Tasks.multimodal_dialogue: [OutputKeys.TEXT], + # auto_speech_recognition result for a single sample # { # "text": "每天都要快乐喔" diff --git a/modelscope/pipeline_inputs.py b/modelscope/pipeline_inputs.py index 29c5b8d8..8cb031e7 100644 --- a/modelscope/pipeline_inputs.py +++ b/modelscope/pipeline_inputs.py @@ -337,6 +337,9 @@ TASK_INPUTS = { Tasks.video_captioning: [InputType.VIDEO, { 'video': InputType.VIDEO, }], + Tasks.multimodal_dialogue: { + 'messages': InputType.LIST, + }, Tasks.visual_grounding: { 'image': InputType.IMAGE, 'text': InputType.TEXT diff --git a/modelscope/pipelines/multi_modal/__init__.py b/modelscope/pipelines/multi_modal/__init__.py index 2e496952..b28e9a71 100644 --- a/modelscope/pipelines/multi_modal/__init__.py +++ b/modelscope/pipelines/multi_modal/__init__.py @@ -21,6 +21,7 @@ if TYPE_CHECKING: from .diffusers_wrapped import StableDiffusionWrapperPipeline, ChineseStableDiffusionPipeline from .soonet_video_temporal_grounding_pipeline import SOONetVideoTemporalGroundingPipeline from .text_to_video_synthesis_pipeline import TextToVideoSynthesisPipeline + from .multimodal_dialogue_pipeline import MultimodalDialoguePipeline else: _import_structure = { 'image_captioning_pipeline': ['ImageCaptioningPipeline'], @@ -45,6 +46,7 @@ else: 'soonet_video_temporal_grounding_pipeline': ['SOONetVideoTemporalGroundingPipeline'], 'text_to_video_synthesis_pipeline': ['TextToVideoSynthesisPipeline'], + 'multimodal_dialogue_pipeline': ['MultimodalDialoguePipeline'] } import sys diff --git a/modelscope/pipelines/multi_modal/multimodal_dialogue_pipeline.py b/modelscope/pipelines/multi_modal/multimodal_dialogue_pipeline.py new file mode 100644 index 00000000..31df19fc --- /dev/null +++ b/modelscope/pipelines/multi_modal/multimodal_dialogue_pipeline.py @@ -0,0 +1,90 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
@PIPELINES.register_module(
    Tasks.multimodal_dialogue, module_name=Pipelines.multimodal_dialogue)
class MultimodalDialoguePipeline(Pipeline):
    r"""Multimodal dialogue pipeline built around mPLUG-Owl.

    Consumes a ``{'messages': [...]}`` dict of chat turns (``system`` /
    ``user`` / ``assistant``); a user turn's content may be a plain string
    or a list mixing strings and ``{'image': path}`` dicts. Produces a
    ``{'text': <model reply>}`` dict.

    Examples:
        >>> from modelscope.pipelines import pipeline
        >>> chatbot = pipeline('multimodal-dialogue', 'damo/multi-modal_mplug_owl_multimodal-dialogue_7b')
        >>> image = 'data/resource/portrait_input.png'
        >>> system_prompt_1 = 'The following is a conversation between a curious human and AI assistant.'
        >>> system_prompt_2 = "The assistant gives helpful, detailed, and polite answers to the user's questions."
        >>> messages = {
        >>>     'messages': [
        >>>         {
        >>>             'role': 'system',
        >>>             'content': system_prompt_1 + ' ' + system_prompt_2
        >>>         },
        >>>         {
        >>>             'role': 'user',
        >>>             'content': [{
        >>>                 'image': image
        >>>             }]
        >>>         },
        >>>         {
        >>>             'role': 'user',
        >>>             'content': 'Describe the facial expression of the man.'
        >>>         },
        >>>     ]
        >>> }
        >>> chatbot(messages)
        >>> {
        >>>     "text": "he is angry."
        >>> }
    """

    def __init__(self,
                 model: Union[Model, str],
                 preprocessor: Optional[Preprocessor] = None,
                 **kwargs):
        """Create a multimodal dialogue pipeline for prediction.

        Args:
            model: A loaded ``Model`` instance or a model id on the
                ModelScope hub.
            preprocessor: Optional preprocessor; when omitted, a
                ``MplugOwlPreprocessor`` is built for mPLUG-Owl models.
        """
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
        self.model.eval()
        build_default = preprocessor is None and isinstance(
            self.model, MplugOwlForConditionalGeneration)
        if build_default:
            self.preprocessor = MplugOwlPreprocessor(self.model.model_dir)

    def forward(self, inputs: Dict[str, Any],
                **forward_params) -> Dict[str, Any]:
        """Run the model under ``torch.no_grad``.

        ``forward_params`` may carry any generation configuration options
        understood by the transformers library.
        """
        with torch.no_grad():
            return super().forward(inputs, **forward_params)

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
        """Decode generated token ids into the output text dict.

        Args:
            inputs (Dict[str, Any]): raw generation output.

        Returns:
            Dict[str, str]: ``{'text': decoded_reply}`` for mPLUG-Owl
            models; other inputs pass through unchanged.
        """
        if not isinstance(self.model, MplugOwlForConditionalGeneration):
            return inputs
        decoded = self.preprocessor.tokenizer.decode(
            inputs[0], skip_special_tokens=True)
        return {OutputKeys.TEXT: decoded}
@PREPROCESSORS.register_module(
    Fields.multi_modal, module_name=Preprocessors.mplug_owl_preprocessor)
class MplugOwlPreprocessor(Preprocessor):
    """Preprocessor for mPLUG-Owl multimodal dialogue.

    Flattens a ``{'messages': [...]}`` chat transcript into the
    ``pixel_values`` / ``input_ids`` tensors consumed by
    ``MplugOwlForConditionalGeneration``. Each image referenced by a user
    turn is resized/normalized and is represented in the token stream by a
    run of negative placeholder ids the model later replaces with visual
    embeddings.
    """

    def __init__(self,
                 model_dir: str,
                 mode: str = ModeKeys.INFERENCE,
                 *args,
                 **kwargs):
        """
        Args:
            model_dir (str): local model directory holding the tokenizer.
            mode (str): preprocessing mode, inference by default.
        """
        super().__init__(*args, **kwargs)
        self.model_dir = model_dir
        self.mode = mode

        # Heavyweight members are created lazily on first use.
        self._tokenizer = None
        self._patch_resize_transform = None
        # FIX: the media placeholder was an empty string, which made the
        # pure-text check below always fail ('' is a substring of every
        # string) and broke the regex split on an empty pattern. mPLUG-Owl
        # marks images with the literal '<image>' tag; each image expands
        # to 65 placeholder positions (one per visual-abstractor query).
        self.media_token = {'<image>': 65}
        # Cache of already-loaded images keyed by path.
        # NOTE(review): grows without bound over the preprocessor's
        # lifetime — acceptable for short-lived pipelines; confirm.
        self._image_map = {}

    @property
    def tokenizer(self):
        """Llama tokenizer, loaded from ``model_dir`` on first access."""
        from modelscope.models.nlp.llama import LlamaTokenizer

        if self._tokenizer is None:
            self._tokenizer = LlamaTokenizer.from_pretrained(self.model_dir)
        return self._tokenizer

    @property
    def patch_resize_transform(self):
        """224x224 resize plus CLIP-statistics normalization."""
        if self._patch_resize_transform is None:
            from torchvision import transforms

            # CLIP image statistics.
            mean = (0.48145466, 0.4578275, 0.40821073)
            std = (0.26862954, 0.26130258, 0.27577711)

            self._patch_resize_transform = transforms.Compose([
                transforms.Resize((224, 224), interpolation=Image.BICUBIC),
                transforms.ToTensor(),
                transforms.Normalize(mean=mean, std=std),
            ])
        return self._patch_resize_transform

    def image_open(self, path: str) -> Tuple[Image.Image, int]:
        """Load (and cache) an image.

        Returns:
            Tuple of (PIL image, insertion order index of the path).
        """
        if path not in self._image_map:
            index = len(self._image_map)
            self._image_map[path] = (load_image(path), index)
        return self._image_map[path]

    def tokenize_text(self, text: str) -> List[int]:
        """Tokenize ``text``, expanding each media tag into placeholders.

        The i-th distinct media tag maps to the negative id ``-(i + 1)``,
        repeated once per placeholder position, so the model can locate
        where to splice the visual embeddings.
        """
        media_tokens = {
            k: -int(i + 1)
            for i, k in enumerate(self.media_token.keys())
        }
        media_lengths = self.media_token.copy()

        prompt_chunk = [self.tokenizer.bos_token_id]

        # Pure text: no media tag occurs anywhere in the prompt.
        if all(tag not in text for tag in media_tokens):
            return prompt_chunk + \
                self.tokenizer(text, add_special_tokens=False)['input_ids']

        # Multimodal text: split on media tags, keeping the tags.
        enc_chunk = prompt_chunk
        pattern = '|'.join(map(re.escape, media_tokens))
        chunk_strs = [x for x in re.split(f'({pattern})', text) if x]
        for chunk_str in chunk_strs:
            if chunk_str in media_tokens:
                enc_chunk += [media_tokens[chunk_str]
                              ] * media_lengths[chunk_str]
            else:
                enc_chunk += self.tokenizer(
                    chunk_str, add_special_tokens=False)['input_ids']
        return enc_chunk

    def convert(self, messages: Dict[str, List[Dict]]) -> Tuple[List[str], str]:
        """Flatten chat ``messages`` into image paths and a prompt string.

        Returns:
            (image_paths, prompt): every image occurrence is marked in the
            prompt by a '<image>' tag, in the same order as image_paths.
        """
        texts = []
        image = []
        messages = messages['messages']
        for turn in messages:
            # System turns carry no role prefix; others follow the
            # 'Human: ' / 'AI: ' chat template.
            if turn['role'] == 'system':
                role = ''
            elif turn['role'] == 'user':
                role = 'Human: '
            else:
                role = 'AI: '
            if isinstance(turn['content'], str):
                texts.append(f"{role}{turn['content']}")
            else:
                for t in turn['content']:
                    if isinstance(t, str):
                        texts.append(f'{role}{t}')
                    else:
                        # FIX: emit the '<image>' tag so tokenize_text can
                        # reserve the placeholder positions for the image.
                        texts.append(f'{role}<image>')
                        image.append(t['image'])
        texts = '\n'.join(texts)
        # Leave an open 'AI: ' turn prompting the model for its reply.
        texts += '\nAI: '
        return image, texts

    def __call__(self, messages: Dict[str, Any]) -> Dict[str, Any]:
        """
        Args:
            messages: {'messages': [
                {'role': 'system', 'content': 'message1'},
                {'role': 'user', 'content': 'message2'},
                {'role': 'user', 'content': ['message2', {"image": 'image_path'}, 'message3', ...]},
            ]}
                The 'role' should be chosen from ['system', 'user', 'assistant'].
                The 'content' can be either str or List[Union[str, Dict]].
        Returns:
            Dict with 'pixel_values' (stacked image tensor, or None when
            no image is present) and 'input_ids' (LongTensor of shape
            (1, sequence_length)).
        """
        images, text = self.convert(messages)

        if images:
            pixel_values = torch.stack([
                self.patch_resize_transform(self.image_open(img)[0])
                for img in images
            ], dim=0)
        else:
            pixel_values = None

        input_ids = torch.LongTensor([self.tokenize_text(text)])

        return {
            'pixel_values': pixel_values,
            'input_ids': input_ids,
        }
import unittest -from PIL import Image - from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks -from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class CLIPInterrogatorTest(unittest.TestCase, DemoCompatibilityCheck): +class CLIPInterrogatorTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_image_captioning_with_model(self): @@ -32,10 +29,6 @@ class CLIPInterrogatorTest(unittest.TestCase, DemoCompatibilityCheck): result = pipeline_caption(image) print(result[OutputKeys.CAPTION]) - @unittest.skip('demo compatibility test is only enabled on a needed-basis') - def test_demo_compatibility(self): - self.compatibility_check() - if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_face_recognition_onnx_transface.py b/tests/pipelines/test_face_recognition_onnx_transface.py index a41271c1..183257f0 100644 --- a/tests/pipelines/test_face_recognition_onnx_transface.py +++ b/tests/pipelines/test_face_recognition_onnx_transface.py @@ -6,11 +6,10 @@ import numpy as np from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks -from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class TransFaceRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): +class TransFaceRecognitionTest(unittest.TestCase): def setUp(self) -> None: self.task = Tasks.face_recognition @@ -31,10 +30,6 @@ class TransFaceRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): sim = np.dot(emb1[0], emb2[0]) print(f'Cos similarity={sim:.3f}, img1:{img1} img2:{img2}') - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_demo_compatibility(self): - self.compatibility_check() - if 
__name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_fast_instance_segmentation.py b/tests/pipelines/test_fast_instance_segmentation.py index aefd1092..d5789150 100644 --- a/tests/pipelines/test_fast_instance_segmentation.py +++ b/tests/pipelines/test_fast_instance_segmentation.py @@ -5,11 +5,10 @@ from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks -from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class FastInstanceSegmentationTest(unittest.TestCase, DemoCompatibilityCheck): +class FastInstanceSegmentationTest(unittest.TestCase): def setUp(self) -> None: self.task = Tasks.image_segmentation @@ -30,10 +29,6 @@ class FastInstanceSegmentationTest(unittest.TestCase, DemoCompatibilityCheck): task=Tasks.image_segmentation, model=model, preprocessor=None) print(pipeline_parsing(input=self.image)[OutputKeys.LABELS]) - @unittest.skip('demo compatibility test is only enabled on a needed-basis') - def test_demo_compatibility(self): - self.compatibility_check() - if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_mplug_owl_multimodal_dialogue.py b/tests/pipelines/test_mplug_owl_multimodal_dialogue.py new file mode 100644 index 00000000..57bce67e --- /dev/null +++ b/tests/pipelines/test_mplug_owl_multimodal_dialogue.py @@ -0,0 +1,100 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
# Copyright (c) Alibaba, Inc. and its affiliates.
import unittest

from modelscope.models import Model
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.test_utils import test_level


class MplugOwlMultimodalDialogueTest(unittest.TestCase):
    """Pipeline-level tests for the mPLUG-Owl multimodal dialogue model."""

    # FIX: the model id, system prompt and messages dict were copy-pasted
    # across all three test methods; they are factored out here so the
    # fixtures stay in sync.
    MODEL_ID = 'damo/multi-modal_mplug_owl_multimodal-dialogue_7b'
    IMAGE = 'data/resource/portrait_input.png'
    SYSTEM_PROMPT = (
        'The following is a conversation between a curious human and AI '
        'assistant. The assistant gives helpful, detailed, and polite '
        "answers to the user's questions.")

    def _build_messages(self, *user_contents):
        """Build a messages dict: one system turn plus the given user turns."""
        turns = [{'role': 'system', 'content': self.SYSTEM_PROMPT}]
        turns.extend({'role': 'user', 'content': c} for c in user_contents)
        return {'messages': turns}

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_with_multimodal_dialogue_with_model(self):
        # Build the pipeline from an already-loaded Model instance.
        model = Model.from_pretrained(self.MODEL_ID)
        pipeline_multimodal_dialogue = pipeline(
            task=Tasks.multimodal_dialogue,
            model=model,
        )
        messages = self._build_messages(
            [{'image': self.IMAGE}],
            'Describe the facial expression of the man.',
        )
        result = pipeline_multimodal_dialogue(messages)
        print(result[OutputKeys.TEXT])

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_multimodal_dialogue_with_name(self):
        # Build the pipeline directly from the hub model id.
        pipeline_multimodal_dialogue = pipeline(
            Tasks.multimodal_dialogue, model=self.MODEL_ID)
        messages = self._build_messages(
            [{'image': self.IMAGE}],
            'Describe the facial expression of the man.',
        )
        result = pipeline_multimodal_dialogue(messages)
        print(result[OutputKeys.TEXT])

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_multimodal_dialogue_with_text(self):
        # Text-only dialogue: no image turn at all.
        pipeline_multimodal_dialogue = pipeline(
            Tasks.multimodal_dialogue, model=self.MODEL_ID)
        # FIX: corrected the 'captial' typo in the prompt.
        messages = self._build_messages('Where is the capital of China?')
        result = pipeline_multimodal_dialogue(messages)
        print(result[OutputKeys.TEXT])


if __name__ == '__main__':
    unittest.main()