mirror of
https://github.com/modelscope/modelscope.git
synced 2026-02-24 04:01:10 +01:00
Support mPLUG-Owl model.
Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/12610417
This commit is contained in:
committed by
xingjun.wxj
parent
1d28c5b730
commit
b9c8c99776
Submodule data/test updated: 91b37f8d62...8d0625256b
@@ -202,6 +202,7 @@ class Models(object):
|
||||
hitea = 'hitea'
|
||||
soonet = 'soonet'
|
||||
efficient_diffusion_tuning = 'efficient-diffusion-tuning'
|
||||
mplug_owl = 'mplug-owl'
|
||||
clip_interrogator = 'clip-interrogator'
|
||||
|
||||
# science models
|
||||
@@ -512,6 +513,7 @@ class Pipelines(object):
|
||||
gridvlp_multi_modal_embedding = 'gridvlp-multi-modal-embedding'
|
||||
soonet_video_temporal_grounding = 'soonet-video-temporal-grounding'
|
||||
efficient_diffusion_tuning = 'efficient-diffusion-tuning'
|
||||
multimodal_dialogue = 'multimodal-dialogue'
|
||||
|
||||
# science tasks
|
||||
protein_structure = 'unifold-protein-structure'
|
||||
@@ -1030,6 +1032,7 @@ class Preprocessors(object):
|
||||
vldoc_preprocessor = 'vldoc-preprocessor'
|
||||
hitea_tasks_preprocessor = 'hitea-tasks-preprocessor'
|
||||
diffusion_image_generation_preprocessor = 'diffusion-image-generation-preprocessor'
|
||||
mplug_owl_preprocessor = 'mplug-owl-preprocessor'
|
||||
image_captioning_clip_interrogator_preprocessor = 'image-captioning-clip-interrogator-preprocessor'
|
||||
|
||||
# science preprocessor
|
||||
|
||||
@@ -20,6 +20,7 @@ if TYPE_CHECKING:
|
||||
from .vldoc import VLDocForDocVLEmbedding
|
||||
from .video_synthesis import TextToVideoSynthesis
|
||||
from .efficient_diffusion_tuning import EfficientStableDiffusion
|
||||
from .mplug_owl import MplugOwlForConditionalGeneration
|
||||
from .clip_interrogator import CLIP_Interrogator
|
||||
|
||||
else:
|
||||
@@ -39,6 +40,7 @@ else:
|
||||
'vldoc': ['VLDocForDocVLEmbedding'],
|
||||
'video_synthesis': ['TextToVideoSynthesis'],
|
||||
'efficient_diffusion_tuning': ['EfficientStableDiffusion'],
|
||||
'mplug_owl': ['MplugOwlForConditionalGeneration'],
|
||||
'clip_interrogator': ['CLIP_Interrogator'],
|
||||
}
|
||||
|
||||
|
||||
18
modelscope/models/multi_modal/mplug_owl/__init__.py
Normal file
18
modelscope/models/multi_modal/mplug_owl/__init__.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# Copyright 2021-2023 The Alibaba DAMO mPLUG Authors.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .configuration_mplug_owl import (MplugOwlConfig, MplugOwlVisionConfig,
|
||||
MplugOwlVisualAbstractorConfig)
|
||||
from .modeling_mplug_owl import MplugOwlForConditionalGeneration
|
||||
@@ -0,0 +1,257 @@
|
||||
# Copyright 2021-2023 The Alibaba DAMO mPLUG Team Authors.
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" MPLUG OWL model configuration """
|
||||
import copy
|
||||
import os
|
||||
from typing import Union
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
from transformers.models.auto import CONFIG_MAPPING
|
||||
from transformers.utils import logging
|
||||
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
logger = logging.get_logger()
|
||||
|
||||
|
||||
class MplugOwlVisionConfig(PretrainedConfig):
    r"""Configuration for the mPLUG-Owl vision encoder (a CLIP-style ViT).

    Args:
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer
            in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 768):
            Dimensionality of the projection head.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the
            Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of input image channels.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the
            encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"`,
            `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to
            1, used internally for initialization testing).
        use_flash_attn (`bool`, *optional*, defaults to `False`):
            Whether to use flash-attention kernels in the encoder.
    """

    model_type = 'mplug_owl_vision_model'

    def __init__(
        self,
        hidden_size=1024,
        intermediate_size=4096,
        projection_dim=768,
        num_hidden_layers=24,
        num_attention_heads=16,
        num_channels=3,
        image_size=224,
        patch_size=14,
        hidden_act='quick_gelu',
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        use_flash_attn=False,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.use_flash_attn = use_flash_attn

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str,
                                                                  os.PathLike],
                        **kwargs) -> 'PretrainedConfig':
        """Load the vision sub-config, unwrapping a composite MplugOwlConfig
        dict when necessary."""
        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)

        # get the vision config dict if we are loading from MplugOwlConfig
        if config_dict.get('model_type') == 'mplug_owl':
            config_dict = config_dict['vision_config']

        if 'model_type' in config_dict and hasattr(
                cls,
                'model_type') and config_dict['model_type'] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
            )

        return cls.from_dict(config_dict, **kwargs)
||||
|
||||
|
||||
class MplugOwlVisualAbstractorConfig(PretrainedConfig):
    r"""Configuration for the mPLUG-Owl visual abstractor (the Q-Former-like
    module that compresses vision features before the language model).

    Args:
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the abstractor layers.
        num_hidden_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the abstractor.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer.
        intermediate_size (`int`, *optional*, defaults to 4096):
            Dimensionality of the feed-forward layer.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        encoder_hidden_size (`int`, *optional*, defaults to 1024):
            Hidden size of the vision encoder whose outputs are attended to.
    """

    model_type = 'MPlugOwlVisualAbstractor'

    def __init__(
        self,
        hidden_size=1024,
        num_hidden_layers=6,
        num_attention_heads=16,
        intermediate_size=4096,
        attention_probs_dropout_prob=0.1,
        initializer_range=0.02,
        layer_norm_eps=1e-6,
        encoder_hidden_size=1024,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.encoder_hidden_size = encoder_hidden_size

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str,
                                                                  os.PathLike],
                        **kwargs) -> 'PretrainedConfig':
        """Load the abstractor sub-config, unwrapping a composite
        MplugOwlConfig dict when necessary."""
        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)

        # get the visual abstractor config dict if we are loading from
        # MplugOwlConfig (stored under the 'abstractor_config' key)
        if config_dict.get('model_type') == 'mplug_owl':
            config_dict = config_dict['abstractor_config']

        if 'model_type' in config_dict and hasattr(
                cls,
                'model_type') and config_dict['model_type'] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
            )

        return cls.from_dict(config_dict, **kwargs)
||||
|
||||
|
||||
class MplugOwlConfig(PretrainedConfig):
    r"""Composite configuration holding the vision, visual-abstractor and
    text sub-configurations of the mPLUG-Owl model.

    Args:
        task (`str`, *optional*, defaults to `Tasks.multimodal_dialogue`):
            The ModelScope task this configuration serves.
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize
            [`MplugOwlVisionConfig`].
        visual_abstractor_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize
            [`MplugOwlVisualAbstractorConfig`].
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize any
            [`PretrainedConfig`]; LLaMA 7B is used when omitted.
        num_query_tokens (`int`, *optional*, defaults to 64):
            The number of query tokens passed through the Transformer.

        kwargs (*optional*):
            Dictionary of keyword arguments.
    """

    model_type = 'mplug_owl'
    is_composition = True

    def __init__(self,
                 task=Tasks.multimodal_dialogue,
                 vision_config=None,
                 visual_abstractor_config=None,
                 text_config=None,
                 num_query_tokens=64,
                 **kwargs):

        super().__init__(**kwargs)
        self.task = task
        if vision_config is None:
            vision_config = MplugOwlVisionConfig().to_dict()
            logger.info('vision_config is None.')

        if visual_abstractor_config is None:
            visual_abstractor_config = {}
            logger.info('abstractor_config is None. ')

        if text_config is None:
            # we use LLAMA 7b by default
            from transformers.models.llama.configuration_llama import \
                LlamaConfig
            text_config = LlamaConfig(pad_token_id=2).to_dict()
            logger.info('text_config is None.')

        self.vision_config = MplugOwlVisionConfig(**vision_config)
        self.visual_abstractor_config = MplugOwlVisualAbstractorConfig(
            **visual_abstractor_config)
        # Fall back to LLaMA when the text config does not name a model type.
        text_model_type = text_config[
            'model_type'] if 'model_type' in text_config else 'llama'
        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)

        # Mirror the language model's embedding-tying choice at the top level.
        self.tie_word_embeddings = self.text_config.tie_word_embeddings

        self.num_query_tokens = num_query_tokens
        self.initializer_factor = 1.0
        self.initializer_range = 0.02

    @classmethod
    def from_vision_abstractor_text_configs(
        cls,
        vision_config: MplugOwlVisionConfig,
        visual_abstractor_config: MplugOwlVisualAbstractorConfig,
        text_config: PretrainedConfig,
        **kwargs,
    ):
        r"""Build a composite config from already-constructed sub-configs.

        Returns:
            [`MplugOwlConfig`]: An instance of a configuration object
        """

        return cls(
            vision_config=vision_config.to_dict(),
            visual_abstractor_config=visual_abstractor_config.to_dict(),
            text_config=text_config.to_dict(),
            **kwargs,
        )

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default
        [`~PretrainedConfig.to_dict`] so nested sub-configs are serialized
        recursively.

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up
            this configuration instance,
        """
        output = copy.deepcopy(self.__dict__)
        output['vision_config'] = self.vision_config.to_dict()
        output['visual_abstractor_config'] = \
            self.visual_abstractor_config.to_dict()
        output['text_config'] = self.text_config.to_dict()
        output['model_type'] = self.__class__.model_type
        return output
||||
1551
modelscope/models/multi_modal/mplug_owl/modeling_mplug_owl.py
Normal file
1551
modelscope/models/multi_modal/mplug_owl/modeling_mplug_owl.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1369,6 +1369,10 @@ TASK_OUTPUTS = {
|
||||
# {"text": "this is a text answser. "}
|
||||
Tasks.video_question_answering: [OutputKeys.TEXT],
|
||||
|
||||
# Multimodal Dialogue result for a sample
|
||||
# {"text": "this is a text response. "}
|
||||
Tasks.multimodal_dialogue: [OutputKeys.TEXT],
|
||||
|
||||
# auto_speech_recognition result for a single sample
|
||||
# {
|
||||
# "text": "每天都要快乐喔"
|
||||
|
||||
@@ -337,6 +337,9 @@ TASK_INPUTS = {
|
||||
Tasks.video_captioning: [InputType.VIDEO, {
|
||||
'video': InputType.VIDEO,
|
||||
}],
|
||||
Tasks.multimodal_dialogue: {
|
||||
'messages': InputType.LIST,
|
||||
},
|
||||
Tasks.visual_grounding: {
|
||||
'image': InputType.IMAGE,
|
||||
'text': InputType.TEXT
|
||||
|
||||
@@ -21,6 +21,7 @@ if TYPE_CHECKING:
|
||||
from .diffusers_wrapped import StableDiffusionWrapperPipeline, ChineseStableDiffusionPipeline
|
||||
from .soonet_video_temporal_grounding_pipeline import SOONetVideoTemporalGroundingPipeline
|
||||
from .text_to_video_synthesis_pipeline import TextToVideoSynthesisPipeline
|
||||
from .multimodal_dialogue_pipeline import MultimodalDialoguePipeline
|
||||
else:
|
||||
_import_structure = {
|
||||
'image_captioning_pipeline': ['ImageCaptioningPipeline'],
|
||||
@@ -45,6 +46,7 @@ else:
|
||||
'soonet_video_temporal_grounding_pipeline':
|
||||
['SOONetVideoTemporalGroundingPipeline'],
|
||||
'text_to_video_synthesis_pipeline': ['TextToVideoSynthesisPipeline'],
|
||||
'multimodal_dialogue_pipeline': ['MultimodalDialoguePipeline']
|
||||
}
|
||||
|
||||
import sys
|
||||
|
||||
@@ -0,0 +1,90 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from typing import Any, Dict, Optional, Union
|
||||
|
||||
import torch
|
||||
|
||||
from modelscope.metainfo import Pipelines
|
||||
from modelscope.models.multi_modal import MplugOwlForConditionalGeneration
|
||||
from modelscope.outputs import OutputKeys, TokenGeneratorOutput
|
||||
from modelscope.pipelines.base import Model, Pipeline
|
||||
from modelscope.pipelines.builder import PIPELINES
|
||||
from modelscope.preprocessors import MplugOwlPreprocessor, Preprocessor
|
||||
from modelscope.utils.constant import Tasks
|
||||
from modelscope.utils.logger import get_logger
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
|
||||
@PIPELINES.register_module(
    Tasks.multimodal_dialogue, module_name=Pipelines.multimodal_dialogue)
class MultimodalDialoguePipeline(Pipeline):
    r""" Multimodal Dialogue Pipeline.

    Examples:
        >>> from modelscope.pipelines import pipeline
        >>> chatbot = pipeline('multimodal-dialogue', 'damo/multi-modal_mplug_owl_multimodal-dialogue_7b')
        >>> image = 'data/resource/portrait_input.png'
        >>> system_prompt_1 = 'The following is a conversation between a curious human and AI assistant.'
        >>> system_prompt_2 = "The assistant gives helpful, detailed, and polite answers to the user's questions."
        >>> messages = {
        >>>     'messages': [
        >>>         {
        >>>             'role': 'system',
        >>>             'content': system_prompt_1 + ' ' + system_prompt_2
        >>>         },
        >>>         {
        >>>             'role': 'user',
        >>>             'content': [{
        >>>                 'image': image
        >>>             }]
        >>>         },
        >>>         {
        >>>             'role': 'user',
        >>>             'content': 'Describe the facial expression of the man.'
        >>>         },
        >>>     ]
        >>> }
        >>> chatbot(messages)
        >>> {
        >>>     "text": he is angry.
        >>> }
        >>>
    """

    def __init__(self,
                 model: Union[Model, str],
                 preprocessor: Optional[Preprocessor] = None,
                 **kwargs):
        """
        use `model` and `preprocessor` to create a multimodal dialogue
        pipeline for prediction

        Args:
            model: model id on modelscope hub.
            preprocessor: an optional, already-built preprocessor; when None,
                a default one is constructed for known model types.
        """
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
        self.model.eval()
        if preprocessor is None:
            if isinstance(self.model, MplugOwlForConditionalGeneration):
                self.preprocessor = MplugOwlPreprocessor(self.model.model_dir)
            else:
                # Previously this fell through silently, leaving
                # self.preprocessor as None; warn so the eventual failure
                # downstream is attributable.
                logger.warning(
                    'No default preprocessor is available for model type %s; '
                    'self.preprocessor is None.',
                    type(self.model).__name__)

    def forward(self, inputs: Dict[str, Any],
                **forward_params) -> Dict[str, Any]:
        """
        the `forward_params` can be the generation configurations listed in
        transformers library.
        """
        # Inference only: disable autograd bookkeeping.
        with torch.no_grad():
            return super().forward(inputs, **forward_params)

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
        """process the prediction results

        Args:
            inputs (Dict[str, Any]): generated token ids from forward().

        Returns:
            Dict[str, str]: the prediction results
        """
        if isinstance(self.model, MplugOwlForConditionalGeneration):
            # Decode the first generated sequence back to text.
            output = self.preprocessor.tokenizer.decode(
                inputs[0], skip_special_tokens=True)
            inputs = {OutputKeys.TEXT: output}
        return inputs
||||
@@ -20,7 +20,7 @@ if TYPE_CHECKING:
|
||||
from .tts import KanttsDataPreprocessor
|
||||
from .multi_modal import (DiffusionImageGenerationPreprocessor,
|
||||
OfaPreprocessor, MPlugPreprocessor,
|
||||
HiTeAPreprocessor,
|
||||
HiTeAPreprocessor, MplugOwlPreprocessor,
|
||||
ImageCaptioningClipInterrogatorPreprocessor)
|
||||
from .nlp import (
|
||||
DocumentSegmentationTransformersPreprocessor,
|
||||
@@ -71,7 +71,7 @@ else:
|
||||
'tts': ['KanttsDataPreprocessor'],
|
||||
'multi_modal': [
|
||||
'DiffusionImageGenerationPreprocessor', 'OfaPreprocessor',
|
||||
'MPlugPreprocessor', 'HiTeAPreprocessor',
|
||||
'MPlugPreprocessor', 'HiTeAPreprocessor', 'MplugOwlPreprocessor',
|
||||
'ImageCaptioningClipInterrogatorPreprocessor'
|
||||
],
|
||||
'nlp': [
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
import os.path as osp
|
||||
import re
|
||||
from io import BytesIO
|
||||
from typing import Any, Dict, List, Tuple, Union
|
||||
|
||||
@@ -29,7 +30,7 @@ from .ofa.utils.constant import OFA_TASK_KEY_MAPPING
|
||||
|
||||
__all__ = [
|
||||
'DiffusionImageGenerationPreprocessor', 'OfaPreprocessor',
|
||||
'MPlugPreprocessor', 'HiTeAPreprocessor'
|
||||
'MPlugPreprocessor', 'HiTeAPreprocessor', 'MplugOwlPreprocessor'
|
||||
]
|
||||
|
||||
|
||||
@@ -644,6 +645,148 @@ class HiTeAPreprocessor(Preprocessor):
|
||||
return output
|
||||
|
||||
|
||||
@PREPROCESSORS.register_module(
    Fields.multi_modal, module_name=Preprocessors.mplug_owl_preprocessor)
class MplugOwlPreprocessor(Preprocessor):
    """Preprocessor for the mPLUG-Owl multimodal dialogue model.

    Converts a chat-style ``messages`` payload (see :meth:`__call__`) into
    the ``pixel_values`` and ``input_ids`` tensors consumed by
    ``MplugOwlForConditionalGeneration``.
    """

    def __init__(self,
                 model_dir: str,
                 mode: str = ModeKeys.INFERENCE,
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.model_dir = model_dir
        self.mode = mode

        # Tokenizer and image transform are built lazily on first access.
        self._tokenizer = None
        self._patch_resize_transform = None
        # Each media token expands to this many placeholder ids in input_ids.
        self.media_token = {'<image>': 65}
        # NOTE(review): unbounded cache of loaded images keyed by path; it
        # grows for the lifetime of the preprocessor — consider bounding it
        # if this runs in a long-lived service.
        self._image_map = {}

    @property
    def tokenizer(self):
        """Lazily load the LLaMA tokenizer from the model directory."""
        from modelscope.models.nlp.llama import LlamaTokenizer

        if self._tokenizer is None:
            self._tokenizer = LlamaTokenizer.from_pretrained(self.model_dir)
        return self._tokenizer

    @property
    def patch_resize_transform(self):
        """Lazily build the 224x224 resize + CLIP normalization transform."""
        if self._patch_resize_transform is None:
            from torchvision import transforms

            # CLIP image normalization statistics.
            mean = (0.48145466, 0.4578275, 0.40821073)
            std = (0.26862954, 0.26130258, 0.27577711)

            self._patch_resize_transform = transforms.Compose([
                # NOTE(review): Image.BICUBIC is deprecated in newer Pillow;
                # transforms.InterpolationMode.BICUBIC is the modern spelling.
                transforms.Resize((224, 224), interpolation=Image.BICUBIC),
                transforms.ToTensor(),
                transforms.Normalize(mean=mean, std=std),
            ])
        return self._patch_resize_transform

    def image_open(self, path: str) -> Tuple[Image.Image, int]:
        """Load an image, caching it together with its first-seen index."""
        if path not in self._image_map:
            index = len(self._image_map)
            self._image_map[path] = (load_image(path), index)
        return self._image_map[path]

    def tokenize_text(self, text: str) -> List[int]:
        """Tokenize text, expanding each media token into its run of
        negative placeholder ids (filled with image features downstream)."""
        # Map each media token to a distinct negative id: '<image>' -> -1, ...
        media_tokens = {
            k: -int(i + 1)
            for i, k in enumerate(self.media_token.keys())
        }
        media_lengths = self.media_token.copy()

        prompt_chunk = [self.tokenizer.bos_token_id]

        # Pure Text
        condition = [
            media_token not in text for media_token in media_tokens.keys()
        ]
        if all(condition):
            enc_chunk = prompt_chunk + \
                self.tokenizer(text, add_special_tokens=False)['input_ids']

        # Multi-Modal Text
        else:
            enc_chunk = prompt_chunk
            # Split on media tokens, keeping the tokens themselves.
            pattern = '|'.join(map(re.escape, list(media_tokens.keys())))
            chunk_strs = re.split(f'({pattern})', text)
            chunk_strs = [x for x in chunk_strs if len(x) > 0]
            for idx, chunk_str in enumerate(chunk_strs):
                if chunk_str in media_tokens:
                    enc_chunk += [media_tokens[chunk_str]] * \
                        media_lengths[chunk_str]
                else:
                    tmp_chunk = self.tokenizer(
                        chunk_str, add_special_tokens=False)['input_ids']
                    enc_chunk += tmp_chunk
        return enc_chunk

    def convert(self, messages: Dict[str, List[Dict]]) -> Tuple[List, str]:
        """Flatten the chat messages into a single prompt string plus the
        list of referenced image paths.

        Returns:
            Tuple of (image paths in order of appearance, prompt text ending
            with the 'AI: ' generation cue).
        """
        texts = []
        image = []
        messages = messages['messages']
        for turn in messages:
            if turn['role'] == 'system':
                role = ''
            elif turn['role'] == 'user':
                role = 'Human: '
            else:
                role = 'AI: '
            if isinstance(turn['content'], str):
                text = f"{role}{turn['content']}"
                texts.append(text)
            else:
                for t in turn['content']:
                    if isinstance(t, str):
                        text = f'{role}{t}'
                    else:
                        # Dict entries carry an image; the prompt gets the
                        # '<image>' placeholder in its place.
                        text = f'{role}<image>'
                        image.append(t['image'])
                    texts.append(text)
        texts = '\n'.join(texts)
        texts += '\nAI: '
        return image, texts

    def __call__(self, messages: Dict[str, Any]) -> Dict[str, Any]:
        """
        Args:
            messages: {[
                {'role': 'system', 'content': 'message1'},
                {'role': 'user', 'content': 'message2'},
                {'role': 'user', 'content': ['message2', {"image": 'image_path'}, 'message3', ...]},
            ]}
            The 'role' should be choose from ['system', 'user', 'assistant'].
            The 'content' can be either str or List[Union[str, Dict]]
        Return:
            output: Dict[str, Tensor]
        """
        output = {}
        images, text = self.convert(messages)

        if len(images) > 0:
            pixel_values = []
            for image in images:
                pixel_values.append(
                    self.patch_resize_transform(self.image_open(image)[0]))
            # Stack per-image tensors along a new batch dimension.
            pixel_values = torch.stack(pixel_values, dim=0)
        else:
            pixel_values = None

        input_ids = self.tokenize_text(text)
        input_ids = torch.LongTensor([input_ids])

        output = {
            'pixel_values': pixel_values,
            'input_ids': input_ids,
        }

        return output
||||
|
||||
|
||||
@PREPROCESSORS.register_module(
|
||||
Fields.multi_modal,
|
||||
module_name=Preprocessors.image_captioning_clip_interrogator_preprocessor)
|
||||
|
||||
@@ -247,6 +247,7 @@ class MultiModalTasks(object):
|
||||
video_temporal_grounding = 'video-temporal-grounding'
|
||||
text_to_video_synthesis = 'text-to-video-synthesis'
|
||||
efficient_diffusion_tuning = 'efficient-diffusion-tuning'
|
||||
multimodal_dialogue = 'multimodal-dialogue'
|
||||
|
||||
|
||||
class ScienceTasks(object):
|
||||
|
||||
@@ -1,17 +1,14 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
import unittest
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from modelscope.models import Model
|
||||
from modelscope.outputs import OutputKeys
|
||||
from modelscope.pipelines import pipeline
|
||||
from modelscope.utils.constant import Tasks
|
||||
from modelscope.utils.demo_utils import DemoCompatibilityCheck
|
||||
from modelscope.utils.test_utils import test_level
|
||||
|
||||
|
||||
class CLIPInterrogatorTest(unittest.TestCase, DemoCompatibilityCheck):
|
||||
class CLIPInterrogatorTest(unittest.TestCase):
|
||||
|
||||
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
|
||||
def test_run_with_image_captioning_with_model(self):
|
||||
@@ -32,10 +29,6 @@ class CLIPInterrogatorTest(unittest.TestCase, DemoCompatibilityCheck):
|
||||
result = pipeline_caption(image)
|
||||
print(result[OutputKeys.CAPTION])
|
||||
|
||||
@unittest.skip('demo compatibility test is only enabled on a needed-basis')
|
||||
def test_demo_compatibility(self):
|
||||
self.compatibility_check()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
||||
@@ -6,11 +6,10 @@ import numpy as np
|
||||
from modelscope.outputs import OutputKeys
|
||||
from modelscope.pipelines import pipeline
|
||||
from modelscope.utils.constant import Tasks
|
||||
from modelscope.utils.demo_utils import DemoCompatibilityCheck
|
||||
from modelscope.utils.test_utils import test_level
|
||||
|
||||
|
||||
class TransFaceRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
|
||||
class TransFaceRecognitionTest(unittest.TestCase):
|
||||
|
||||
def setUp(self) -> None:
|
||||
self.task = Tasks.face_recognition
|
||||
@@ -31,10 +30,6 @@ class TransFaceRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
|
||||
sim = np.dot(emb1[0], emb2[0])
|
||||
print(f'Cos similarity={sim:.3f}, img1:{img1} img2:{img2}')
|
||||
|
||||
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
|
||||
def test_demo_compatibility(self):
|
||||
self.compatibility_check()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
||||
@@ -5,11 +5,10 @@ from modelscope.models import Model
|
||||
from modelscope.outputs import OutputKeys
|
||||
from modelscope.pipelines import pipeline
|
||||
from modelscope.utils.constant import Tasks
|
||||
from modelscope.utils.demo_utils import DemoCompatibilityCheck
|
||||
from modelscope.utils.test_utils import test_level
|
||||
|
||||
|
||||
class FastInstanceSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
|
||||
class FastInstanceSegmentationTest(unittest.TestCase):
|
||||
|
||||
def setUp(self) -> None:
|
||||
self.task = Tasks.image_segmentation
|
||||
@@ -30,10 +29,6 @@ class FastInstanceSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
|
||||
task=Tasks.image_segmentation, model=model, preprocessor=None)
|
||||
print(pipeline_parsing(input=self.image)[OutputKeys.LABELS])
|
||||
|
||||
@unittest.skip('demo compatibility test is only enabled on a needed-basis')
|
||||
def test_demo_compatibility(self):
|
||||
self.compatibility_check()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
||||
100
tests/pipelines/test_mplug_owl_multimodal_dialogue.py
Normal file
100
tests/pipelines/test_mplug_owl_multimodal_dialogue.py
Normal file
@@ -0,0 +1,100 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
import unittest
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from modelscope.models import Model
|
||||
from modelscope.outputs import OutputKeys
|
||||
from modelscope.pipelines import pipeline
|
||||
from modelscope.utils.constant import Tasks
|
||||
from modelscope.utils.test_utils import test_level
|
||||
|
||||
|
||||
class MplugOwlMultimodalDialogueTest(unittest.TestCase):
    """Pipeline-level tests for the mPLUG-Owl multimodal dialogue model."""

    # Shared fixtures for all test cases.
    MODEL_ID = 'damo/multi-modal_mplug_owl_multimodal-dialogue_7b'
    IMAGE = 'data/resource/portrait_input.png'
    SYSTEM_PROMPT = (
        'The following is a conversation between a curious human and AI '
        'assistant. The assistant gives helpful, detailed, and polite '
        "answers to the user's questions.")

    def _build_messages(self, user_turns):
        """Prepend the shared system prompt to the given user turns and wrap
        them in the pipeline's expected payload shape."""
        return {
            'messages': [{
                'role': 'system',
                'content': self.SYSTEM_PROMPT
            }] + user_turns
        }

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_with_multimodal_dialogue_with_model(self):
        model = Model.from_pretrained(self.MODEL_ID)
        pipeline_multimodal_dialogue = pipeline(
            task=Tasks.multimodal_dialogue,
            model=model,
        )
        messages = self._build_messages([
            {
                'role': 'user',
                'content': [{
                    'image': self.IMAGE
                }]
            },
            {
                'role': 'user',
                'content': 'Describe the facial expression of the man.'
            },
        ])
        result = pipeline_multimodal_dialogue(messages)
        print(result[OutputKeys.TEXT])

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_multimodal_dialogue_with_name(self):
        pipeline_multimodal_dialogue = pipeline(
            Tasks.multimodal_dialogue, model=self.MODEL_ID)
        messages = self._build_messages([
            {
                'role': 'user',
                'content': [{
                    'image': self.IMAGE
                }]
            },
            {
                'role': 'user',
                'content': 'Describe the facial expression of the man.'
            },
        ])
        result = pipeline_multimodal_dialogue(messages)
        print(result[OutputKeys.TEXT])

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_multimodal_dialogue_with_text(self):
        pipeline_multimodal_dialogue = pipeline(
            Tasks.multimodal_dialogue, model=self.MODEL_ID)
        # Typo fixed: 'captial' -> 'capital'.
        messages = self._build_messages([
            {
                'role': 'user',
                'content': 'Where is the capital of China?'
            },
        ])
        result = pipeline_multimodal_dialogue(messages)
        print(result[OutputKeys.TEXT])


if __name__ == '__main__':
    unittest.main()
||||
@@ -1,6 +1,5 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
import os.path
|
||||
import unittest
|
||||
from typing import Any, Dict, List, Union
|
||||
|
||||
@@ -81,10 +80,6 @@ class SpeakerVerificationTest(unittest.TestCase):
|
||||
print(result)
|
||||
self.assertTrue(OutputKeys.TEXT in result)
|
||||
|
||||
@unittest.skip('demo compatibility test is only enabled on a needed-basis')
|
||||
def test_demo_compatibility(self):
|
||||
self.compatibility_check()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
||||
Reference in New Issue
Block a user