mirror of
https://github.com/modelscope/modelscope.git
synced 2026-02-24 04:01:10 +01:00
Support mPLUG-Owl model.
Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/12610417
This commit is contained in:
committed by
xingjun.wxj
parent
1d28c5b730
commit
b9c8c99776
Submodule data/test updated: 91b37f8d62...8d0625256b
@@ -202,6 +202,7 @@ class Models(object):
|
||||
hitea = 'hitea'
|
||||
soonet = 'soonet'
|
||||
efficient_diffusion_tuning = 'efficient-diffusion-tuning'
|
||||
mplug_owl = 'mplug-owl'
|
||||
clip_interrogator = 'clip-interrogator'
|
||||
|
||||
# science models
|
||||
@@ -512,6 +513,7 @@ class Pipelines(object):
|
||||
gridvlp_multi_modal_embedding = 'gridvlp-multi-modal-embedding'
|
||||
soonet_video_temporal_grounding = 'soonet-video-temporal-grounding'
|
||||
efficient_diffusion_tuning = 'efficient-diffusion-tuning'
|
||||
multimodal_dialogue = 'multimodal-dialogue'
|
||||
|
||||
# science tasks
|
||||
protein_structure = 'unifold-protein-structure'
|
||||
@@ -1030,6 +1032,7 @@ class Preprocessors(object):
|
||||
vldoc_preprocessor = 'vldoc-preprocessor'
|
||||
hitea_tasks_preprocessor = 'hitea-tasks-preprocessor'
|
||||
diffusion_image_generation_preprocessor = 'diffusion-image-generation-preprocessor'
|
||||
mplug_owl_preprocessor = 'mplug-owl-preprocessor'
|
||||
image_captioning_clip_interrogator_preprocessor = 'image-captioning-clip-interrogator-preprocessor'
|
||||
|
||||
# science preprocessor
|
||||
|
||||
@@ -20,6 +20,7 @@ if TYPE_CHECKING:
|
||||
from .vldoc import VLDocForDocVLEmbedding
|
||||
from .video_synthesis import TextToVideoSynthesis
|
||||
from .efficient_diffusion_tuning import EfficientStableDiffusion
|
||||
from .mplug_owl import MplugOwlForConditionalGeneration
|
||||
from .clip_interrogator import CLIP_Interrogator
|
||||
|
||||
else:
|
||||
@@ -39,6 +40,7 @@ else:
|
||||
'vldoc': ['VLDocForDocVLEmbedding'],
|
||||
'video_synthesis': ['TextToVideoSynthesis'],
|
||||
'efficient_diffusion_tuning': ['EfficientStableDiffusion'],
|
||||
'mplug_owl': ['MplugOwlForConditionalGeneration'],
|
||||
'clip_interrogator': ['CLIP_Interrogator'],
|
||||
}
|
||||
|
||||
|
||||
18
modelscope/models/multi_modal/mplug_owl/__init__.py
Normal file
18
modelscope/models/multi_modal/mplug_owl/__init__.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# Copyright 2021-2023 The Alibaba DAMO mPLUG Authors.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .configuration_mplug_owl import (MplugOwlConfig, MplugOwlVisionConfig,
|
||||
MplugOwlVisualAbstractorConfig)
|
||||
from .modeling_mplug_owl import MplugOwlForConditionalGeneration
|
||||
@@ -0,0 +1,257 @@
|
||||
# Copyright 2021-2023 The Alibaba DAMO mPLUG Team Authors.
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" MPLUG OWL model configuration """
|
||||
import copy
|
||||
import os
|
||||
from typing import Union
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
from transformers.models.auto import CONFIG_MAPPING
|
||||
from transformers.utils import logging
|
||||
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
logger = logging.get_logger()
|
||||
|
||||
|
||||
class MplugOwlVisionConfig(PretrainedConfig):
    r"""Configuration for the mPLUG-Owl vision encoder (a CLIP-style ViT).

    Args:
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer
            in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 768):
            Dimensionality of the projection head.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the
            Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of input image channels.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the
            encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"`,
            `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to
            1, used internally for initialization testing).
        use_flash_attn (`bool`, *optional*, defaults to `False`):
            Whether to use flash-attention kernels in the encoder.
    """

    model_type = 'mplug_owl_vision_model'

    def __init__(
        self,
        hidden_size=1024,
        intermediate_size=4096,
        projection_dim=768,
        num_hidden_layers=24,
        num_attention_heads=16,
        num_channels=3,
        image_size=224,
        patch_size=14,
        hidden_act='quick_gelu',
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        use_flash_attn=False,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.use_flash_attn = use_flash_attn

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str,
                                                                  os.PathLike],
                        **kwargs) -> 'PretrainedConfig':
        """Load the vision sub-config, unwrapping a composite MplugOwlConfig
        dict when necessary."""
        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)

        # get the vision config dict if we are loading from MplugOwlConfig
        if config_dict.get('model_type') == 'mplug_owl':
            config_dict = config_dict['vision_config']

        if 'model_type' in config_dict and hasattr(
                cls,
                'model_type') and config_dict['model_type'] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
            )

        return cls.from_dict(config_dict, **kwargs)
||||
|
||||
|
||||
class MplugOwlVisualAbstractorConfig(PretrainedConfig):
    r"""Configuration for the mPLUG-Owl visual abstractor (the Q-Former-like
    module that compresses vision features before the language model).

    Args:
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the abstractor layers.
        num_hidden_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the abstractor.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer.
        intermediate_size (`int`, *optional*, defaults to 4096):
            Dimensionality of the feed-forward layer.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        encoder_hidden_size (`int`, *optional*, defaults to 1024):
            Hidden size of the vision encoder whose outputs are attended to.
    """

    model_type = 'MPlugOwlVisualAbstractor'

    def __init__(
        self,
        hidden_size=1024,
        num_hidden_layers=6,
        num_attention_heads=16,
        intermediate_size=4096,
        attention_probs_dropout_prob=0.1,
        initializer_range=0.02,
        layer_norm_eps=1e-6,
        encoder_hidden_size=1024,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.encoder_hidden_size = encoder_hidden_size

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str,
                                                                  os.PathLike],
                        **kwargs) -> 'PretrainedConfig':
        """Load the abstractor sub-config, unwrapping a composite
        MplugOwlConfig dict when necessary."""
        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)

        # get the visual abstractor config dict if we are loading from
        # MplugOwlConfig (stored under the 'abstractor_config' key)
        if config_dict.get('model_type') == 'mplug_owl':
            config_dict = config_dict['abstractor_config']

        if 'model_type' in config_dict and hasattr(
                cls,
                'model_type') and config_dict['model_type'] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
            )

        return cls.from_dict(config_dict, **kwargs)
||||
|
||||
|
||||
class MplugOwlConfig(PretrainedConfig):
    r"""Composite configuration holding the vision, visual-abstractor and
    text sub-configurations of the mPLUG-Owl model.

    Args:
        task (`str`, *optional*, defaults to `Tasks.multimodal_dialogue`):
            The ModelScope task this configuration serves.
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize
            [`MplugOwlVisionConfig`].
        visual_abstractor_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize
            [`MplugOwlVisualAbstractorConfig`].
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize any
            [`PretrainedConfig`]; LLaMA 7B is used when omitted.
        num_query_tokens (`int`, *optional*, defaults to 64):
            The number of query tokens passed through the Transformer.

        kwargs (*optional*):
            Dictionary of keyword arguments.
    """

    model_type = 'mplug_owl'
    is_composition = True

    def __init__(self,
                 task=Tasks.multimodal_dialogue,
                 vision_config=None,
                 visual_abstractor_config=None,
                 text_config=None,
                 num_query_tokens=64,
                 **kwargs):

        super().__init__(**kwargs)
        self.task = task
        if vision_config is None:
            vision_config = MplugOwlVisionConfig().to_dict()
            logger.info('vision_config is None.')

        if visual_abstractor_config is None:
            visual_abstractor_config = {}
            logger.info('abstractor_config is None. ')

        if text_config is None:
            # we use LLAMA 7b by default
            from transformers.models.llama.configuration_llama import \
                LlamaConfig
            text_config = LlamaConfig(pad_token_id=2).to_dict()
            logger.info('text_config is None.')

        self.vision_config = MplugOwlVisionConfig(**vision_config)
        self.visual_abstractor_config = MplugOwlVisualAbstractorConfig(
            **visual_abstractor_config)
        # Fall back to LLaMA when the text config does not name a model type.
        text_model_type = text_config[
            'model_type'] if 'model_type' in text_config else 'llama'
        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)

        # Mirror the language model's embedding-tying choice at the top level.
        self.tie_word_embeddings = self.text_config.tie_word_embeddings

        self.num_query_tokens = num_query_tokens
        self.initializer_factor = 1.0
        self.initializer_range = 0.02

    @classmethod
    def from_vision_abstractor_text_configs(
        cls,
        vision_config: MplugOwlVisionConfig,
        visual_abstractor_config: MplugOwlVisualAbstractorConfig,
        text_config: PretrainedConfig,
        **kwargs,
    ):
        r"""Build a composite config from already-constructed sub-configs.

        Returns:
            [`MplugOwlConfig`]: An instance of a configuration object
        """

        return cls(
            vision_config=vision_config.to_dict(),
            visual_abstractor_config=visual_abstractor_config.to_dict(),
            text_config=text_config.to_dict(),
            **kwargs,
        )

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default
        [`~PretrainedConfig.to_dict`] so nested sub-configs are serialized
        recursively.

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up
            this configuration instance,
        """
        output = copy.deepcopy(self.__dict__)
        output['vision_config'] = self.vision_config.to_dict()
        output['visual_abstractor_config'] = \
            self.visual_abstractor_config.to_dict()
        output['text_config'] = self.text_config.to_dict()
        output['model_type'] = self.__class__.model_type
        return output
||||
1551
modelscope/models/multi_modal/mplug_owl/modeling_mplug_owl.py
Normal file
1551
modelscope/models/multi_modal/mplug_owl/modeling_mplug_owl.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1369,6 +1369,10 @@ TASK_OUTPUTS = {
|
||||
# {"text": "this is a text answser. "}
|
||||
Tasks.video_question_answering: [OutputKeys.TEXT],
|
||||
|
||||
# Multimodal Dialogue result for a sample
|
||||
# {"text": "this is a text response. "}
|
||||
Tasks.multimodal_dialogue: [OutputKeys.TEXT],
|
||||
|
||||
# auto_speech_recognition result for a single sample
|
||||
# {
|
||||
# "text": "每天都要快乐喔"
|
||||
|
||||
@@ -337,6 +337,9 @@ TASK_INPUTS = {
|
||||
Tasks.video_captioning: [InputType.VIDEO, {
|
||||
'video': InputType.VIDEO,
|
||||
}],
|
||||
Tasks.multimodal_dialogue: {
|
||||
'messages': InputType.LIST,
|
||||
},
|
||||
Tasks.visual_grounding: {
|
||||
'image': InputType.IMAGE,
|
||||
'text': InputType.TEXT
|
||||
|
||||
@@ -21,6 +21,7 @@ if TYPE_CHECKING:
|
||||
from .diffusers_wrapped import StableDiffusionWrapperPipeline, ChineseStableDiffusionPipeline
|
||||
from .soonet_video_temporal_grounding_pipeline import SOONetVideoTemporalGroundingPipeline
|
||||
from .text_to_video_synthesis_pipeline import TextToVideoSynthesisPipeline
|
||||
from .multimodal_dialogue_pipeline import MultimodalDialoguePipeline
|
||||
else:
|
||||
_import_structure = {
|
||||
'image_captioning_pipeline': ['ImageCaptioningPipeline'],
|
||||
@@ -45,6 +46,7 @@ else:
|
||||
'soonet_video_temporal_grounding_pipeline':
|
||||
['SOONetVideoTemporalGroundingPipeline'],
|
||||
'text_to_video_synthesis_pipeline': ['TextToVideoSynthesisPipeline'],
|
||||
'multimodal_dialogue_pipeline': ['MultimodalDialoguePipeline']
|
||||
}
|
||||
|
||||
import sys
|
||||
|
||||
@@ -0,0 +1,90 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from typing import Any, Dict, Optional, Union
|
||||
|
||||
import torch
|
||||
|
||||
from modelscope.metainfo import Pipelines
|
||||
from modelscope.models.multi_modal import MplugOwlForConditionalGeneration
|
||||
from modelscope.outputs import OutputKeys, TokenGeneratorOutput
|
||||
from modelscope.pipelines.base import Model, Pipeline
|
||||
from modelscope.pipelines.builder import PIPELINES
|
||||
from modelscope.preprocessors import MplugOwlPreprocessor, Preprocessor
|
||||
from modelscope.utils.constant import Tasks
|
||||
from modelscope.utils.logger import get_logger
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
|
||||
@PIPELINES.register_module(
    Tasks.multimodal_dialogue, module_name=Pipelines.multimodal_dialogue)
class MultimodalDialoguePipeline(Pipeline):
    r""" Multimodal Dialogue Pipeline.

    Examples:
        >>> from modelscope.pipelines import pipeline
        >>> chatbot = pipeline('multimodal-dialogue', 'damo/multi-modal_mplug_owl_multimodal-dialogue_7b')
        >>> image = 'data/resource/portrait_input.png'
        >>> system_prompt_1 = 'The following is a conversation between a curious human and AI assistant.'
        >>> system_prompt_2 = "The assistant gives helpful, detailed, and polite answers to the user's questions."
        >>> messages = {
        >>>     'messages': [
        >>>         {
        >>>             'role': 'system',
        >>>             'content': system_prompt_1 + ' ' + system_prompt_2
        >>>         },
        >>>         {
        >>>             'role': 'user',
        >>>             'content': [{
        >>>                 'image': image
        >>>             }]
        >>>         },
        >>>         {
        >>>             'role': 'user',
        >>>             'content': 'Describe the facial expression of the man.'
        >>>         },
        >>>     ]
        >>> }
        >>> chatbot(messages)
        >>> {
        >>>     "text": he is angry.
        >>> }
        >>>
    """

    def __init__(self,
                 model: Union[Model, str],
                 preprocessor: Optional[Preprocessor] = None,
                 **kwargs):
        """
        use `model` and `preprocessor` to create a multimodal dialogue
        pipeline for prediction

        Args:
            model: model id on modelscope hub.
            preprocessor: an optional, already-built preprocessor; when None,
                a default one is constructed for known model types.
        """
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
        self.model.eval()
        if preprocessor is None:
            if isinstance(self.model, MplugOwlForConditionalGeneration):
                self.preprocessor = MplugOwlPreprocessor(self.model.model_dir)
            else:
                # Previously this fell through silently, leaving
                # self.preprocessor as None; warn so the eventual failure
                # downstream is attributable.
                logger.warning(
                    'No default preprocessor is available for model type %s; '
                    'self.preprocessor is None.',
                    type(self.model).__name__)

    def forward(self, inputs: Dict[str, Any],
                **forward_params) -> Dict[str, Any]:
        """
        the `forward_params` can be the generation configurations listed in
        transformers library.
        """
        # Inference only: disable autograd bookkeeping.
        with torch.no_grad():
            return super().forward(inputs, **forward_params)

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
        """process the prediction results

        Args:
            inputs (Dict[str, Any]): generated token ids from forward().

        Returns:
            Dict[str, str]: the prediction results
        """
        if isinstance(self.model, MplugOwlForConditionalGeneration):
            # Decode the first generated sequence back to text.
            output = self.preprocessor.tokenizer.decode(
                inputs[0], skip_special_tokens=True)
            inputs = {OutputKeys.TEXT: output}
        return inputs
||||
@@ -20,7 +20,7 @@ if TYPE_CHECKING:
|
||||
from .tts import KanttsDataPreprocessor
|
||||
from .multi_modal import (DiffusionImageGenerationPreprocessor,
|
||||
OfaPreprocessor, MPlugPreprocessor,
|
||||
HiTeAPreprocessor,
|
||||
HiTeAPreprocessor, MplugOwlPreprocessor,
|
||||
ImageCaptioningClipInterrogatorPreprocessor)
|
||||
from .nlp import (
|
||||
DocumentSegmentationTransformersPreprocessor,
|
||||
@@ -71,7 +71,7 @@ else:
|
||||
'tts': ['KanttsDataPreprocessor'],
|
||||
'multi_modal': [
|
||||
'DiffusionImageGenerationPreprocessor', 'OfaPreprocessor',
|
||||
'MPlugPreprocessor', 'HiTeAPreprocessor',
|
||||
'MPlugPreprocessor', 'HiTeAPreprocessor', 'MplugOwlPreprocessor',
|
||||
'ImageCaptioningClipInterrogatorPreprocessor'
|
||||
],
|
||||
'nlp': [
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
import os.path as osp
|
||||
import re
|
||||
from io import BytesIO
|
||||
from typing import Any, Dict, List, Tuple, Union
|
||||
|
||||
@@ -29,7 +30,7 @@ from .ofa.utils.constant import OFA_TASK_KEY_MAPPING
|
||||
|
||||
__all__ = [
|
||||
'DiffusionImageGenerationPreprocessor', 'OfaPreprocessor',
|
||||
'MPlugPreprocessor', 'HiTeAPreprocessor'
|
||||
'MPlugPreprocessor', 'HiTeAPreprocessor', 'MplugOwlPreprocessor'
|
||||
]
|
||||
|
||||
|
||||
@@ -644,6 +645,148 @@ class HiTeAPreprocessor(Preprocessor):
|
||||
return output
|
||||
|
||||
|
||||
@PREPROCESSORS.register_module(
    Fields.multi_modal, module_name=Preprocessors.mplug_owl_preprocessor)
class MplugOwlPreprocessor(Preprocessor):
    """Preprocessor for the mPLUG-Owl multimodal dialogue model.

    Converts a chat-style ``messages`` payload (see :meth:`__call__`) into
    the ``pixel_values`` and ``input_ids`` tensors consumed by
    ``MplugOwlForConditionalGeneration``.
    """

    def __init__(self,
                 model_dir: str,
                 mode: str = ModeKeys.INFERENCE,
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.model_dir = model_dir
        self.mode = mode

        # Tokenizer and image transform are built lazily on first access.
        self._tokenizer = None
        self._patch_resize_transform = None
        # Each media token expands to this many placeholder ids in input_ids.
        self.media_token = {'<image>': 65}
        # NOTE(review): unbounded cache of loaded images keyed by path; it
        # grows for the lifetime of the preprocessor — consider bounding it
        # if this runs in a long-lived service.
        self._image_map = {}

    @property
    def tokenizer(self):
        """Lazily load the LLaMA tokenizer from the model directory."""
        from modelscope.models.nlp.llama import LlamaTokenizer

        if self._tokenizer is None:
            self._tokenizer = LlamaTokenizer.from_pretrained(self.model_dir)
        return self._tokenizer

    @property
    def patch_resize_transform(self):
        """Lazily build the 224x224 resize + CLIP normalization transform."""
        if self._patch_resize_transform is None:
            from torchvision import transforms

            # CLIP image normalization statistics.
            mean = (0.48145466, 0.4578275, 0.40821073)
            std = (0.26862954, 0.26130258, 0.27577711)

            self._patch_resize_transform = transforms.Compose([
                # NOTE(review): Image.BICUBIC is deprecated in newer Pillow;
                # transforms.InterpolationMode.BICUBIC is the modern spelling.
                transforms.Resize((224, 224), interpolation=Image.BICUBIC),
                transforms.ToTensor(),
                transforms.Normalize(mean=mean, std=std),
            ])
        return self._patch_resize_transform

    def image_open(self, path: str) -> Tuple[Image.Image, int]:
        """Load an image, caching it together with its first-seen index."""
        if path not in self._image_map:
            index = len(self._image_map)
            self._image_map[path] = (load_image(path), index)
        return self._image_map[path]

    def tokenize_text(self, text: str) -> List[int]:
        """Tokenize text, expanding each media token into its run of
        negative placeholder ids (filled with image features downstream)."""
        # Map each media token to a distinct negative id: '<image>' -> -1, ...
        media_tokens = {
            k: -int(i + 1)
            for i, k in enumerate(self.media_token.keys())
        }
        media_lengths = self.media_token.copy()

        prompt_chunk = [self.tokenizer.bos_token_id]

        # Pure Text
        condition = [
            media_token not in text for media_token in media_tokens.keys()
        ]
        if all(condition):
            enc_chunk = prompt_chunk + \
                self.tokenizer(text, add_special_tokens=False)['input_ids']

        # Multi-Modal Text
        else:
            enc_chunk = prompt_chunk
            # Split on media tokens, keeping the tokens themselves.
            pattern = '|'.join(map(re.escape, list(media_tokens.keys())))
            chunk_strs = re.split(f'({pattern})', text)
            chunk_strs = [x for x in chunk_strs if len(x) > 0]
            for idx, chunk_str in enumerate(chunk_strs):
                if chunk_str in media_tokens:
                    enc_chunk += [media_tokens[chunk_str]] * \
                        media_lengths[chunk_str]
                else:
                    tmp_chunk = self.tokenizer(
                        chunk_str, add_special_tokens=False)['input_ids']
                    enc_chunk += tmp_chunk
        return enc_chunk

    def convert(self, messages: Dict[str, List[Dict]]) -> Tuple[List, str]:
        """Flatten the chat messages into a single prompt string plus the
        list of referenced image paths.

        Returns:
            Tuple of (image paths in order of appearance, prompt text ending
            with the 'AI: ' generation cue).
        """
        texts = []
        image = []
        messages = messages['messages']
        for turn in messages:
            if turn['role'] == 'system':
                role = ''
            elif turn['role'] == 'user':
                role = 'Human: '
            else:
                role = 'AI: '
            if isinstance(turn['content'], str):
                text = f"{role}{turn['content']}"
                texts.append(text)
            else:
                for t in turn['content']:
                    if isinstance(t, str):
                        text = f'{role}{t}'
                    else:
                        # Dict entries carry an image; the prompt gets the
                        # '<image>' placeholder in its place.
                        text = f'{role}<image>'
                        image.append(t['image'])
                    texts.append(text)
        texts = '\n'.join(texts)
        texts += '\nAI: '
        return image, texts

    def __call__(self, messages: Dict[str, Any]) -> Dict[str, Any]:
        """
        Args:
            messages: {[
                {'role': 'system', 'content': 'message1'},
                {'role': 'user', 'content': 'message2'},
                {'role': 'user', 'content': ['message2', {"image": 'image_path'}, 'message3', ...]},
            ]}
            The 'role' should be choose from ['system', 'user', 'assistant'].
            The 'content' can be either str or List[Union[str, Dict]]
        Return:
            output: Dict[str, Tensor]
        """
        output = {}
        images, text = self.convert(messages)

        if len(images) > 0:
            pixel_values = []
            for image in images:
                pixel_values.append(
                    self.patch_resize_transform(self.image_open(image)[0]))
            # Stack per-image tensors along a new batch dimension.
            pixel_values = torch.stack(pixel_values, dim=0)
        else:
            pixel_values = None

        input_ids = self.tokenize_text(text)
        input_ids = torch.LongTensor([input_ids])

        output = {
            'pixel_values': pixel_values,
            'input_ids': input_ids,
        }

        return output
||||
|
||||
|
||||
@PREPROCESSORS.register_module(
|
||||
Fields.multi_modal,
|
||||
module_name=Preprocessors.image_captioning_clip_interrogator_preprocessor)
|
||||
|
||||
@@ -247,6 +247,7 @@ class MultiModalTasks(object):
|
||||
video_temporal_grounding = 'video-temporal-grounding'
|
||||
text_to_video_synthesis = 'text-to-video-synthesis'
|
||||
efficient_diffusion_tuning = 'efficient-diffusion-tuning'
|
||||
multimodal_dialogue = 'multimodal-dialogue'
|
||||
|
||||
|
||||
class ScienceTasks(object):
|
||||
|
||||
@@ -1,17 +1,14 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
import unittest
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from modelscope.models import Model
|
||||
from modelscope.outputs import OutputKeys
|
||||
from modelscope.pipelines import pipeline
|
||||
from modelscope.utils.constant import Tasks
|
||||
from modelscope.utils.demo_utils import DemoCompatibilityCheck
|
||||
from modelscope.utils.test_utils import test_level
|
||||
|
||||
|
||||
class CLIPInterrogatorTest(unittest.TestCase, DemoCompatibilityCheck):
|
||||
class CLIPInterrogatorTest(unittest.TestCase):
|
||||
|
||||
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
|
||||
def test_run_with_image_captioning_with_model(self):
|
||||
@@ -32,10 +29,6 @@ class CLIPInterrogatorTest(unittest.TestCase, DemoCompatibilityCheck):
|
||||
result = pipeline_caption(image)
|
||||
print(result[OutputKeys.CAPTION])
|
||||
|
||||
@unittest.skip('demo compatibility test is only enabled on a needed-basis')
|
||||
def test_demo_compatibility(self):
|
||||
self.compatibility_check()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
||||
@@ -6,11 +6,10 @@ import numpy as np
|
||||
from modelscope.outputs import OutputKeys
|
||||
from modelscope.pipelines import pipeline
|
||||
from modelscope.utils.constant import Tasks
|
||||
from modelscope.utils.demo_utils import DemoCompatibilityCheck
|
||||
from modelscope.utils.test_utils import test_level
|
||||
|
||||
|
||||
class TransFaceRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
|
||||
class TransFaceRecognitionTest(unittest.TestCase):
|
||||
|
||||
def setUp(self) -> None:
|
||||
self.task = Tasks.face_recognition
|
||||
@@ -31,10 +30,6 @@ class TransFaceRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
|
||||
sim = np.dot(emb1[0], emb2[0])
|
||||
print(f'Cos similarity={sim:.3f}, img1:{img1} img2:{img2}')
|
||||
|
||||
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
|
||||
def test_demo_compatibility(self):
|
||||
self.compatibility_check()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
||||
@@ -5,11 +5,10 @@ from modelscope.models import Model
|
||||
from modelscope.outputs import OutputKeys
|
||||
from modelscope.pipelines import pipeline
|
||||
from modelscope.utils.constant import Tasks
|
||||
from modelscope.utils.demo_utils import DemoCompatibilityCheck
|
||||
from modelscope.utils.test_utils import test_level
|
||||
|
||||
|
||||
class FastInstanceSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
|
||||
class FastInstanceSegmentationTest(unittest.TestCase):
|
||||
|
||||
def setUp(self) -> None:
|
||||
self.task = Tasks.image_segmentation
|
||||
@@ -30,10 +29,6 @@ class FastInstanceSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
|
||||
task=Tasks.image_segmentation, model=model, preprocessor=None)
|
||||
print(pipeline_parsing(input=self.image)[OutputKeys.LABELS])
|
||||
|
||||
@unittest.skip('demo compatibility test is only enabled on a needed-basis')
|
||||
def test_demo_compatibility(self):
|
||||
self.compatibility_check()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
||||
100
tests/pipelines/test_mplug_owl_multimodal_dialogue.py
Normal file
100
tests/pipelines/test_mplug_owl_multimodal_dialogue.py
Normal file
@@ -0,0 +1,100 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
import unittest
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from modelscope.models import Model
|
||||
from modelscope.outputs import OutputKeys
|
||||
from modelscope.pipelines import pipeline
|
||||
from modelscope.utils.constant import Tasks
|
||||
from modelscope.utils.test_utils import test_level
|
||||
|
||||
|
||||
class MplugOwlMultimodalDialogueTest(unittest.TestCase):
    """Pipeline-level tests for the mPLUG-Owl multimodal dialogue model."""

    # Shared fixtures for all test cases.
    MODEL_ID = 'damo/multi-modal_mplug_owl_multimodal-dialogue_7b'
    IMAGE = 'data/resource/portrait_input.png'
    SYSTEM_PROMPT = (
        'The following is a conversation between a curious human and AI '
        'assistant. The assistant gives helpful, detailed, and polite '
        "answers to the user's questions.")

    def _build_messages(self, user_turns):
        """Prepend the shared system prompt to the given user turns and wrap
        them in the pipeline's expected payload shape."""
        return {
            'messages': [{
                'role': 'system',
                'content': self.SYSTEM_PROMPT
            }] + user_turns
        }

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_with_multimodal_dialogue_with_model(self):
        model = Model.from_pretrained(self.MODEL_ID)
        pipeline_multimodal_dialogue = pipeline(
            task=Tasks.multimodal_dialogue,
            model=model,
        )
        messages = self._build_messages([
            {
                'role': 'user',
                'content': [{
                    'image': self.IMAGE
                }]
            },
            {
                'role': 'user',
                'content': 'Describe the facial expression of the man.'
            },
        ])
        result = pipeline_multimodal_dialogue(messages)
        print(result[OutputKeys.TEXT])

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_multimodal_dialogue_with_name(self):
        pipeline_multimodal_dialogue = pipeline(
            Tasks.multimodal_dialogue, model=self.MODEL_ID)
        messages = self._build_messages([
            {
                'role': 'user',
                'content': [{
                    'image': self.IMAGE
                }]
            },
            {
                'role': 'user',
                'content': 'Describe the facial expression of the man.'
            },
        ])
        result = pipeline_multimodal_dialogue(messages)
        print(result[OutputKeys.TEXT])

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_multimodal_dialogue_with_text(self):
        pipeline_multimodal_dialogue = pipeline(
            Tasks.multimodal_dialogue, model=self.MODEL_ID)
        # Typo fixed: 'captial' -> 'capital'.
        messages = self._build_messages([
            {
                'role': 'user',
                'content': 'Where is the capital of China?'
            },
        ])
        result = pipeline_multimodal_dialogue(messages)
        print(result[OutputKeys.TEXT])


if __name__ == '__main__':
    unittest.main()
||||
@@ -1,6 +1,5 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
import os.path
|
||||
import unittest
|
||||
from typing import Any, Dict, List, Union
|
||||
|
||||
@@ -81,10 +80,6 @@ class SpeakerVerificationTest(unittest.TestCase):
|
||||
print(result)
|
||||
self.assertTrue(OutputKeys.TEXT in result)
|
||||
|
||||
@unittest.skip('demo compatibility test is only enabled on a needed-basis')
|
||||
def test_demo_compatibility(self):
|
||||
self.compatibility_check()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
||||
Reference in New Issue
Block a user