add open vocabulary detection

添加了开放词汇目标检测任务和模型

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/11548569
This commit is contained in:
qize.yqz
2023-02-10 06:37:51 +00:00
committed by wenmeng.zwm
parent ca1321f53f
commit 5e88cfe693
10 changed files with 590 additions and 2 deletions

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5b5861ca8955f8ff906abe78f2b32bc49deee2832f4518ffe4bb584653f3c9e9
size 187443

View File

@@ -91,6 +91,7 @@ class Models(object):
image_probing_model = 'image-probing-model'
defrcn = 'defrcn'
image_face_fusion = 'image-face-fusion'
open_vocabulary_detection_vild = 'open-vocabulary-detection-vild'
ecbsr = 'ecbsr'
msrresnet_lite = 'msrresnet-lite'
object_detection_3d = 'object_detection_3d'
@@ -346,6 +347,7 @@ class Pipelines(object):
image_structured_model_probing = 'image-structured-model-probing'
image_fewshot_detection = 'image-fewshot-detection'
image_face_fusion = 'image-face-fusion'
open_vocabulary_detection_vild = 'open-vocabulary-detection-vild'
ddpm_image_semantic_segmentation = 'ddpm-image-semantic-segmentation'
video_colorization = 'video-colorization'
motion_generattion = 'mdm-motion-generation'

View File

@@ -0,0 +1,22 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING:
from .vild import OpenVocabularyDetectionViLD
else:
_import_structure = {
'vild': ['OpenVocabularyDetectionViLD'],
}
import sys
sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

View File

@@ -0,0 +1,390 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import os
from typing import Any, Dict, Union
import clip
import numpy as np
import tensorflow.compat.v1 as tf
import torch.cuda
from scipy.special import softmax
from modelscope.metainfo import Models
from modelscope.models.base import Tensor
from modelscope.models.base.base_model import Model
from modelscope.models.builder import MODELS
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger
logger = get_logger()
# Registered under the open-vocabulary detection task so the builder can
# construct this model from its module name.
@MODELS.register_module(
Tasks.open_vocabulary_detection,
module_name=Models.open_vocabulary_detection_vild)
class OpenVocabularyDetectionViLD(Model):
"""
Vild: Open-Vocabulary Detection via Vision and Language Knowledge Distillation
https://arxiv.org/abs/2104.13921
"""
def __init__(self, model_dir, *args, **kwargs):
# NOTE(review): Model.__init__ is never invoked here — confirm the base
# class does not rely on its own initialisation.
self.model_dir = model_dir
# Requested device name; defaults to 'gpu'.
device_name = kwargs.get('device', 'gpu')
self._device_name = device_name
# Frozen TF inference graph shipped with the model files.
model_path = os.path.join(model_dir, ModelFile.TF_GRAPH_FILE)
graph = tf.Graph()
with graph.as_default():
config = tf.ConfigProto()
# Cap TF's GPU memory usage so CLIP (PyTorch) can share the same card.
config.gpu_options.per_process_gpu_memory_fraction = 0.2
compute_graph = tf.Graph()
compute_graph.as_default()
sess = tf.Session(config=config)
# Load the serialized GraphDef and import it into the session's graph.
with tf.gfile.GFile(model_path, 'rb') as fid:
graph_def = tf.GraphDef()
graph_def.ParseFromString(fid.read())
tf.import_graph_def(graph_def, name='')
self.sess = sess
# CLIP text encoder used to embed the user-supplied category names.
# NOTE(review): device is hard-coded to 'cuda:0' and ignores the
# 'device' kwarg above — this fails on CPU-only hosts; confirm intended.
self.clip, self.clip_preprocess = clip.load(
'ViT-B/32', device='cuda:0')
self.prompt_engineering = True
# When True, prompts starting with an article get a 'This is ' prefix.
self.this_is = True
# Softmax temperature, applied only when use_softmax is True.
self.temperature = 100.0
self.use_softmax = False
# Output tensor names fetched from the frozen graph, in fixed order
# (consumed positionally by forward()).
self.out_name = [
'RoiBoxes:0', 'RoiScores:0', '2ndStageBoxes:0',
'2ndStageScoresUnused:0', 'BoxOutputs:0', 'MaskOutputs:0',
'VisualFeatOutputs:0', 'ImageInfo:0'
]
def __call__(self, *args, **kwargs) -> Dict[str, Any]:
    """Run the forward pass, then convert raw outputs to final results."""
    raw_outputs = self.forward(*args, **kwargs)
    return self.postprocess(raw_outputs)
def forward(self, img: np.array, category_names: str,
            **kwargs) -> Dict[str, Any]:
    """Run the frozen TF detection graph on one image.

    Args:
        img: decoded image array, fed to the graph's 'Placeholder:0' input.
        category_names: ';'-separated category string, passed through
            unchanged for later use in `postprocess`.

    Returns:
        Dict[str, Any]: the eight graph outputs plus the category string.
    """
    output_keys = [
        'roi_boxes', 'roi_scores', 'detection_boxes', 'scores_unused',
        'box_outputs', 'detection_masks', 'visual_features', 'image_info'
    ]
    # self.out_name lists the fetched tensors in the same fixed order as
    # output_keys above.
    fetched = self.sess.run(self.out_name, feed_dict={'Placeholder:0': img})
    return_dict = dict(zip(output_keys, fetched))
    return_dict['category_names'] = category_names
    return return_dict
def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
    """Convert raw detector outputs into final open-vocabulary results.

    Filters region proposals with NMS plus score/area thresholds, embeds
    the requested category names with CLIP, scores every surviving box
    against every category and drops boxes whose best match is the
    implicit background class.

    Args:
        inputs: dict produced by `forward` (graph outputs plus the raw
            'category_names' string).

    Return:
        Tuple (scores, categories, bboxes):
            scores: (num_det, num_categories) per-category scores.
            categories: per-detection copy of the category-name list.
            bboxes: (num_det, 4) boxes as [xmin, ymin, xmax, ymax] in
                original-image coordinates.
    """
    # Fixed postprocess hyper-parameters.
    max_boxes_to_return = 25
    nms_threshold = 0.6
    min_rpn_score_thresh = 0.9
    min_box_area = 220

    roi_boxes = inputs['roi_boxes']
    roi_scores = inputs['roi_scores']
    detection_boxes = inputs['detection_boxes']
    scores_unused = inputs['scores_unused']
    box_outputs = inputs['box_outputs']
    detection_masks = inputs['detection_masks']
    visual_features = inputs['visual_features']
    image_info = inputs['image_info']
    category_names = inputs['category_names']

    # Parse the ';'-separated category string; index 0 is always background.
    category_names = [x.strip() for x in category_names.split(';')]
    category_names = ['background'] + category_names
    categories = [{
        'name': item,
        'id': idx + 1,
    } for idx, item in enumerate(category_names)]

    # Drop the batch dimension from every graph output.
    roi_boxes = np.squeeze(roi_boxes, axis=0)
    roi_scores = np.squeeze(roi_scores, axis=0)
    detection_boxes = np.squeeze(detection_boxes, axis=(0, 2))
    scores_unused = np.squeeze(scores_unused, axis=0)
    box_outputs = np.squeeze(box_outputs, axis=0)
    detection_masks = np.squeeze(detection_masks, axis=0)
    visual_features = np.squeeze(visual_features, axis=0)

    # image_info row 2 holds the preprocessing scale factors; map boxes
    # back to the original image resolution.
    image_info = np.squeeze(image_info, axis=0)
    image_scale = np.tile(image_info[2:3, :], (1, 2))
    rescaled_detection_boxes = detection_boxes / image_scale

    # Apply non-maximum suppression to detected boxes.
    nmsed_indices = nms(detection_boxes, roi_scores, thresh=nms_threshold)

    # RPN box areas in original-image coordinates.
    box_sizes = (rescaled_detection_boxes[:, 2]
                 - rescaled_detection_boxes[:, 0]) * (
                     rescaled_detection_boxes[:, 3]
                     - rescaled_detection_boxes[:, 1])

    # Keep rois that survived NMS, are non-degenerate and pass the score
    # and minimum-area thresholds. Uses the builtin `int` dtype: the old
    # `np.int` alias was removed in NumPy 1.24 and raised AttributeError.
    valid_indices = np.where(
        np.logical_and(
            np.isin(np.arange(len(roi_scores), dtype=int), nmsed_indices),
            np.logical_and(
                np.logical_not(np.all(roi_boxes == 0., axis=-1)),
                np.logical_and(roi_scores >= min_rpn_score_thresh,
                               box_sizes > min_box_area))))[0]

    detection_boxes = detection_boxes[valid_indices][:max_boxes_to_return,
                                                     ...]
    detection_masks = detection_masks[valid_indices][:max_boxes_to_return,
                                                     ...]
    detection_visual_feat = visual_features[
        valid_indices][:max_boxes_to_return, ...]
    rescaled_detection_boxes = rescaled_detection_boxes[
        valid_indices][:max_boxes_to_return, ...]

    # Score each box's distilled visual feature against the CLIP text
    # embedding of every category.
    text_features = self._build_text_embedings(categories)
    raw_scores = detection_visual_feat.dot(text_features.T)
    if self.use_softmax:
        scores_all = softmax(self.temperature * raw_scores, axis=-1)
    else:
        scores_all = raw_scores

    # Rank detections by their best per-category score (descending).
    indices = np.argsort(-np.max(scores_all, axis=1))

    # Graph boxes are [ymin, xmin, ymax, xmax]; reorder to
    # [xmin, ymin, xmax, ymax] for the standard output format.
    ymin, xmin, ymax, xmax = np.split(rescaled_detection_boxes, 4, axis=-1)
    processed_boxes = np.concatenate([xmin, ymin, xmax, ymax], axis=-1)
    n_boxes = processed_boxes.shape[0]

    bboxes = []
    scores = []
    det_categories = []
    for anno_idx in indices[0:int(n_boxes)]:
        anno_scores = scores_all[anno_idx]
        # Skip detections whose best match is the background class (0).
        if np.argmax(anno_scores) == 0:
            continue
        bboxes.append(processed_boxes[anno_idx])
        # Background score (index 0) is stripped from the reported scores.
        scores.append(anno_scores[1:])
        det_categories.append(category_names[1:])
    # Guard the no-detection case: np.vstack([]) would raise.
    if not bboxes:
        return (np.zeros((0, len(category_names) - 1)), [],
                np.zeros((0, 4)))
    bboxes = np.vstack(bboxes)
    scores = np.vstack(scores)
    return scores, det_categories, bboxes
def _build_text_embedings(self, categories):
"""Build one ensembled CLIP text embedding per category.

Each category name is formatted into every template in
`multiple_templates`; per-template embeddings are L2-normalised,
averaged, and the mean is re-normalised (prompt ensembling).

Args:
    categories: list of dicts with at least a 'name' key.

Returns:
    np.ndarray of shape (num_categories, embed_dim).
"""
def processed_name(name, rm_dot=False):
# Normalise dataset-style names:
# _ for lvis
# / for obj365
res = name.replace('_', ' ').replace('/', ' or ').lower()
if rm_dot:
res = res.rstrip('.')
return res
def article(name):
# Crude 'a'/'an' selection from the first letter of the name.
return 'an' if name[0] in 'aeiou' else 'a'
templates = multiple_templates
run_on_gpu = torch.cuda.is_available()
with torch.no_grad():
all_text_embeddings = []
for category in categories:
# Fill the category name (and its article) into every template.
texts = [
template.format(
processed_name(category['name'], rm_dot=True),
article=article(category['name']))
for template in templates
]
if self.this_is:
# Optionally prefix prompts that start with an article.
texts = [
'This is ' + text if text.startswith('a')
or text.startswith('the') else text for text in texts
]
# tokenize
texts = clip.tokenize(texts)
if run_on_gpu:
texts = texts.cuda()
# embed with text encoder
text_embeddings = self.clip.encode_text(texts)
text_embeddings /= text_embeddings.norm(dim=-1, keepdim=True)
# Average over templates, then re-normalise the ensemble.
text_embedding = text_embeddings.mean(dim=0)
text_embedding /= text_embedding.norm()
all_text_embeddings.append(text_embedding)
all_text_embeddings = torch.stack(all_text_embeddings, dim=1)
if run_on_gpu:
all_text_embeddings = all_text_embeddings.cuda()
return all_text_embeddings.cpu().numpy().T
# Prompt templates for CLIP prompt ensembling: every category name is
# formatted into each template and the resulting text embeddings are
# averaged (see _build_text_embedings). '{article}' is filled with 'a'/'an'.
multiple_templates = [
'There is {article} {} in the scene.',
'There is the {} in the scene.',
'a photo of {article} {} in the scene.',
'a photo of the {} in the scene.',
'a photo of one {} in the scene.',
'itap of {article} {}.',
'itap of my {}.',  # itap: I took a picture of
'itap of the {}.',
'a photo of {article} {}.',
'a photo of my {}.',
'a photo of the {}.',
'a photo of one {}.',
'a photo of many {}.',
'a good photo of {article} {}.',
'a good photo of the {}.',
'a bad photo of {article} {}.',
'a bad photo of the {}.',
'a photo of a nice {}.',
'a photo of the nice {}.',
'a photo of a cool {}.',
'a photo of the cool {}.',
'a photo of a weird {}.',
'a photo of the weird {}.',
'a photo of a small {}.',
'a photo of the small {}.',
'a photo of a large {}.',
'a photo of the large {}.',
'a photo of a clean {}.',
'a photo of the clean {}.',
'a photo of a dirty {}.',
'a photo of the dirty {}.',
'a bright photo of {article} {}.',
'a bright photo of the {}.',
'a dark photo of {article} {}.',
'a dark photo of the {}.',
'a photo of a hard to see {}.',
'a photo of the hard to see {}.',
'a low resolution photo of {article} {}.',
'a low resolution photo of the {}.',
'a cropped photo of {article} {}.',
'a cropped photo of the {}.',
'a close-up photo of {article} {}.',
'a close-up photo of the {}.',
'a jpeg corrupted photo of {article} {}.',
'a jpeg corrupted photo of the {}.',
'a blurry photo of {article} {}.',
'a blurry photo of the {}.',
'a pixelated photo of {article} {}.',
'a pixelated photo of the {}.',
'a black and white photo of the {}.',
'a black and white photo of {article} {}.',
'a plastic {}.',
'the plastic {}.',
'a toy {}.',
'the toy {}.',
'a plushie {}.',
'the plushie {}.',
'a cartoon {}.',
'the cartoon {}.',
'an embroidered {}.',
'the embroidered {}.',
'a painting of the {}.',
'a painting of a {}.',
]
def nms(dets, scores, thresh, max_dets=1000):
    """Greedy non-maximum suppression.

    Args:
        dets: [N, 4] boxes as [ymin, xmin, ymax, xmax].
        scores: [N,] confidence per box.
        thresh: IoU threshold above which a box is suppressed. Float.
        max_dets: maximum number of boxes to keep. int.

    Returns:
        List of kept box indices, highest score first.
    """
    ys1, xs1 = dets[:, 0], dets[:, 1]
    ys2, xs2 = dets[:, 2], dets[:, 3]
    box_areas = (xs2 - xs1) * (ys2 - ys1)
    # Candidate indices, best score first.
    candidates = np.argsort(scores)[::-1]
    selected = []
    while candidates.size > 0 and len(selected) < max_dets:
        best = candidates[0]
        selected.append(best)
        rest = candidates[1:]
        # Intersection rectangle of the best box with every remaining box.
        top = np.maximum(ys1[best], ys1[rest])
        left = np.maximum(xs1[best], xs1[rest])
        bottom = np.minimum(ys2[best], ys2[rest])
        right = np.minimum(xs2[best], xs2[rest])
        inter = np.maximum(0.0, right - left) * np.maximum(0.0, bottom - top)
        # IoU with a tiny epsilon to avoid division by zero.
        iou = inter / (box_areas[best] + box_areas[rest] - inter + 1e-12)
        # Drop every remaining box that overlaps the winner too much.
        candidates = rest[iou <= thresh]
    return selected

View File

@@ -249,6 +249,8 @@ TASK_OUTPUTS = {
[OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES],
Tasks.domain_specific_object_detection:
[OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES],
Tasks.open_vocabulary_detection:
[OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES],
# video object detection result for single sample
# {
@@ -419,8 +421,9 @@ TASK_OUTPUTS = {
# "output_video": "path_to_rendered_video" , this is optional
# and is only avaialbe when the "render" option is enabled.
# }
Tasks.body_3d_keypoints:
[OutputKeys.KEYPOINTS, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO],
Tasks.body_3d_keypoints: [
OutputKeys.KEYPOINTS, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO
],
# 3D face reconstruction result for single sample
# {

View File

@@ -86,6 +86,10 @@ TASK_INPUTS = {
InputType.IMAGE,
Tasks.image_fewshot_detection:
InputType.IMAGE,
Tasks.open_vocabulary_detection: {
'img': InputType.IMAGE,
'category_names': InputType.TEXT
},
Tasks.image_driving_perception:
InputType.IMAGE,
Tasks.vision_efficient_tuning:

View File

@@ -97,6 +97,7 @@ if TYPE_CHECKING:
from .image_structured_model_probing_pipeline import ImageStructuredModelProbingPipeline
from .video_colorization_pipeline import VideoColorizationPipeline
from .image_defrcn_fewshot_pipeline import ImageDefrcnDetectionPipeline
from .image_open_vocabulary_detection_pipeline import ImageOpenVocabularyDetectionPipeline
from .object_detection_3d_pipeline import ObjectDetection3DPipeline
from .ddpm_semantic_segmentation_pipeline import DDPMImageSemanticSegmentationPipeline
from .image_inpainting_sdv2_pipeline import ImageInpaintingSDV2Pipeline
@@ -244,6 +245,9 @@ else:
],
'video_colorization_pipeline': ['VideoColorizationPipeline'],
'image_defrcn_fewshot_pipeline': ['ImageDefrcnDetectionPipeline'],
'image_open_vocabulary_detection_pipeline': [
'ImageOpenVocabularyDetectionPipeline'
],
'object_detection_3d_pipeline': ['ObjectDetection3DPipeline'],
'image_inpainting_sdv2_pipeline': ['ImageInpaintingSDV2Pipeline'],
'image_quality_assessment_mos_pipeline': [

View File

@@ -0,0 +1,76 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Any, Dict, Union
import cv2
import numpy as np
import PIL
import torch
from PIL import Image
from modelscope.metainfo import Pipelines
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Model, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
logger = get_logger()
@PIPELINES.register_module(
Tasks.open_vocabulary_detection,
module_name=Pipelines.open_vocabulary_detection_vild)
class ImageOpenVocabularyDetectionPipeline(Pipeline):
"""Pipeline wrapping the ViLD open-vocabulary detection model."""
def __init__(self, model: str, **kwargs):
"""
use `model` to create a image open vocabulary detection pipeline for prediction
Args:
model: model id on modelscope hub.
Example:
>>> from modelscope.pipelines import pipeline
>>> vild_pipeline = pipeline(Tasks.open_vocabulary_detection,
model='damo/cv_resnet152_open-vocabulary-detection_vild')
>>> image_path = 'test.jpg'
>>> category_names = ';'.join([
'flipflop', 'street sign', 'bracelet', 'necklace', 'shorts',
'floral camisole', 'orange shirt', 'purple dress', 'yellow tee',
'green umbrella', 'pink striped umbrella', 'transparent umbrella',
'plain pink umbrella', 'blue patterned umbrella', 'koala',
'electric box', 'car', 'pole'
])
>>> input_dict = {'img':image_path, 'category_names':category_names}
>>> result = vild_pipeline(input_dict)
>>> print(result[OutputKeys.BOXES])
"""
super().__init__(model=model, **kwargs)
logger.info('open vocabulary detection model, pipeline init')
def preprocess(self, input: Dict[str, Any]) -> Dict[str, Any]:
# Load the image as RGB; pass the category string through unchanged.
img = LoadImage(mode='rgb')(input['img'])['img']
data = {'img': img, 'category_names': input['category_names']}
return data
def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
# Delegate to the model's forward pass (frozen TF graph inference).
results = self.model.forward(**input)
return results
def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
# The model's postprocess returns a (scores, labels, boxes) tuple; map
# it onto the standard task output keys.
scores, labels, bboxes = self.model.postprocess(inputs)
outputs = {
OutputKeys.SCORES: scores,
OutputKeys.LABELS: labels,
OutputKeys.BOXES: bboxes
}
return outputs

View File

@@ -47,6 +47,7 @@ class CVTasks(object):
image_object_detection = 'image-object-detection'
video_object_detection = 'video-object-detection'
image_fewshot_detection = 'image-fewshot-detection'
open_vocabulary_detection = 'open-vocabulary-detection'
object_detection_3d = 'object-detection-3d'
image_segmentation = 'image-segmentation'

View File

@@ -0,0 +1,83 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import unittest
import cv2
from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models import Model
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.cv.image_utils import draw_box
from modelscope.utils.demo_utils import DemoCompatibilityCheck
from modelscope.utils.logger import get_logger
from modelscope.utils.test_utils import test_level
logger = get_logger()
class ImageOpenVocabularyDetectionTest(unittest.TestCase,
DemoCompatibilityCheck):
"""Integration tests for the ViLD open-vocabulary detection pipeline."""
def setUp(self) -> None:
# NOTE(review): setUp runs before *every* test method, so this pip
# install (and the matching downgrade in tearDown) executes once per
# test — slow and environment-mutating. Presumably the CI image ships
# TF1 while the frozen graph needs TF2; confirm and consider
# setUpClass/tearDownClass instead.
os.system(
'pip install tensorflow==2.9.2 -i https://pypi.tuna.tsinghua.edu.cn/simple'
)
logger.info('upgrade tensorflow finished')
self.task = Tasks.open_vocabulary_detection
self.model_id = 'damo/cv_resnet152_open-vocabulary-detection_vild'
self.image = 'data/test/images/image_open_vocabulary_detection.jpg'
# ';'-separated open-vocabulary category names fed to the pipeline.
self.category_names = ';'.join([
'flipflop', 'street sign', 'bracelet', 'necklace', 'shorts',
'floral camisole', 'orange shirt', 'purple dress', 'yellow tee',
'green umbrella', 'pink striped umbrella', 'transparent umbrella',
'plain pink umbrella', 'blue patterned umbrella', 'koala',
'electric box', 'car', 'pole'
])
self.input = {'img': self.image, 'category_names': self.category_names}
def tearDown(self) -> None:
# Restore the environment's original TF install after each test.
os.system(
'pip install tensorflow-gpu==1.15 -i https://pypi.tuna.tsinghua.edu.cn/simple'
)
logger.info('degrade tensorflow finished')
return super().tearDown()
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_with_model_from_modelhub(self):
# Build the pipeline from an already-instantiated Model object.
model = Model.from_pretrained(self.model_id)
vild_pipeline = pipeline(task=self.task, model=model)
result = vild_pipeline(input=self.input)
image = cv2.imread(self.image)
draw_box(image, result[OutputKeys.BOXES][0, :])
cv2.imwrite('result_modelhub.jpg', image)
print('Test run with model from modelhub ok.')
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_with_model_name(self):
# Build the pipeline directly from the model id string.
vild_pipeline = pipeline(task=self.task, model=self.model_id)
result = vild_pipeline(self.input)
image = cv2.imread(self.image)
draw_box(image, result[OutputKeys.BOXES][0, :])
cv2.imwrite('result_modelname.jpg', image)
print('Test run with model name ok.')
@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_by_direct_model_download(self):
# Build the pipeline from a locally downloaded model snapshot path.
cache_path = snapshot_download(self.model_id)
vild_pipeline = pipeline(self.task, model=cache_path)
result = vild_pipeline(input=self.input)
image = cv2.imread(self.image)
draw_box(image, result[OutputKeys.BOXES][0, :])
cv2.imwrite('result_snapshot.jpg', image)
print('Test run with snapshot ok.')
@unittest.skip('demo compatibility test is only enabled on a needed-basis')
def test_demo_compatibility(self):
self.compatibility_check()
if __name__ == '__main__':
unittest.main()