mirror of https://github.com/modelscope/modelscope.git
add open vocabulary detection
Added the open vocabulary object detection task and model. Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/11548569
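In short, the new task can be driven end to end like this (a usage sketch assembled from the pipeline docstring and the test added below; the model id, input keys, and output keys come straight from this diff):

    from modelscope.outputs import OutputKeys
    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    vild_pipeline = pipeline(
        Tasks.open_vocabulary_detection,
        model='damo/cv_resnet152_open-vocabulary-detection_vild')
    # categories are free-form names, joined into one ';'-separated string
    result = vild_pipeline({
        'img': 'data/test/images/image_open_vocabulary_detection.jpg',
        'category_names': 'koala;green umbrella;car;pole'
    })
    print(result[OutputKeys.SCORES], result[OutputKeys.LABELS],
          result[OutputKeys.BOXES])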
data/test/images/image_open_vocabulary_detection.jpg (new file, 3 lines)
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5b5861ca8955f8ff906abe78f2b32bc49deee2832f4518ffe4bb584653f3c9e9
size 187443
@@ -91,6 +91,7 @@ class Models(object):
     image_probing_model = 'image-probing-model'
     defrcn = 'defrcn'
     image_face_fusion = 'image-face-fusion'
+    open_vocabulary_detection_vild = 'open-vocabulary-detection-vild'
     ecbsr = 'ecbsr'
     msrresnet_lite = 'msrresnet-lite'
     object_detection_3d = 'object_detection_3d'
@@ -346,6 +347,7 @@ class Pipelines(object):
     image_structured_model_probing = 'image-structured-model-probing'
     image_fewshot_detection = 'image-fewshot-detection'
     image_face_fusion = 'image-face-fusion'
+    open_vocabulary_detection_vild = 'open-vocabulary-detection-vild'
     ddpm_image_semantic_segmentation = 'ddpm-image-semantic-segmentation'
     video_colorization = 'video-colorization'
     motion_generattion = 'mdm-motion-generation'
modelscope/models/cv/open_vocabulary_detection_vild/__init__.py (new file, 22 lines)
@@ -0,0 +1,22 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .vild import OpenVocabularyDetectionViLD

else:
    _import_structure = {
        'vild': ['OpenVocabularyDetectionViLD'],
    }

    import sys

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )
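With this lazy registration, importing the package stays cheap; the heavy `vild` module (TensorFlow, CLIP) only loads when the symbol is first touched, e.g. (illustrative usage, not part of the commit):

    # First attribute access triggers the deferred import of .vild.
    from modelscope.models.cv.open_vocabulary_detection_vild import (
        OpenVocabularyDetectionViLD)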
modelscope/models/cv/open_vocabulary_detection_vild/vild.py (new file, 390 lines)
@@ -0,0 +1,390 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import os
from typing import Any, Dict, Union

import clip
import numpy as np
import tensorflow.compat.v1 as tf
import torch.cuda
from scipy.special import softmax

from modelscope.metainfo import Models
from modelscope.models.base import Tensor
from modelscope.models.base.base_model import Model
from modelscope.models.builder import MODELS
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()


@MODELS.register_module(
    Tasks.open_vocabulary_detection,
    module_name=Models.open_vocabulary_detection_vild)
class OpenVocabularyDetectionViLD(Model):
    """
    ViLD: Open-vocabulary Object Detection via Vision and Language
    Knowledge Distillation. https://arxiv.org/abs/2104.13921
    """

    def __init__(self, model_dir, *args, **kwargs):
        self.model_dir = model_dir
        device_name = kwargs.get('device', 'gpu')
        self._device_name = device_name

        # Load the frozen TF1 detection graph.
        model_path = os.path.join(model_dir, ModelFile.TF_GRAPH_FILE)
        graph = tf.Graph()
        with graph.as_default():
            config = tf.ConfigProto()
            config.gpu_options.per_process_gpu_memory_fraction = 0.2
            compute_graph = tf.Graph()
            compute_graph.as_default()
            sess = tf.Session(config=config)

            with tf.gfile.GFile(model_path, 'rb') as fid:
                graph_def = tf.GraphDef()
                graph_def.ParseFromString(fid.read())
                tf.import_graph_def(graph_def, name='')
            self.sess = sess

        # CLIP text encoder used to embed the user-supplied category names.
        self.clip, self.clip_preprocess = clip.load(
            'ViT-B/32', device='cuda:0')

        self.prompt_engineering = True
        self.this_is = True
        self.temperature = 100.0
        self.use_softmax = False
        self.out_name = [
            'RoiBoxes:0', 'RoiScores:0', '2ndStageBoxes:0',
            '2ndStageScoresUnused:0', 'BoxOutputs:0', 'MaskOutputs:0',
            'VisualFeatOutputs:0', 'ImageInfo:0'
        ]

    def __call__(self, *args, **kwargs) -> Dict[str, Any]:
        return self.postprocess(self.forward(*args, **kwargs))

    def forward(self, img: np.ndarray, category_names: str,
                **kwargs) -> Dict[str, Any]:
        """
        Run the forward pass for a model.

        Returns:
            Dict[str, Any]: output from the model forward pass
        """
        (roi_boxes, roi_scores, detection_boxes, scores_unused, box_outputs,
         detection_masks, visual_features, image_info) = self.sess.run(
             self.out_name, feed_dict={'Placeholder:0': img})
        return_dict = {
            'roi_boxes': roi_boxes,
            'roi_scores': roi_scores,
            'detection_boxes': detection_boxes,
            'scores_unused': scores_unused,
            'box_outputs': box_outputs,
            'detection_masks': detection_masks,
            'visual_features': visual_features,
            'image_info': image_info,
            'category_names': category_names
        }
        return return_dict

    def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
        """Model specific postprocess that converts raw graph outputs to
        standard model outputs.

        Args:
            inputs: dict produced by `forward`.

        Return:
            tuple of (scores, category name lists, bounding boxes), one
            entry per kept detection.
        """
        max_boxes_to_return = 25
        nms_threshold = 0.6
        min_rpn_score_thresh = 0.9
        min_box_area = 220

        roi_boxes = inputs['roi_boxes']
        roi_scores = inputs['roi_scores']
        detection_boxes = inputs['detection_boxes']
        scores_unused = inputs['scores_unused']
        box_outputs = inputs['box_outputs']
        detection_masks = inputs['detection_masks']
        visual_features = inputs['visual_features']
        image_info = inputs['image_info']
        category_names = inputs['category_names']

        #################################################################
        # Preprocess categories and get params
        category_names = [x.strip() for x in category_names.split(';')]
        category_names = ['background'] + category_names
        categories = [{
            'name': item,
            'id': idx + 1,
        } for idx, item in enumerate(category_names)]

        #################################################################
        # Obtain results and read image

        roi_boxes = np.squeeze(roi_boxes, axis=0)
        # no need to clip the boxes, already done
        roi_scores = np.squeeze(roi_scores, axis=0)

        detection_boxes = np.squeeze(detection_boxes, axis=(0, 2))
        scores_unused = np.squeeze(scores_unused, axis=0)
        box_outputs = np.squeeze(box_outputs, axis=0)
        detection_masks = np.squeeze(detection_masks, axis=0)
        visual_features = np.squeeze(visual_features, axis=0)

        # obtain image info
        image_info = np.squeeze(image_info, axis=0)
        image_scale = np.tile(image_info[2:3, :], (1, 2))

        # rescale
        rescaled_detection_boxes = detection_boxes / image_scale

        #################################################################
        # Filter boxes

        # Apply non-maximum suppression to detected boxes with nms threshold.
        nmsed_indices = nms(detection_boxes, roi_scores, thresh=nms_threshold)

        # Compute RPN box size.
        box_sizes = (rescaled_detection_boxes[:, 2]
                     - rescaled_detection_boxes[:, 0]) * (
                         rescaled_detection_boxes[:, 3]
                         - rescaled_detection_boxes[:, 1])

        # Filter out invalid rois (nmsed rois)
        valid_indices = np.where(
            np.logical_and(
                np.isin(np.arange(len(roi_scores), dtype=int), nmsed_indices),
                np.logical_and(
                    np.logical_not(np.all(roi_boxes == 0., axis=-1)),
                    np.logical_and(roi_scores >= min_rpn_score_thresh,
                                   box_sizes > min_box_area))))[0]

        detection_boxes = detection_boxes[valid_indices][:max_boxes_to_return,
                                                         ...]
        detection_masks = detection_masks[valid_indices][:max_boxes_to_return,
                                                         ...]
        detection_visual_feat = visual_features[
            valid_indices][:max_boxes_to_return, ...]
        rescaled_detection_boxes = rescaled_detection_boxes[
            valid_indices][:max_boxes_to_return, ...]

        #################################################################
        # Compute text embeddings and detection scores, and rank results
        text_features = self._build_text_embeddings(categories)

        raw_scores = detection_visual_feat.dot(text_features.T)
        if self.use_softmax:
            scores_all = softmax(self.temperature * raw_scores, axis=-1)
        else:
            scores_all = raw_scores

        # Results are ranked by scores
        indices = np.argsort(-np.max(scores_all, axis=1))

        #################################################################
        # Convert boxes from [ymin, xmin, ymax, xmax] to
        # [xmin, ymin, xmax, ymax].
        ymin, xmin, ymax, xmax = np.split(rescaled_detection_boxes, 4, axis=-1)
        processed_boxes = np.concatenate([xmin, ymin, xmax, ymax], axis=-1)

        n_boxes = processed_boxes.shape[0]

        categories = []
        bboxes = []
        scores = []
        labels = []

        for anno_idx in indices[0:int(n_boxes)]:
            anno_bbox = processed_boxes[anno_idx]
            anno_scores = scores_all[anno_idx]

            # Skip detections whose best match is the 'background' class.
            if np.argmax(anno_scores) == 0:
                continue
            bboxes.append(anno_bbox)
            scores.append(anno_scores[1:])
            categories.append(category_names[1:])
            labels.append(np.argmax(anno_scores) - 1)
        bboxes = np.vstack(bboxes)
        scores = np.vstack(scores)

        return scores, categories, bboxes

    def _build_text_embeddings(self, categories):

        def processed_name(name, rm_dot=False):
            # _ for lvis
            # / for obj365
            res = name.replace('_', ' ').replace('/', ' or ').lower()
            if rm_dot:
                res = res.rstrip('.')
            return res

        def article(name):
            return 'an' if name[0] in 'aeiou' else 'a'

        templates = multiple_templates

        run_on_gpu = torch.cuda.is_available()

        with torch.no_grad():
            all_text_embeddings = []
            for category in categories:
                texts = [
                    template.format(
                        processed_name(category['name'], rm_dot=True),
                        article=article(category['name']))
                    for template in templates
                ]
                if self.this_is:
                    texts = [
                        'This is ' + text if text.startswith('a')
                        or text.startswith('the') else text for text in texts
                    ]
                # tokenize
                texts = clip.tokenize(texts)
                if run_on_gpu:
                    texts = texts.cuda()
                # embed with text encoder
                text_embeddings = self.clip.encode_text(texts)
                text_embeddings /= text_embeddings.norm(dim=-1, keepdim=True)
                text_embedding = text_embeddings.mean(dim=0)
                text_embedding /= text_embedding.norm()
                all_text_embeddings.append(text_embedding)
            all_text_embeddings = torch.stack(all_text_embeddings, dim=1)
            if run_on_gpu:
                all_text_embeddings = all_text_embeddings.cuda()
        return all_text_embeddings.cpu().numpy().T
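The per-category loop above is the standard CLIP prompt-ensembling recipe: embed every template, L2-normalize each embedding, average them, then renormalize the mean back onto the unit sphere. A minimal numpy sketch of just that step (random vectors stand in for CLIP text features; purely illustrative, not part of the commit):

    import numpy as np

    template_feats = np.random.randn(63, 512)  # one row per prompt template
    template_feats /= np.linalg.norm(template_feats, axis=-1, keepdims=True)
    category_embedding = template_feats.mean(axis=0)  # average over templates
    category_embedding /= np.linalg.norm(category_embedding)  # renormalize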

multiple_templates = [
    'There is {article} {} in the scene.',
    'There is the {} in the scene.',
    'a photo of {article} {} in the scene.',
    'a photo of the {} in the scene.',
    'a photo of one {} in the scene.',
    'itap of {article} {}.',
    'itap of my {}.',  # itap: I took a picture of
    'itap of the {}.',
    'a photo of {article} {}.',
    'a photo of my {}.',
    'a photo of the {}.',
    'a photo of one {}.',
    'a photo of many {}.',
    'a good photo of {article} {}.',
    'a good photo of the {}.',
    'a bad photo of {article} {}.',
    'a bad photo of the {}.',
    'a photo of a nice {}.',
    'a photo of the nice {}.',
    'a photo of a cool {}.',
    'a photo of the cool {}.',
    'a photo of a weird {}.',
    'a photo of the weird {}.',
    'a photo of a small {}.',
    'a photo of the small {}.',
    'a photo of a large {}.',
    'a photo of the large {}.',
    'a photo of a clean {}.',
    'a photo of the clean {}.',
    'a photo of a dirty {}.',
    'a photo of the dirty {}.',
    'a bright photo of {article} {}.',
    'a bright photo of the {}.',
    'a dark photo of {article} {}.',
    'a dark photo of the {}.',
    'a photo of a hard to see {}.',
    'a photo of the hard to see {}.',
    'a low resolution photo of {article} {}.',
    'a low resolution photo of the {}.',
    'a cropped photo of {article} {}.',
    'a cropped photo of the {}.',
    'a close-up photo of {article} {}.',
    'a close-up photo of the {}.',
    'a jpeg corrupted photo of {article} {}.',
    'a jpeg corrupted photo of the {}.',
    'a blurry photo of {article} {}.',
    'a blurry photo of the {}.',
    'a pixelated photo of {article} {}.',
    'a pixelated photo of the {}.',
    'a black and white photo of the {}.',
    'a black and white photo of {article} {}.',
    'a plastic {}.',
    'the plastic {}.',
    'a toy {}.',
    'the toy {}.',
    'a plushie {}.',
    'the plushie {}.',
    'a cartoon {}.',
    'the cartoon {}.',
    'an embroidered {}.',
    'the embroidered {}.',
    'a painting of the {}.',
    'a painting of a {}.',
]
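Each template mixes a positional slot for the category name with a keyword slot for the article, so a single `str.format` call fills both; a quick illustration (the category name is made up):

    >>> 'There is {article} {} in the scene.'.format('koala', article='a')
    'There is a koala in the scene.'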

def nms(dets, scores, thresh, max_dets=1000):
    """Non-maximum suppression.

    Args:
        dets: [N, 4] boxes as [y1, x1, y2, x2].
        scores: [N,]
        thresh: IoU threshold, float.
        max_dets: maximum number of boxes to keep, int.
    """
    y1 = dets[:, 0]
    x1 = dets[:, 1]
    y2 = dets[:, 2]
    x2 = dets[:, 3]

    areas = (x2 - x1) * (y2 - y1)
    order = scores.argsort()[::-1]

    keep = []
    while order.size > 0 and len(keep) < max_dets:
        i = order[0]
        keep.append(i)

        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])

        w = np.maximum(0.0, xx2 - xx1)
        h = np.maximum(0.0, yy2 - yy1)
        intersection = w * h
        overlap = intersection / (
            areas[i] + areas[order[1:]] - intersection + 1e-12)

        inds = np.where(overlap <= thresh)[0]
        order = order[inds + 1]
    return keep
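As a sanity check on the helper above, two heavily overlapping boxes should collapse to the higher-scoring one while a disjoint box survives (boxes follow the [y1, x1, y2, x2] layout the function reads; the numbers are made up for illustration):

    import numpy as np

    dets = np.array([[0., 0., 10., 10.],     # box A
                     [1., 1., 10., 10.],     # box B, mostly inside A
                     [20., 20., 30., 30.]])  # box C, disjoint from both
    scores = np.array([0.9, 0.8, 0.7])
    print(nms(dets, scores, thresh=0.6))     # -> [0, 2]; B is suppressed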
@@ -249,6 +249,8 @@ TASK_OUTPUTS = {
         [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES],
     Tasks.domain_specific_object_detection:
         [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES],
+    Tasks.open_vocabulary_detection:
+        [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES],

     # video object detection result for single sample
     # {
@@ -419,8 +421,9 @@ TASK_OUTPUTS = {
     # "output_video": "path_to_rendered_video", this is optional
     # and is only available when the "render" option is enabled.
     # }
-    Tasks.body_3d_keypoints:
-        [OutputKeys.KEYPOINTS, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO],
+    Tasks.body_3d_keypoints: [
+        OutputKeys.KEYPOINTS, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO
+    ],

     # 3D face reconstruction result for single sample
     # {
@@ -86,6 +86,10 @@ TASK_INPUTS = {
         InputType.IMAGE,
     Tasks.image_fewshot_detection:
         InputType.IMAGE,
+    Tasks.open_vocabulary_detection: {
+        'img': InputType.IMAGE,
+        'category_names': InputType.TEXT
+    },
     Tasks.image_driving_perception:
         InputType.IMAGE,
     Tasks.vision_efficient_tuning:
@@ -97,6 +97,7 @@ if TYPE_CHECKING:
     from .image_structured_model_probing_pipeline import ImageStructuredModelProbingPipeline
     from .video_colorization_pipeline import VideoColorizationPipeline
     from .image_defrcn_fewshot_pipeline import ImageDefrcnDetectionPipeline
+    from .image_open_vocabulary_detection_pipeline import ImageOpenVocabularyDetectionPipeline
     from .object_detection_3d_pipeline import ObjectDetection3DPipeline
     from .ddpm_semantic_segmentation_pipeline import DDPMImageSemanticSegmentationPipeline
     from .image_inpainting_sdv2_pipeline import ImageInpaintingSDV2Pipeline
@@ -244,6 +245,9 @@ else:
         ],
         'video_colorization_pipeline': ['VideoColorizationPipeline'],
         'image_defrcn_fewshot_pipeline': ['ImageDefrcnDetectionPipeline'],
+        'image_open_vocabulary_detection_pipeline': [
+            'ImageOpenVocabularyDetectionPipeline'
+        ],
         'object_detection_3d_pipeline': ['ObjectDetection3DPipeline'],
         'image_inpainting_sdv2_pipeline': ['ImageInpaintingSDV2Pipeline'],
         'image_quality_assessment_mos_pipeline': [
modelscope/pipelines/cv/image_open_vocabulary_detection_pipeline.py (new file, 76 lines)
@@ -0,0 +1,76 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Any, Dict, Union

import cv2
import numpy as np
import PIL
import torch
from PIL import Image

from modelscope.metainfo import Pipelines
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Model, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()


@PIPELINES.register_module(
    Tasks.open_vocabulary_detection,
    module_name=Pipelines.open_vocabulary_detection_vild)
class ImageOpenVocabularyDetectionPipeline(Pipeline):

    def __init__(self, model: str, **kwargs):
        """
        Use `model` to create an image open vocabulary detection pipeline
        for prediction.

        Args:
            model: model id on the ModelScope hub.

        Example:
            >>> from modelscope.pipelines import pipeline
            >>> vild_pipeline = pipeline(
            >>>     Tasks.open_vocabulary_detection,
            >>>     model='damo/cv_resnet152_open-vocabulary-detection_vild')
            >>> image_path = 'test.jpg'
            >>> category_names = ';'.join([
            >>>     'flipflop', 'street sign', 'bracelet', 'necklace',
            >>>     'shorts', 'floral camisole', 'orange shirt',
            >>>     'purple dress', 'yellow tee', 'green umbrella',
            >>>     'pink striped umbrella', 'transparent umbrella',
            >>>     'plain pink umbrella', 'blue patterned umbrella',
            >>>     'koala', 'electric box', 'car', 'pole'
            >>> ])
            >>> input_dict = {'img': image_path,
            >>>               'category_names': category_names}
            >>> result = vild_pipeline(input_dict)
            >>> print(result[OutputKeys.BOXES])
        """
        super().__init__(model=model, **kwargs)

        logger.info('open vocabulary detection model, pipeline init')

    def preprocess(self, input: Dict[str, Any]) -> Dict[str, Any]:
        img = LoadImage(mode='rgb')(input['img'])['img']
        data = {'img': img, 'category_names': input['category_names']}

        return data

    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
        results = self.model.forward(**input)
        return results

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        scores, labels, bboxes = self.model.postprocess(inputs)

        outputs = {
            OutputKeys.SCORES: scores,
            OutputKeys.LABELS: labels,
            OutputKeys.BOXES: bboxes
        }

        return outputs
@@ -47,6 +47,7 @@ class CVTasks(object):
     image_object_detection = 'image-object-detection'
     video_object_detection = 'video-object-detection'
     image_fewshot_detection = 'image-fewshot-detection'
+    open_vocabulary_detection = 'open-vocabulary-detection'
     object_detection_3d = 'object-detection-3d'

     image_segmentation = 'image-segmentation'
tests/pipelines/test_image_open_vocabulary_detection.py (new file, 83 lines)
@@ -0,0 +1,83 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import unittest

import cv2

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models import Model
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.cv.image_utils import draw_box
from modelscope.utils.demo_utils import DemoCompatibilityCheck
from modelscope.utils.logger import get_logger
from modelscope.utils.test_utils import test_level

logger = get_logger()


class ImageOpenVocabularyDetectionTest(unittest.TestCase,
                                       DemoCompatibilityCheck):

    def setUp(self) -> None:
        # ViLD uses tf.compat.v1 on a TF2 install; swap TF2 in for this test.
        os.system(
            'pip install tensorflow==2.9.2 -i https://pypi.tuna.tsinghua.edu.cn/simple'
        )
        logger.info('upgrade tensorflow finished')

        self.task = Tasks.open_vocabulary_detection
        self.model_id = 'damo/cv_resnet152_open-vocabulary-detection_vild'
        self.image = 'data/test/images/image_open_vocabulary_detection.jpg'
        self.category_names = ';'.join([
            'flipflop', 'street sign', 'bracelet', 'necklace', 'shorts',
            'floral camisole', 'orange shirt', 'purple dress', 'yellow tee',
            'green umbrella', 'pink striped umbrella', 'transparent umbrella',
            'plain pink umbrella', 'blue patterned umbrella', 'koala',
            'electric box', 'car', 'pole'
        ])
        self.input = {'img': self.image, 'category_names': self.category_names}

    def tearDown(self) -> None:
        # Restore the TF1 environment the rest of the suite expects.
        os.system(
            'pip install tensorflow-gpu==1.15 -i https://pypi.tuna.tsinghua.edu.cn/simple'
        )
        logger.info('downgrade tensorflow finished')
        return super().tearDown()

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_model_from_modelhub(self):
        model = Model.from_pretrained(self.model_id)
        vild_pipeline = pipeline(task=self.task, model=model)
        result = vild_pipeline(input=self.input)
        image = cv2.imread(self.image)
        draw_box(image, result[OutputKeys.BOXES][0, :])
        cv2.imwrite('result_modelhub.jpg', image)
        print('Test run with model from modelhub ok.')

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_with_model_name(self):
        vild_pipeline = pipeline(task=self.task, model=self.model_id)
        result = vild_pipeline(self.input)
        image = cv2.imread(self.image)
        draw_box(image, result[OutputKeys.BOXES][0, :])
        cv2.imwrite('result_modelname.jpg', image)
        print('Test run with model name ok.')

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_by_direct_model_download(self):
        cache_path = snapshot_download(self.model_id)
        vild_pipeline = pipeline(self.task, model=cache_path)
        result = vild_pipeline(input=self.input)
        image = cv2.imread(self.image)
        draw_box(image, result[OutputKeys.BOXES][0, :])
        cv2.imwrite('result_snapshot.jpg', image)
        print('Test run with snapshot ok.')

    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
    def test_demo_compatibility(self):
        self.compatibility_check()


if __name__ == '__main__':
    unittest.main()