add open vocabulary detection

添加了开放词汇目标检测任务和模型

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/11548569
This commit is contained in:
qize.yqz
2023-02-10 06:37:51 +00:00
committed by wenmeng.zwm
parent ca1321f53f
commit 5e88cfe693
10 changed files with 590 additions and 2 deletions

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5b5861ca8955f8ff906abe78f2b32bc49deee2832f4518ffe4bb584653f3c9e9
size 187443

View File

@@ -91,6 +91,7 @@ class Models(object):
image_probing_model = 'image-probing-model'
defrcn = 'defrcn'
image_face_fusion = 'image-face-fusion'
open_vocabulary_detection_vild = 'open-vocabulary-detection-vild'
ecbsr = 'ecbsr'
msrresnet_lite = 'msrresnet-lite'
object_detection_3d = 'object_detection_3d'
@@ -346,6 +347,7 @@ class Pipelines(object):
image_structured_model_probing = 'image-structured-model-probing'
image_fewshot_detection = 'image-fewshot-detection'
image_face_fusion = 'image-face-fusion'
open_vocabulary_detection_vild = 'open-vocabulary-detection-vild'
ddpm_image_semantic_segmentation = 'ddpm-image-semantic-segmentation'
video_colorization = 'video-colorization'
motion_generattion = 'mdm-motion-generation'

View File

@@ -0,0 +1,22 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING:
from .vild import OpenVocabularyDetectionViLD
else:
_import_structure = {
'vild': ['OpenVocabularyDetectionViLD'],
}
import sys
sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

View File

@@ -0,0 +1,390 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import os
from typing import Any, Dict, Union
import clip
import numpy as np
import tensorflow.compat.v1 as tf
import torch.cuda
from scipy.special import softmax
from modelscope.metainfo import Models
from modelscope.models.base import Tensor
from modelscope.models.base.base_model import Model
from modelscope.models.builder import MODELS
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger
logger = get_logger()
# Registered under the open-vocabulary detection task so the builder can
# construct this model from its module name.
@MODELS.register_module(
Tasks.open_vocabulary_detection,
module_name=Models.open_vocabulary_detection_vild)
class OpenVocabularyDetectionViLD(Model):
"""
Vild: Open-Vocabulary Detection via Vision and Language Knowledge Distillation
https://arxiv.org/abs/2104.13921
"""
def __init__(self, model_dir, *args, **kwargs):
# NOTE(review): Model.__init__ is never invoked here — confirm the base
# class does not rely on its own initialisation.
self.model_dir = model_dir
# Requested device name; defaults to 'gpu'.
device_name = kwargs.get('device', 'gpu')
self._device_name = device_name
# Frozen TF inference graph shipped with the model files.
model_path = os.path.join(model_dir, ModelFile.TF_GRAPH_FILE)
graph = tf.Graph()
with graph.as_default():
config = tf.ConfigProto()
# Cap TF's GPU memory usage so CLIP (PyTorch) can share the same card.
config.gpu_options.per_process_gpu_memory_fraction = 0.2
compute_graph = tf.Graph()
compute_graph.as_default()
sess = tf.Session(config=config)
# Load the serialized GraphDef and import it into the session's graph.
with tf.gfile.GFile(model_path, 'rb') as fid:
graph_def = tf.GraphDef()
graph_def.ParseFromString(fid.read())
tf.import_graph_def(graph_def, name='')
self.sess = sess
# CLIP text encoder used to embed the user-supplied category names.
# NOTE(review): device is hard-coded to 'cuda:0' and ignores the
# 'device' kwarg above — this fails on CPU-only hosts; confirm intended.
self.clip, self.clip_preprocess = clip.load(
'ViT-B/32', device='cuda:0')
self.prompt_engineering = True
# When True, prompts starting with an article get a 'This is ' prefix.
self.this_is = True
# Softmax temperature, applied only when use_softmax is True.
self.temperature = 100.0
self.use_softmax = False
# Output tensor names fetched from the frozen graph, in fixed order
# (consumed positionally by forward()).
self.out_name = [
'RoiBoxes:0', 'RoiScores:0', '2ndStageBoxes:0',
'2ndStageScoresUnused:0', 'BoxOutputs:0', 'MaskOutputs:0',
'VisualFeatOutputs:0', 'ImageInfo:0'
]
def __call__(self, *args, **kwargs) -> Dict[str, Any]:
    """Run the forward pass, then convert raw outputs to final results."""
    raw_outputs = self.forward(*args, **kwargs)
    return self.postprocess(raw_outputs)
def forward(self, img: np.array, category_names: str,
            **kwargs) -> Dict[str, Any]:
    """Run the frozen TF detection graph on one image.

    Args:
        img: decoded image array, fed to the graph's 'Placeholder:0' input.
        category_names: ';'-separated category string, passed through
            unchanged for later use in `postprocess`.

    Returns:
        Dict[str, Any]: the eight graph outputs plus the category string.
    """
    output_keys = [
        'roi_boxes', 'roi_scores', 'detection_boxes', 'scores_unused',
        'box_outputs', 'detection_masks', 'visual_features', 'image_info'
    ]
    # self.out_name lists the fetched tensors in the same fixed order as
    # output_keys above.
    fetched = self.sess.run(self.out_name, feed_dict={'Placeholder:0': img})
    return_dict = dict(zip(output_keys, fetched))
    return_dict['category_names'] = category_names
    return return_dict
def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
    """Convert raw detector outputs into final open-vocabulary results.

    Filters region proposals with NMS plus score/area thresholds, embeds
    the requested category names with CLIP, scores every surviving box
    against every category and drops boxes whose best match is the
    implicit background class.

    Args:
        inputs: dict produced by `forward` (graph outputs plus the raw
            'category_names' string).

    Return:
        Tuple (scores, categories, bboxes):
            scores: (num_det, num_categories) per-category scores.
            categories: per-detection copy of the category-name list.
            bboxes: (num_det, 4) boxes as [xmin, ymin, xmax, ymax] in
                original-image coordinates.
    """
    # Fixed postprocess hyper-parameters.
    max_boxes_to_return = 25
    nms_threshold = 0.6
    min_rpn_score_thresh = 0.9
    min_box_area = 220

    roi_boxes = inputs['roi_boxes']
    roi_scores = inputs['roi_scores']
    detection_boxes = inputs['detection_boxes']
    scores_unused = inputs['scores_unused']
    box_outputs = inputs['box_outputs']
    detection_masks = inputs['detection_masks']
    visual_features = inputs['visual_features']
    image_info = inputs['image_info']
    category_names = inputs['category_names']

    # Parse the ';'-separated category string; index 0 is always background.
    category_names = [x.strip() for x in category_names.split(';')]
    category_names = ['background'] + category_names
    categories = [{
        'name': item,
        'id': idx + 1,
    } for idx, item in enumerate(category_names)]

    # Drop the batch dimension from every graph output.
    roi_boxes = np.squeeze(roi_boxes, axis=0)
    roi_scores = np.squeeze(roi_scores, axis=0)
    detection_boxes = np.squeeze(detection_boxes, axis=(0, 2))
    scores_unused = np.squeeze(scores_unused, axis=0)
    box_outputs = np.squeeze(box_outputs, axis=0)
    detection_masks = np.squeeze(detection_masks, axis=0)
    visual_features = np.squeeze(visual_features, axis=0)

    # image_info row 2 holds the preprocessing scale factors; map boxes
    # back to the original image resolution.
    image_info = np.squeeze(image_info, axis=0)
    image_scale = np.tile(image_info[2:3, :], (1, 2))
    rescaled_detection_boxes = detection_boxes / image_scale

    # Apply non-maximum suppression to detected boxes.
    nmsed_indices = nms(detection_boxes, roi_scores, thresh=nms_threshold)

    # RPN box areas in original-image coordinates.
    box_sizes = (rescaled_detection_boxes[:, 2]
                 - rescaled_detection_boxes[:, 0]) * (
                     rescaled_detection_boxes[:, 3]
                     - rescaled_detection_boxes[:, 1])

    # Keep rois that survived NMS, are non-degenerate and pass the score
    # and minimum-area thresholds. Uses the builtin `int` dtype: the old
    # `np.int` alias was removed in NumPy 1.24 and raised AttributeError.
    valid_indices = np.where(
        np.logical_and(
            np.isin(np.arange(len(roi_scores), dtype=int), nmsed_indices),
            np.logical_and(
                np.logical_not(np.all(roi_boxes == 0., axis=-1)),
                np.logical_and(roi_scores >= min_rpn_score_thresh,
                               box_sizes > min_box_area))))[0]

    detection_boxes = detection_boxes[valid_indices][:max_boxes_to_return,
                                                     ...]
    detection_masks = detection_masks[valid_indices][:max_boxes_to_return,
                                                     ...]
    detection_visual_feat = visual_features[
        valid_indices][:max_boxes_to_return, ...]
    rescaled_detection_boxes = rescaled_detection_boxes[
        valid_indices][:max_boxes_to_return, ...]

    # Score each box's distilled visual feature against the CLIP text
    # embedding of every category.
    text_features = self._build_text_embedings(categories)
    raw_scores = detection_visual_feat.dot(text_features.T)
    if self.use_softmax:
        scores_all = softmax(self.temperature * raw_scores, axis=-1)
    else:
        scores_all = raw_scores

    # Rank detections by their best per-category score (descending).
    indices = np.argsort(-np.max(scores_all, axis=1))

    # Graph boxes are [ymin, xmin, ymax, xmax]; reorder to
    # [xmin, ymin, xmax, ymax] for the standard output format.
    ymin, xmin, ymax, xmax = np.split(rescaled_detection_boxes, 4, axis=-1)
    processed_boxes = np.concatenate([xmin, ymin, xmax, ymax], axis=-1)
    n_boxes = processed_boxes.shape[0]

    bboxes = []
    scores = []
    det_categories = []
    for anno_idx in indices[0:int(n_boxes)]:
        anno_scores = scores_all[anno_idx]
        # Skip detections whose best match is the background class (0).
        if np.argmax(anno_scores) == 0:
            continue
        bboxes.append(processed_boxes[anno_idx])
        # Background score (index 0) is stripped from the reported scores.
        scores.append(anno_scores[1:])
        det_categories.append(category_names[1:])
    # Guard the no-detection case: np.vstack([]) would raise.
    if not bboxes:
        return (np.zeros((0, len(category_names) - 1)), [],
                np.zeros((0, 4)))
    bboxes = np.vstack(bboxes)
    scores = np.vstack(scores)
    return scores, det_categories, bboxes
def _build_text_embedings(self, categories):
"""Build one ensembled CLIP text embedding per category.

Each category name is formatted into every template in
`multiple_templates`; per-template embeddings are L2-normalised,
averaged, and the mean is re-normalised (prompt ensembling).

Args:
    categories: list of dicts with at least a 'name' key.

Returns:
    np.ndarray of shape (num_categories, embed_dim).
"""
def processed_name(name, rm_dot=False):
# Normalise dataset-style names:
# _ for lvis
# / for obj365
res = name.replace('_', ' ').replace('/', ' or ').lower()
if rm_dot:
res = res.rstrip('.')
return res
def article(name):
# Crude 'a'/'an' selection from the first letter of the name.
return 'an' if name[0] in 'aeiou' else 'a'
templates = multiple_templates
run_on_gpu = torch.cuda.is_available()
with torch.no_grad():
all_text_embeddings = []
for category in categories:
# Fill the category name (and its article) into every template.
texts = [
template.format(
processed_name(category['name'], rm_dot=True),
article=article(category['name']))
for template in templates
]
if self.this_is:
# Optionally prefix prompts that start with an article.
texts = [
'This is ' + text if text.startswith('a')
or text.startswith('the') else text for text in texts
]
# tokenize
texts = clip.tokenize(texts)
if run_on_gpu:
texts = texts.cuda()
# embed with text encoder
text_embeddings = self.clip.encode_text(texts)
text_embeddings /= text_embeddings.norm(dim=-1, keepdim=True)
# Average over templates, then re-normalise the ensemble.
text_embedding = text_embeddings.mean(dim=0)
text_embedding /= text_embedding.norm()
all_text_embeddings.append(text_embedding)
all_text_embeddings = torch.stack(all_text_embeddings, dim=1)
if run_on_gpu:
all_text_embeddings = all_text_embeddings.cuda()
return all_text_embeddings.cpu().numpy().T
# Prompt templates for CLIP prompt ensembling: every category name is
# formatted into each template and the resulting text embeddings are
# averaged (see _build_text_embedings). '{article}' is filled with 'a'/'an'.
multiple_templates = [
'There is {article} {} in the scene.',
'There is the {} in the scene.',
'a photo of {article} {} in the scene.',
'a photo of the {} in the scene.',
'a photo of one {} in the scene.',
'itap of {article} {}.',
'itap of my {}.',  # itap: I took a picture of
'itap of the {}.',
'a photo of {article} {}.',
'a photo of my {}.',
'a photo of the {}.',
'a photo of one {}.',
'a photo of many {}.',
'a good photo of {article} {}.',
'a good photo of the {}.',
'a bad photo of {article} {}.',
'a bad photo of the {}.',
'a photo of a nice {}.',
'a photo of the nice {}.',
'a photo of a cool {}.',
'a photo of the cool {}.',
'a photo of a weird {}.',
'a photo of the weird {}.',
'a photo of a small {}.',
'a photo of the small {}.',
'a photo of a large {}.',
'a photo of the large {}.',
'a photo of a clean {}.',
'a photo of the clean {}.',
'a photo of a dirty {}.',
'a photo of the dirty {}.',
'a bright photo of {article} {}.',
'a bright photo of the {}.',
'a dark photo of {article} {}.',
'a dark photo of the {}.',
'a photo of a hard to see {}.',
'a photo of the hard to see {}.',
'a low resolution photo of {article} {}.',
'a low resolution photo of the {}.',
'a cropped photo of {article} {}.',
'a cropped photo of the {}.',
'a close-up photo of {article} {}.',
'a close-up photo of the {}.',
'a jpeg corrupted photo of {article} {}.',
'a jpeg corrupted photo of the {}.',
'a blurry photo of {article} {}.',
'a blurry photo of the {}.',
'a pixelated photo of {article} {}.',
'a pixelated photo of the {}.',
'a black and white photo of the {}.',
'a black and white photo of {article} {}.',
'a plastic {}.',
'the plastic {}.',
'a toy {}.',
'the toy {}.',
'a plushie {}.',
'the plushie {}.',
'a cartoon {}.',
'the cartoon {}.',
'an embroidered {}.',
'the embroidered {}.',
'a painting of the {}.',
'a painting of a {}.',
]
def nms(dets, scores, thresh, max_dets=1000):
    """Greedy non-maximum suppression.

    Args:
        dets: [N, 4] boxes as [ymin, xmin, ymax, xmax].
        scores: [N,] confidence per box.
        thresh: IoU threshold above which a box is suppressed. Float.
        max_dets: maximum number of boxes to keep. int.

    Returns:
        List of kept box indices, highest score first.
    """
    ys1, xs1 = dets[:, 0], dets[:, 1]
    ys2, xs2 = dets[:, 2], dets[:, 3]
    box_areas = (xs2 - xs1) * (ys2 - ys1)
    # Candidate indices, best score first.
    candidates = np.argsort(scores)[::-1]
    selected = []
    while candidates.size > 0 and len(selected) < max_dets:
        best = candidates[0]
        selected.append(best)
        rest = candidates[1:]
        # Intersection rectangle of the best box with every remaining box.
        top = np.maximum(ys1[best], ys1[rest])
        left = np.maximum(xs1[best], xs1[rest])
        bottom = np.minimum(ys2[best], ys2[rest])
        right = np.minimum(xs2[best], xs2[rest])
        inter = np.maximum(0.0, right - left) * np.maximum(0.0, bottom - top)
        # IoU with a tiny epsilon to avoid division by zero.
        iou = inter / (box_areas[best] + box_areas[rest] - inter + 1e-12)
        # Drop every remaining box that overlaps the winner too much.
        candidates = rest[iou <= thresh]
    return selected

View File

@@ -249,6 +249,8 @@ TASK_OUTPUTS = {
[OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES],
Tasks.domain_specific_object_detection:
[OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES],
Tasks.open_vocabulary_detection:
[OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES],
# video object detection result for single sample
# {
@@ -419,8 +421,9 @@ TASK_OUTPUTS = {
# "output_video": "path_to_rendered_video" , this is optional
# and is only avaialbe when the "render" option is enabled.
# }
Tasks.body_3d_keypoints:
[OutputKeys.KEYPOINTS, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO],
Tasks.body_3d_keypoints: [
OutputKeys.KEYPOINTS, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO
],
# 3D face reconstruction result for single sample
# {

View File

@@ -86,6 +86,10 @@ TASK_INPUTS = {
InputType.IMAGE,
Tasks.image_fewshot_detection:
InputType.IMAGE,
Tasks.open_vocabulary_detection: {
'img': InputType.IMAGE,
'category_names': InputType.TEXT
},
Tasks.image_driving_perception:
InputType.IMAGE,
Tasks.vision_efficient_tuning:

View File

@@ -97,6 +97,7 @@ if TYPE_CHECKING:
from .image_structured_model_probing_pipeline import ImageStructuredModelProbingPipeline
from .video_colorization_pipeline import VideoColorizationPipeline
from .image_defrcn_fewshot_pipeline import ImageDefrcnDetectionPipeline
from .image_open_vocabulary_detection_pipeline import ImageOpenVocabularyDetectionPipeline
from .object_detection_3d_pipeline import ObjectDetection3DPipeline
from .ddpm_semantic_segmentation_pipeline import DDPMImageSemanticSegmentationPipeline
from .image_inpainting_sdv2_pipeline import ImageInpaintingSDV2Pipeline
@@ -244,6 +245,9 @@ else:
],
'video_colorization_pipeline': ['VideoColorizationPipeline'],
'image_defrcn_fewshot_pipeline': ['ImageDefrcnDetectionPipeline'],
'image_open_vocabulary_detection_pipeline': [
'ImageOpenVocabularyDetectionPipeline'
],
'object_detection_3d_pipeline': ['ObjectDetection3DPipeline'],
'image_inpainting_sdv2_pipeline': ['ImageInpaintingSDV2Pipeline'],
'image_quality_assessment_mos_pipeline': [

View File

@@ -0,0 +1,76 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Any, Dict, Union
import cv2
import numpy as np
import PIL
import torch
from PIL import Image
from modelscope.metainfo import Pipelines
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Model, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
logger = get_logger()
@PIPELINES.register_module(
Tasks.open_vocabulary_detection,
module_name=Pipelines.open_vocabulary_detection_vild)
class ImageOpenVocabularyDetectionPipeline(Pipeline):
"""Pipeline wrapping the ViLD open-vocabulary detection model."""
def __init__(self, model: str, **kwargs):
"""
use `model` to create a image open vocabulary detection pipeline for prediction
Args:
model: model id on modelscope hub.
Example:
>>> from modelscope.pipelines import pipeline
>>> vild_pipeline = pipeline(Tasks.open_vocabulary_detection,
model='damo/cv_resnet152_open-vocabulary-detection_vild')
>>> image_path = 'test.jpg'
>>> category_names = ';'.join([
'flipflop', 'street sign', 'bracelet', 'necklace', 'shorts',
'floral camisole', 'orange shirt', 'purple dress', 'yellow tee',
'green umbrella', 'pink striped umbrella', 'transparent umbrella',
'plain pink umbrella', 'blue patterned umbrella', 'koala',
'electric box', 'car', 'pole'
])
>>> input_dict = {'img':image_path, 'category_names':category_names}
>>> result = vild_pipeline(input_dict)
>>> print(result[OutputKeys.BOXES])
"""
super().__init__(model=model, **kwargs)
logger.info('open vocabulary detection model, pipeline init')
def preprocess(self, input: Dict[str, Any]) -> Dict[str, Any]:
# Load the image as RGB; pass the category string through unchanged.
img = LoadImage(mode='rgb')(input['img'])['img']
data = {'img': img, 'category_names': input['category_names']}
return data
def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
# Delegate to the model's forward pass (frozen TF graph inference).
results = self.model.forward(**input)
return results
def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
# The model's postprocess returns a (scores, labels, boxes) tuple; map
# it onto the standard task output keys.
scores, labels, bboxes = self.model.postprocess(inputs)
outputs = {
OutputKeys.SCORES: scores,
OutputKeys.LABELS: labels,
OutputKeys.BOXES: bboxes
}
return outputs

View File

@@ -47,6 +47,7 @@ class CVTasks(object):
image_object_detection = 'image-object-detection'
video_object_detection = 'video-object-detection'
image_fewshot_detection = 'image-fewshot-detection'
open_vocabulary_detection = 'open-vocabulary-detection'
object_detection_3d = 'object-detection-3d'
image_segmentation = 'image-segmentation'

View File

@@ -0,0 +1,83 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import unittest
import cv2
from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models import Model
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.cv.image_utils import draw_box
from modelscope.utils.demo_utils import DemoCompatibilityCheck
from modelscope.utils.logger import get_logger
from modelscope.utils.test_utils import test_level
logger = get_logger()
class ImageOpenVocabularyDetectionTest(unittest.TestCase,
DemoCompatibilityCheck):
"""Integration tests for the ViLD open-vocabulary detection pipeline."""
def setUp(self) -> None:
# NOTE(review): setUp runs before *every* test method, so this pip
# install (and the matching downgrade in tearDown) executes once per
# test — slow and environment-mutating. Presumably the CI image ships
# TF1 while the frozen graph needs TF2; confirm and consider
# setUpClass/tearDownClass instead.
os.system(
'pip install tensorflow==2.9.2 -i https://pypi.tuna.tsinghua.edu.cn/simple'
)
logger.info('upgrade tensorflow finished')
self.task = Tasks.open_vocabulary_detection
self.model_id = 'damo/cv_resnet152_open-vocabulary-detection_vild'
self.image = 'data/test/images/image_open_vocabulary_detection.jpg'
# ';'-separated open-vocabulary category names fed to the pipeline.
self.category_names = ';'.join([
'flipflop', 'street sign', 'bracelet', 'necklace', 'shorts',
'floral camisole', 'orange shirt', 'purple dress', 'yellow tee',
'green umbrella', 'pink striped umbrella', 'transparent umbrella',
'plain pink umbrella', 'blue patterned umbrella', 'koala',
'electric box', 'car', 'pole'
])
self.input = {'img': self.image, 'category_names': self.category_names}
def tearDown(self) -> None:
# Restore the environment's original TF install after each test.
os.system(
'pip install tensorflow-gpu==1.15 -i https://pypi.tuna.tsinghua.edu.cn/simple'
)
logger.info('degrade tensorflow finished')
return super().tearDown()
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_with_model_from_modelhub(self):
# Build the pipeline from an already-instantiated Model object.
model = Model.from_pretrained(self.model_id)
vild_pipeline = pipeline(task=self.task, model=model)
result = vild_pipeline(input=self.input)
image = cv2.imread(self.image)
draw_box(image, result[OutputKeys.BOXES][0, :])
cv2.imwrite('result_modelhub.jpg', image)
print('Test run with model from modelhub ok.')
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_with_model_name(self):
# Build the pipeline directly from the model id string.
vild_pipeline = pipeline(task=self.task, model=self.model_id)
result = vild_pipeline(self.input)
image = cv2.imread(self.image)
draw_box(image, result[OutputKeys.BOXES][0, :])
cv2.imwrite('result_modelname.jpg', image)
print('Test run with model name ok.')
@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_by_direct_model_download(self):
# Build the pipeline from a locally downloaded model snapshot path.
cache_path = snapshot_download(self.model_id)
vild_pipeline = pipeline(self.task, model=cache_path)
result = vild_pipeline(input=self.input)
image = cv2.imread(self.image)
draw_box(image, result[OutputKeys.BOXES][0, :])
cv2.imwrite('result_snapshot.jpg', image)
print('Test run with snapshot ok.')
@unittest.skip('demo compatibility test is only enabled on a needed-basis')
def test_demo_compatibility(self):
self.compatibility_check()
if __name__ == '__main__':
unittest.main()