Feature/image normal estimation (#683)

* image_normal_estimation

* image_normal_estimation

* update according to pr review

* update submodule data test

---------

Co-authored-by: Weihao Yuan <qianmu.ywh@alibaba-inc.com>
This commit is contained in:
Weihao Yuan
2024-01-09 11:53:02 +08:00
committed by GitHub
parent 2d528ed482
commit 105247140c
15 changed files with 1322 additions and 4 deletions

View File

@@ -52,6 +52,7 @@ class Models(object):
vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation'
text_driven_segmentation = 'text-driven-segmentation'
newcrfs_depth_estimation = 'newcrfs-depth-estimation'
omnidata_normal_estimation = 'omnidata-normal-estimation'
panovit_layout_estimation = 'panovit-layout-estimation'
unifuse_depth_estimation = 'unifuse-depth-estimation'
s2net_depth_estimation = 's2net-depth-estimation'
@@ -388,6 +389,7 @@ class Pipelines(object):
language_guided_video_summarization = 'clip-it-video-summarization'
image_semantic_segmentation = 'image-semantic-segmentation'
image_depth_estimation = 'image-depth-estimation'
image_normal_estimation = 'image-normal-estimation'
indoor_layout_estimation = 'indoor-layout-estimation'
video_depth_estimation = 'video-depth-estimation'
panorama_depth_estimation = 'panorama-depth-estimation'
@@ -783,6 +785,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
Tasks.image_depth_estimation:
(Pipelines.image_depth_estimation,
'damo/cv_newcrfs_image-depth-estimation_indoor'),
Tasks.image_normal_estimation:
(Pipelines.image_normal_estimation,
'Damo_XR_Lab/cv_omnidata_image-normal-estimation_normal'),
Tasks.indoor_layout_estimation:
(Pipelines.indoor_layout_estimation,
'damo/cv_panovit_indoor-layout-estimation'),
@@ -820,9 +825,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
'damo/cv_convnextTiny_ocr-recognition-general_damo'),
Tasks.skin_retouching: (Pipelines.skin_retouching,
'damo/cv_unet_skin-retouching'),
Tasks.faq_question_answering:
(Pipelines.faq_question_answering,
'damo/nlp_structbert_faq-question-answering_chinese-base'),
Tasks.faq_question_answering: (
Pipelines.faq_question_answering,
'damo/nlp_structbert_faq-question-answering_chinese-base'),
Tasks.crowd_counting: (Pipelines.crowd_counting,
'damo/cv_hrnet_crowd-counting_dcanet'),
Tasks.video_single_object_tracking: (

View File

@@ -0,0 +1,22 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule
# Static type checkers see the real import below; at runtime this module is
# replaced in sys.modules by a LazyImportModule so that heavy dependencies
# (torch, timm) are only imported on first attribute access.
if TYPE_CHECKING:
    from .omnidata_model import OmnidataNormalEstimation
else:
    # Maps submodule name -> public names it provides, for lazy resolution.
    _import_structure = {
        'omnidata_model': ['OmnidataNormalEstimation'],
    }
    import sys
    # NOTE: replacing sys.modules[__name__] must happen at import time,
    # before any caller touches attributes of this package.
    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )

View File

@@ -0,0 +1,20 @@
# This implementation is adopted from MiDaS
# made publicly available under the MIT license
# https://github.com/isl-org/MiDaS
import torch
class BaseModel(torch.nn.Module):
    """Minimal base class adding checkpoint loading to ``nn.Module``."""

    def load(self, path):
        """Load model weights from a checkpoint file.

        Args:
            path (str): file path
        """
        checkpoint = torch.load(path, map_location=torch.device('cpu'))
        # Full training checkpoints carry an optimizer state and nest the
        # weights under 'model'; bare checkpoints are the state dict itself.
        state = checkpoint['model'] if 'optimizer' in checkpoint else checkpoint
        self.load_state_dict(state)

View File

@@ -0,0 +1,395 @@
# This implementation is adopted from MiDaS
# made publicly available under the MIT license
# https://github.com/isl-org/MiDaS
import torch
import torch.nn as nn
from .vit import (_make_pretrained_vitb16_384, _make_pretrained_vitb_rn50_384,
_make_pretrained_vitl16_384, forward_vit)
def _make_encoder(
backbone,
features,
use_pretrained,
groups=1,
expand=False,
exportable=True,
hooks=None,
use_vit_only=False,
use_readout='ignore',
):
if backbone == 'vitl16_384':
pretrained = _make_pretrained_vitl16_384(
use_pretrained, hooks=hooks, use_readout=use_readout)
scratch = _make_scratch(
[256, 512, 1024, 1024], features, groups=groups,
expand=expand) # ViT-L/16 - 85.0% Top1 (backbone)
elif backbone == 'vitb_rn50_384':
pretrained = _make_pretrained_vitb_rn50_384(
use_pretrained,
hooks=hooks,
use_vit_only=use_vit_only,
use_readout=use_readout,
)
scratch = _make_scratch(
[256, 512, 768, 768], features, groups=groups,
expand=expand) # ViT-H/16 - 85.0% Top1 (backbone)
elif backbone == 'vitb16_384':
pretrained = _make_pretrained_vitb16_384(
use_pretrained, hooks=hooks, use_readout=use_readout)
scratch = _make_scratch(
[96, 192, 384, 768], features, groups=groups,
expand=expand) # ViT-B/16 - 84.6% Top1 (backbone)
elif backbone == 'resnext101_wsl':
pretrained = _make_pretrained_resnext101_wsl(use_pretrained)
scratch = _make_scratch([256, 512, 1024, 2048],
features,
groups=groups,
expand=expand) # efficientnet_lite3
elif backbone == 'efficientnet_lite3':
pretrained = _make_pretrained_efficientnet_lite3(
use_pretrained, exportable=exportable)
scratch = _make_scratch([32, 48, 136, 384],
features,
groups=groups,
expand=expand) # efficientnet_lite3
else:
print(f"Backbone '{backbone}' not implemented")
assert False
return pretrained, scratch
def _make_scratch(in_shape, out_shape, groups=1, expand=False):
scratch = nn.Module()
out_shape1 = out_shape
out_shape2 = out_shape
out_shape3 = out_shape
out_shape4 = out_shape
if expand is True:
out_shape1 = out_shape
out_shape2 = out_shape * 2
out_shape3 = out_shape * 4
out_shape4 = out_shape * 8
scratch.layer1_rn = nn.Conv2d(
in_shape[0],
out_shape1,
kernel_size=3,
stride=1,
padding=1,
bias=False,
groups=groups)
scratch.layer2_rn = nn.Conv2d(
in_shape[1],
out_shape2,
kernel_size=3,
stride=1,
padding=1,
bias=False,
groups=groups)
scratch.layer3_rn = nn.Conv2d(
in_shape[2],
out_shape3,
kernel_size=3,
stride=1,
padding=1,
bias=False,
groups=groups)
scratch.layer4_rn = nn.Conv2d(
in_shape[3],
out_shape4,
kernel_size=3,
stride=1,
padding=1,
bias=False,
groups=groups)
return scratch
def _make_pretrained_efficientnet_lite3(use_pretrained, exportable=False):
    """Fetch an EfficientNet-Lite3 via torch.hub and regroup it into stages."""
    effnet = torch.hub.load(
        'rwightman/gen-efficientnet-pytorch',
        'tf_efficientnet_lite3',
        pretrained=use_pretrained,
        exportable=exportable)
    return _make_efficientnet_backbone(effnet)
def _make_efficientnet_backbone(effnet):
pretrained = nn.Module()
pretrained.layer1 = nn.Sequential(effnet.conv_stem, effnet.bn1,
effnet.act1, *effnet.blocks[0:2])
pretrained.layer2 = nn.Sequential(*effnet.blocks[2:3])
pretrained.layer3 = nn.Sequential(*effnet.blocks[3:5])
pretrained.layer4 = nn.Sequential(*effnet.blocks[5:9])
return pretrained
def _make_resnet_backbone(resnet):
pretrained = nn.Module()
pretrained.layer1 = nn.Sequential(resnet.conv1, resnet.bn1, resnet.relu,
resnet.maxpool, resnet.layer1)
pretrained.layer2 = resnet.layer2
pretrained.layer3 = resnet.layer3
pretrained.layer4 = resnet.layer4
return pretrained
def _make_pretrained_resnext101_wsl(use_pretrained):
    """Fetch ResNeXt101-WSL via torch.hub and regroup it into stages."""
    # NOTE: torch.hub always downloads weights here; `use_pretrained` is
    # accepted for API symmetry but not forwarded by the hub entry point.
    backbone = torch.hub.load('facebookresearch/WSL-Images',
                              'resnext101_32x8d_wsl')
    return _make_resnet_backbone(backbone)
class Interpolate(nn.Module):
    """Resample a tensor by a fixed scale factor (module wrapper around
    ``nn.functional.interpolate`` so it can live inside a Sequential).
    """

    def __init__(self, scale_factor, mode, align_corners=False):
        """Init.

        Args:
            scale_factor (float): scaling
            mode (str): interpolation mode
            align_corners (bool): forwarded to ``interpolate``.
        """
        super(Interpolate, self).__init__()
        self.interp = nn.functional.interpolate
        self.scale_factor = scale_factor
        self.mode = mode
        self.align_corners = align_corners

    def forward(self, x):
        """Return ``x`` resampled by ``scale_factor``."""
        return self.interp(
            x,
            scale_factor=self.scale_factor,
            mode=self.mode,
            align_corners=self.align_corners)
class ResidualConvUnit(nn.Module):
    """Residual block: two 3x3 convs with ReLU pre-activations and a skip.

    NOTE: the ReLU is in-place, so the skip connection adds relu(x),
    matching the original MiDaS behavior.
    """

    def __init__(self, features):
        """Init.

        Args:
            features (int): channel count, preserved throughout.
        """
        super().__init__()
        self.conv1 = nn.Conv2d(
            features, features, kernel_size=3, stride=1, padding=1, bias=True)
        self.conv2 = nn.Conv2d(
            features, features, kernel_size=3, stride=1, padding=1, bias=True)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Apply relu-conv-relu-conv and add the (relu'd) input back."""
        residual = self.conv1(self.relu(x))
        residual = self.conv2(self.relu(residual))
        return residual + x
class FeatureFusionBlock(nn.Module):
    """Fuse one or two feature maps and upsample the result by 2x."""

    def __init__(self, features):
        """Init.

        Args:
            features (int): number of features
        """
        super(FeatureFusionBlock, self).__init__()
        self.resConfUnit1 = ResidualConvUnit(features)
        self.resConfUnit2 = ResidualConvUnit(features)

    def forward(self, *xs):
        """Fuse the inputs; returns a bilinearly upsampled tensor.

        Returns:
            tensor: output
        """
        fused = xs[0]
        if len(xs) == 2:
            # In-place accumulation, mirroring the original `output += ...`.
            fused += self.resConfUnit1(xs[1])
        fused = self.resConfUnit2(fused)
        return nn.functional.interpolate(
            fused, scale_factor=2, mode='bilinear', align_corners=True)
class ResidualConvUnit_custom(nn.Module):
    """Residual conv unit with optional BatchNorm and a quantization-friendly
    skip connection (``nn.quantized.FloatFunctional``).
    """

    def __init__(self, features, activation, bn):
        """Init.

        Args:
            features (int): number of channels.
            activation (nn.Module): pre-activation applied before each conv.
            bn (bool): insert BatchNorm after each conv when True.
        """
        super().__init__()
        self.bn = bn
        self.groups = 1
        conv_kwargs = dict(
            kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
        self.conv1 = nn.Conv2d(features, features, **conv_kwargs)
        self.conv2 = nn.Conv2d(features, features, **conv_kwargs)
        if self.bn is True:
            self.bn1 = nn.BatchNorm2d(features)
            self.bn2 = nn.BatchNorm2d(features)
        self.activation = activation
        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, x):
        """Apply act-conv(-bn)-act-conv(-bn) and add the input back.

        Returns:
            tensor: output
        """
        out = self.conv1(self.activation(x))
        if self.bn is True:
            out = self.bn1(out)
        out = self.conv2(self.activation(out))
        if self.bn is True:
            out = self.bn2(out)
        if self.groups > 1:
            # NOTE(review): `conv_merge` is never defined and `groups` is
            # fixed to 1 above, so this branch is unreachable as written.
            out = self.conv_merge(out)
        return self.skip_add.add(out, x)
class FeatureFusionBlock_custom(nn.Module):
    """Feature fusion block with quantization-friendly add, optional channel
    expansion, and a 1x1 output projection after 2x upsampling.
    """

    def __init__(self,
                 features,
                 activation,
                 deconv=False,
                 bn=False,
                 expand=False,
                 align_corners=True):
        """Init.

        Args:
            features (int): number of input channels.
            activation (nn.Module): activation for the residual units.
            deconv (bool): stored only; no deconvolution is applied here.
            bn (bool): enable BatchNorm inside the residual units.
            expand (bool): halve the channel count in the output projection.
            align_corners (bool): passed to bilinear upsampling.
        """
        super(FeatureFusionBlock_custom, self).__init__()
        self.deconv = deconv
        self.align_corners = align_corners
        self.groups = 1
        self.expand = expand
        out_features = features // 2 if self.expand is True else features
        self.out_conv = nn.Conv2d(
            features,
            out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
            groups=1)
        self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
        self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)
        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, *xs):
        """Fuse one or two inputs, upsample 2x, and project channels.

        Returns:
            tensor: output
        """
        fused = xs[0]
        if len(xs) == 2:
            fused = self.skip_add.add(fused, self.resConfUnit1(xs[1]))
        fused = self.resConfUnit2(fused)
        fused = nn.functional.interpolate(
            fused,
            scale_factor=2,
            mode='bilinear',
            align_corners=self.align_corners)
        return self.out_conv(fused)

View File

@@ -0,0 +1,108 @@
# This implementation is adopted from MiDaS
# made publicly available under the MIT license
# https://github.com/isl-org/MiDaS
import torch
import torch.nn as nn
import torch.nn.functional as F
from .base_model import BaseModel
from .blocks import (FeatureFusionBlock, FeatureFusionBlock_custom,
Interpolate, _make_encoder, forward_vit)
def _make_fusion_block(features, use_bn):
    """Build one decoder fusion block with the DPT default settings."""
    return FeatureFusionBlock_custom(
        features,
        nn.ReLU(False),  # non-inplace ReLU inside the residual units
        deconv=False,
        bn=use_bn,
        expand=False,
        align_corners=True,
    )
class DPT(BaseModel):
    """Dense Prediction Transformer: hooked ViT encoder + RefineNet-style
    fusion decoder + a task-specific output head.
    """

    def __init__(
        self,
        head,
        features=256,
        backbone='vitb_rn50_384',
        readout='project',
        channels_last=False,
        use_bn=False,
    ):
        """Init.

        Args:
            head (nn.Module): prediction head applied after fusion.
            features (int): channel width of the fusion decoder.
            backbone (str): one of the variants listed in ``hooks`` below.
            readout (str): readout-token handling ('ignore'/'add'/'project').
            channels_last (bool): run the forward pass in channels-last layout.
            use_bn (bool): enable BatchNorm in the fusion blocks.
        """
        super(DPT, self).__init__()
        self.channels_last = channels_last
        # Transformer block indices to hook for multi-scale features.
        hooks = {
            'vitb_rn50_384': [0, 1, 8, 11],
            'vitb16_384': [2, 5, 8, 11],
            'vitl16_384': [5, 11, 17, 23],
        }
        # Instantiate backbone and reassemble blocks
        self.pretrained, self.scratch = _make_encoder(
            backbone,
            features,
            False,  # Set to True if you want to train from scratch, uses ImageNet weights
            groups=1,
            expand=False,
            exportable=False,
            hooks=hooks[backbone],
            use_readout=readout,
        )
        self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
        self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
        self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
        self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
        self.scratch.output_conv = head

    def forward(self, x):
        """Predict a dense map for the input batch ``x``."""
        if self.channels_last is True:
            # BUGFIX: Tensor.contiguous is not in-place — the original code
            # discarded the result, making this branch a no-op.
            x = x.contiguous(memory_format=torch.channels_last)
        layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x)
        # Project each hooked level to the decoder width.
        layer_1_rn = self.scratch.layer1_rn(layer_1)
        layer_2_rn = self.scratch.layer2_rn(layer_2)
        layer_3_rn = self.scratch.layer3_rn(layer_3)
        layer_4_rn = self.scratch.layer4_rn(layer_4)
        # Coarse-to-fine fusion; each refinenet upsamples by 2x.
        path_4 = self.scratch.refinenet4(layer_4_rn)
        path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
        path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
        path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
        out = self.scratch.output_conv(path_1)
        return out
class DPTDepthModel(DPT):
    """DPT with a regression head producing ``num_channels`` output maps
    (1 for depth, 3 for surface normals).
    """

    def __init__(self, path=None, non_negative=True, num_channels=1, **kwargs):
        """Init.

        Args:
            path (str | None): optional checkpoint to load after construction.
            non_negative (bool): clamp outputs through a final ReLU.
            num_channels (int): number of predicted channels.
            **kwargs: forwarded to ``DPT`` (may include ``features``).
        """
        features = kwargs.get('features', 256)
        head = nn.Sequential(
            nn.Conv2d(
                features, features // 2, kernel_size=3, stride=1, padding=1),
            Interpolate(scale_factor=2, mode='bilinear', align_corners=True),
            nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(True),
            nn.Conv2d(32, num_channels, kernel_size=1, stride=1, padding=0),
            nn.ReLU(True) if non_negative else nn.Identity(),
            nn.Identity(),
        )
        super().__init__(head, **kwargs)
        if path is not None:
            self.load(path)

    def forward(self, x):
        """Run DPT and drop the singleton channel dim for 1-channel heads."""
        return super().forward(x).squeeze(dim=1)

View File

@@ -0,0 +1,517 @@
# This implementation is adopted from MiDaS
# made publicly available under the MIT license
# https://github.com/isl-org/MiDaS
import math
import types
import timm
import torch
import torch.nn as nn
import torch.nn.functional as F
class Slice(nn.Module):
    """Drop the leading readout token(s) from a token sequence."""

    def __init__(self, start_index=1):
        """Args:
            start_index (int): index of the first token to keep.
        """
        super(Slice, self).__init__()
        self.start_index = start_index

    def forward(self, x):
        """Return the tokens from ``start_index`` onward."""
        return x[:, self.start_index:]
class AddReadout(nn.Module):
    """Fold the readout token(s) into the patch tokens by addition."""

    def __init__(self, start_index=1):
        """Args:
            start_index (int): first patch-token index (2 = distilled ViT).
        """
        super(AddReadout, self).__init__()
        self.start_index = start_index

    def forward(self, x):
        """Add the (averaged) readout token onto every patch token."""
        patch_tokens = x[:, self.start_index:]
        if self.start_index == 2:
            # Distilled model: average the class and distillation tokens.
            readout = (x[:, 0] + x[:, 1]) / 2
        else:
            readout = x[:, 0]
        return patch_tokens + readout.unsqueeze(1)
class ProjectReadout(nn.Module):
    """Concatenate the readout token to every patch token and project back
    to the original width with a Linear + GELU.
    """

    def __init__(self, in_features, start_index=1):
        """Args:
            in_features (int): token embedding width.
            start_index (int): first patch-token index.
        """
        super(ProjectReadout, self).__init__()
        self.start_index = start_index
        self.project = nn.Sequential(
            nn.Linear(2 * in_features, in_features), nn.GELU())

    def forward(self, x):
        """Return projected patch tokens of shape (B, tokens, in_features)."""
        patch_tokens = x[:, self.start_index:]
        readout = x[:, 0].unsqueeze(1).expand_as(patch_tokens)
        return self.project(torch.cat((patch_tokens, readout), -1))
class Transpose(nn.Module):
    """Swap two tensor dimensions (module form, usable inside a Sequential)."""

    def __init__(self, dim0, dim1):
        """Args:
            dim0 (int): first dimension.
            dim1 (int): second dimension.
        """
        super(Transpose, self).__init__()
        self.dim0 = dim0
        self.dim1 = dim1

    def forward(self, x):
        """Return ``x`` with ``dim0`` and ``dim1`` swapped."""
        return x.transpose(self.dim0, self.dim1)
def forward_vit(pretrained, x):
    """Run the hooked ViT encoder and return four multi-scale feature maps.

    Relies on the forward hooks installed by the backbone builders (see
    ``get_activation``) filling ``pretrained.activations`` as a side effect
    of the forward pass.
    """
    b, c, h, w = x.shape
    # Forward pass is run only for its side effect of filling `activations`.
    _ = pretrained.model.forward_flex(x)
    layer_1 = pretrained.activations['1']
    layer_2 = pretrained.activations['2']
    layer_3 = pretrained.activations['3']
    layer_4 = pretrained.activations['4']
    # Stages [0:2] of each post-process head: readout op + transpose.
    layer_1 = pretrained.act_postprocess1[0:2](layer_1)
    layer_2 = pretrained.act_postprocess2[0:2](layer_2)
    layer_3 = pretrained.act_postprocess3[0:2](layer_3)
    layer_4 = pretrained.act_postprocess4[0:2](layer_4)
    # Unflatten with the *runtime* grid size (h, w dependent) instead of the
    # static Unflatten stage at index 2, so non-default input sizes work.
    unflatten = nn.Sequential(
        nn.Unflatten(
            2,
            torch.Size([
                h // pretrained.model.patch_size[1],
                w // pretrained.model.patch_size[0],
            ]),
        ))
    # ndim == 3 means a (B, C, tokens) sequence; 4-D activations come from
    # the CNN stem of the hybrid backbone and are already spatial.
    if layer_1.ndim == 3:
        layer_1 = unflatten(layer_1)
    if layer_2.ndim == 3:
        layer_2 = unflatten(layer_2)
    if layer_3.ndim == 3:
        layer_3 = unflatten(layer_3)
    if layer_4.ndim == 3:
        layer_4 = unflatten(layer_4)
    # Remaining stages [3:]: 1x1 projection (+ up/downsampling convs where
    # present); index 2 (the static Unflatten) is deliberately skipped.
    layer_1 = pretrained.act_postprocess1[3:len(pretrained.act_postprocess1)](
        layer_1)
    layer_2 = pretrained.act_postprocess2[3:len(pretrained.act_postprocess2)](
        layer_2)
    layer_3 = pretrained.act_postprocess3[3:len(pretrained.act_postprocess3)](
        layer_3)
    layer_4 = pretrained.act_postprocess4[3:len(pretrained.act_postprocess4)](
        layer_4)
    return layer_1, layer_2, layer_3, layer_4
def _resize_pos_embed(self, posemb, gs_h, gs_w):
posemb_tok, posemb_grid = (
posemb[:, :self.start_index],
posemb[0, self.start_index:],
)
gs_old = int(math.sqrt(len(posemb_grid)))
posemb_grid = posemb_grid.reshape(1, gs_old, gs_old,
-1).permute(0, 3, 1, 2)
posemb_grid = F.interpolate(
posemb_grid, size=(gs_h, gs_w), mode='bilinear')
posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)
posemb = torch.cat([posemb_tok, posemb_grid], dim=1)
return posemb
def forward_flex(self, x):
    """ViT forward pass that tolerates arbitrary input resolutions.

    Injected as a method onto timm VisionTransformer instances; resizes the
    position embedding to the runtime patch grid before running the blocks.
    """
    b, c, h, w = x.shape
    # Interpolate the stored position embedding to the current grid size.
    pos_embed = self._resize_pos_embed(self.pos_embed, h // self.patch_size[1],
                                       w // self.patch_size[0])
    B = x.shape[0]
    if hasattr(self.patch_embed, 'backbone'):
        # Hybrid model: run the CNN stem before patch projection.
        x = self.patch_embed.backbone(x)
        if isinstance(x, (list, tuple)):
            x = x[
                -1]  # last feature if backbone outputs list/tuple of features
    x = self.patch_embed.proj(x).flatten(2).transpose(1, 2)
    if getattr(self, 'dist_token', None) is not None:
        # Distilled ViT: prepend both the class and the distillation token.
        cls_tokens = self.cls_token.expand(
            B, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
        dist_token = self.dist_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, dist_token, x), dim=1)
    else:
        cls_tokens = self.cls_token.expand(
            B, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
        x = torch.cat((cls_tokens, x), dim=1)
    x = x + pos_embed
    x = self.pos_drop(x)
    for blk in self.blocks:
        x = blk(x)
    x = self.norm(x)
    return x
# Shared buffer filled by the forward hooks registered on the backbone;
# consumed by `forward_vit` after each forward pass.
activations = {}


def get_activation(name):
    """Return a forward hook that records its module's output under ``name``."""

    def hook(model, input, output):
        activations[name] = output

    return hook
def get_readout_oper(vit_features, features, use_readout, start_index=1):
    """Build one readout-token operation per hooked feature level.

    Args:
        vit_features (int): transformer embedding width (used by 'project').
        features (list[int]): one entry per level; only its length is used.
        use_readout (str): 'ignore', 'add' or 'project'.
        start_index (int): index of the first patch token.

    Returns:
        list[nn.Module]: ``len(features)`` readout modules.

    Raises:
        ValueError: if ``use_readout`` is not a supported mode.
    """
    if use_readout == 'ignore':
        readout_oper = [Slice(start_index)] * len(features)
    elif use_readout == 'add':
        readout_oper = [AddReadout(start_index)] * len(features)
    elif use_readout == 'project':
        readout_oper = [
            ProjectReadout(vit_features, start_index) for out_feat in features
        ]
    else:
        # Raise instead of `assert False`: asserts vanish under `python -O`,
        # which would fall through and return an unbound name.
        raise ValueError(
            "wrong operation for readout token, use_readout can be "
            "'ignore', 'add', or 'project'")
    return readout_oper
def _make_vit_b16_backbone(
    model,
    features=[96, 192, 384, 768],
    size=[384, 384],
    hooks=[2, 5, 8, 11],
    vit_features=768,
    use_readout='ignore',
    start_index=1,
):
    """Wrap a timm patch-16 ViT for DPT multi-scale feature extraction.

    Hooks four transformer blocks and builds one ``act_postprocess*`` head
    per level that turns (B, tokens, C) activations into spatial maps.
    NOTE: ``forward_vit`` slices these Sequentials positionally ([0:2] and
    [3:], skipping the static Unflatten at index 2), so the stage order
    below is a load-bearing contract.

    Args:
        model: timm VisionTransformer instance.
        features (list[int]): output channels per hooked level.
        size (list[int]): nominal input size for the static Unflatten stage.
        hooks (list[int]): transformer block indices to hook.
        vit_features (int): transformer embedding width.
        use_readout (str): readout-token handling ('ignore'/'add'/'project').
        start_index (int): first patch-token index (2 for distilled models).

    Returns:
        nn.Module: module exposing ``model``, ``activations`` and the four
        ``act_postprocess*`` heads consumed by ``forward_vit``.
    """
    pretrained = nn.Module()
    pretrained.model = model
    # Side-effect hooks: block outputs land in the shared `activations` dict.
    pretrained.model.blocks[hooks[0]].register_forward_hook(
        get_activation('1'))
    pretrained.model.blocks[hooks[1]].register_forward_hook(
        get_activation('2'))
    pretrained.model.blocks[hooks[2]].register_forward_hook(
        get_activation('3'))
    pretrained.model.blocks[hooks[3]].register_forward_hook(
        get_activation('4'))
    pretrained.activations = activations
    readout_oper = get_readout_oper(vit_features, features, use_readout,
                                    start_index)
    # Level 1: readout, transpose, unflatten, 1x1 proj, then 4x upsample.
    pretrained.act_postprocess1 = nn.Sequential(
        readout_oper[0],
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
        nn.Conv2d(
            in_channels=vit_features,
            out_channels=features[0],
            kernel_size=1,
            stride=1,
            padding=0,
        ),
        nn.ConvTranspose2d(
            in_channels=features[0],
            out_channels=features[0],
            kernel_size=4,
            stride=4,
            padding=0,
            bias=True,
            dilation=1,
            groups=1,
        ),
    )
    # Level 2: same, with a 2x upsample.
    pretrained.act_postprocess2 = nn.Sequential(
        readout_oper[1],
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
        nn.Conv2d(
            in_channels=vit_features,
            out_channels=features[1],
            kernel_size=1,
            stride=1,
            padding=0,
        ),
        nn.ConvTranspose2d(
            in_channels=features[1],
            out_channels=features[1],
            kernel_size=2,
            stride=2,
            padding=0,
            bias=True,
            dilation=1,
            groups=1,
        ),
    )
    # Level 3: 1x1 projection only (kept at the native 1/16 resolution).
    pretrained.act_postprocess3 = nn.Sequential(
        readout_oper[2],
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
        nn.Conv2d(
            in_channels=vit_features,
            out_channels=features[2],
            kernel_size=1,
            stride=1,
            padding=0,
        ),
    )
    # Level 4: 1x1 projection followed by a stride-2 downsampling conv.
    pretrained.act_postprocess4 = nn.Sequential(
        readout_oper[3],
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
        nn.Conv2d(
            in_channels=vit_features,
            out_channels=features[3],
            kernel_size=1,
            stride=1,
            padding=0,
        ),
        nn.Conv2d(
            in_channels=features[3],
            out_channels=features[3],
            kernel_size=3,
            stride=2,
            padding=1,
        ),
    )
    pretrained.model.start_index = start_index
    pretrained.model.patch_size = [16, 16]
    # We inject this function into the VisionTransformer instances so that
    # we can use it with interpolated position embeddings without modifying the library source.
    pretrained.model.forward_flex = types.MethodType(forward_flex,
                                                     pretrained.model)
    pretrained.model._resize_pos_embed = types.MethodType(
        _resize_pos_embed, pretrained.model)
    return pretrained
def _make_pretrained_vitl16_384(pretrained, use_readout='ignore', hooks=None):
    """ViT-Large/16 @384 encoder wrapped for DPT."""
    vit = timm.create_model('vit_large_patch16_384', pretrained=pretrained)
    if hooks is None:
        hooks = [5, 11, 17, 23]
    return _make_vit_b16_backbone(
        vit,
        features=[256, 512, 1024, 1024],
        hooks=hooks,
        vit_features=1024,
        use_readout=use_readout,
    )
def _make_pretrained_vitb16_384(pretrained, use_readout='ignore', hooks=None):
    """ViT-Base/16 @384 encoder wrapped for DPT."""
    vit = timm.create_model('vit_base_patch16_384', pretrained=pretrained)
    if hooks is None:
        hooks = [2, 5, 8, 11]
    return _make_vit_b16_backbone(
        vit,
        features=[96, 192, 384, 768],
        hooks=hooks,
        use_readout=use_readout)
def _make_pretrained_deitb16_384(pretrained, use_readout='ignore', hooks=None):
    """DeiT-Base/16 @384 encoder wrapped for DPT."""
    deit = timm.create_model(
        'vit_deit_base_patch16_384', pretrained=pretrained)
    if hooks is None:
        hooks = [2, 5, 8, 11]
    return _make_vit_b16_backbone(
        deit,
        features=[96, 192, 384, 768],
        hooks=hooks,
        use_readout=use_readout)
def _make_pretrained_deitb16_distil_384(pretrained,
                                        use_readout='ignore',
                                        hooks=None):
    """Distilled DeiT-Base/16 @384 encoder wrapped for DPT."""
    deit = timm.create_model(
        'vit_deit_base_distilled_patch16_384', pretrained=pretrained)
    if hooks is None:
        hooks = [2, 5, 8, 11]
    # start_index=2 skips both the class token and the distillation token.
    return _make_vit_b16_backbone(
        deit,
        features=[96, 192, 384, 768],
        hooks=hooks,
        use_readout=use_readout,
        start_index=2,
    )
def _make_vit_b_rn50_backbone(
    model,
    features=[256, 512, 768, 768],
    size=[384, 384],
    hooks=[0, 1, 8, 11],
    vit_features=768,
    use_vit_only=False,
    use_readout='ignore',
    start_index=1,
):
    """Wrap a timm hybrid ResNet50+ViT-B for DPT multi-scale extraction.

    The two finest levels come either from two transformer blocks
    (``use_vit_only``) or from the first two ResNet stem stages; the two
    coarsest always come from transformer blocks.
    NOTE: ``forward_vit`` slices each ``act_postprocess*`` positionally
    ([0:2] and [3:]), which is why the non-ViT branch pads with three
    Identity stages — the stage layout is a contract; do not reorder.

    Args:
        model: timm hybrid VisionTransformer instance.
        features (list[int]): output channels per level.
        size (list[int]): nominal input size for the static Unflatten stage.
        hooks (list[int]): transformer block indices to hook.
        vit_features (int): transformer embedding width.
        use_vit_only (bool): take all four levels from transformer blocks.
        use_readout (str): readout-token handling ('ignore'/'add'/'project').
        start_index (int): first patch-token index.

    Returns:
        nn.Module: module exposing ``model``, ``activations`` and the four
        ``act_postprocess*`` heads consumed by ``forward_vit``.
    """
    pretrained = nn.Module()
    pretrained.model = model
    if use_vit_only:
        pretrained.model.blocks[hooks[0]].register_forward_hook(
            get_activation('1'))
        pretrained.model.blocks[hooks[1]].register_forward_hook(
            get_activation('2'))
    else:
        # Fine levels tapped from the CNN stem (already spatial 4-D maps).
        pretrained.model.patch_embed.backbone.stages[0].register_forward_hook(
            get_activation('1'))
        pretrained.model.patch_embed.backbone.stages[1].register_forward_hook(
            get_activation('2'))
    pretrained.model.blocks[hooks[2]].register_forward_hook(
        get_activation('3'))
    pretrained.model.blocks[hooks[3]].register_forward_hook(
        get_activation('4'))
    pretrained.activations = activations
    readout_oper = get_readout_oper(vit_features, features, use_readout,
                                    start_index)
    if use_vit_only:
        # Token-based levels: readout, transpose, unflatten, 1x1 proj, upsample.
        pretrained.act_postprocess1 = nn.Sequential(
            readout_oper[0],
            Transpose(1, 2),
            nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
            nn.Conv2d(
                in_channels=vit_features,
                out_channels=features[0],
                kernel_size=1,
                stride=1,
                padding=0,
            ),
            nn.ConvTranspose2d(
                in_channels=features[0],
                out_channels=features[0],
                kernel_size=4,
                stride=4,
                padding=0,
                bias=True,
                dilation=1,
                groups=1,
            ),
        )
        pretrained.act_postprocess2 = nn.Sequential(
            readout_oper[1],
            Transpose(1, 2),
            nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
            nn.Conv2d(
                in_channels=vit_features,
                out_channels=features[1],
                kernel_size=1,
                stride=1,
                padding=0,
            ),
            nn.ConvTranspose2d(
                in_channels=features[1],
                out_channels=features[1],
                kernel_size=2,
                stride=2,
                padding=0,
                bias=True,
                dilation=1,
                groups=1,
            ),
        )
    else:
        # CNN-stem levels are passed through; three Identities keep the
        # positional slicing in `forward_vit` valid.
        pretrained.act_postprocess1 = nn.Sequential(nn.Identity(),
                                                    nn.Identity(),
                                                    nn.Identity())
        pretrained.act_postprocess2 = nn.Sequential(nn.Identity(),
                                                    nn.Identity(),
                                                    nn.Identity())
    # Level 3: 1x1 projection at the native 1/16 resolution.
    pretrained.act_postprocess3 = nn.Sequential(
        readout_oper[2],
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
        nn.Conv2d(
            in_channels=vit_features,
            out_channels=features[2],
            kernel_size=1,
            stride=1,
            padding=0,
        ),
    )
    # Level 4: 1x1 projection followed by a stride-2 downsampling conv.
    pretrained.act_postprocess4 = nn.Sequential(
        readout_oper[3],
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
        nn.Conv2d(
            in_channels=vit_features,
            out_channels=features[3],
            kernel_size=1,
            stride=1,
            padding=0,
        ),
        nn.Conv2d(
            in_channels=features[3],
            out_channels=features[3],
            kernel_size=3,
            stride=2,
            padding=1,
        ),
    )
    pretrained.model.start_index = start_index
    pretrained.model.patch_size = [16, 16]
    # We inject this function into the VisionTransformer instances so that
    # we can use it with interpolated position embeddings without modifying the library source.
    pretrained.model.forward_flex = types.MethodType(forward_flex,
                                                     pretrained.model)
    # We inject this function into the VisionTransformer instances so that
    # we can use it with interpolated position embeddings without modifying the library source.
    pretrained.model._resize_pos_embed = types.MethodType(
        _resize_pos_embed, pretrained.model)
    return pretrained
def _make_pretrained_vitb_rn50_384(pretrained,
                                   use_readout='ignore',
                                   hooks=None,
                                   use_vit_only=False):
    """ResNet50 + ViT-Base hybrid @384 encoder wrapped for DPT."""
    hybrid = timm.create_model('vit_base_resnet50_384', pretrained=pretrained)
    if hooks is None:
        hooks = [0, 1, 8, 11]
    return _make_vit_b_rn50_backbone(
        hybrid,
        features=[256, 512, 768, 768],
        size=[384, 384],
        hooks=hooks,
        use_vit_only=use_vit_only,
        use_readout=use_readout,
    )

View File

@@ -0,0 +1,54 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# Model: Omnidata: A Scalable Pipeline for Making Multi-Task Mid-Level Vision Datasets from 3D Scans
# Paper link: https://arxiv.org/pdf/2110.04994.pdf
import os.path as osp
import torch
from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.cv.image_normal_estimation.modules.midas.dpt_depth import \
DPTDepthModel
from modelscope.outputs import OutputKeys
from modelscope.utils.constant import ModelFile, Tasks
@MODELS.register_module(
    Tasks.image_normal_estimation,
    module_name=Models.omnidata_normal_estimation)
class OmnidataNormalEstimation(TorchModel):
    """Omnidata surface-normal estimation model (DPT-Hybrid backbone).

    Paper: https://arxiv.org/pdf/2110.04994.pdf
    """

    def __init__(self, model_dir: str, **kwargs):
        """Build the network and load weights from ``model_dir``.

        Args:
            model_dir (str): model file root.
        """
        super().__init__(model_dir, **kwargs)
        # DPT-Hybrid with a 3-channel head (one per normal component).
        self.model = DPTDepthModel(
            backbone='vitb_rn50_384', num_channels=3)
        weights_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
        checkpoint = torch.load(weights_path, map_location='cpu')
        if 'state_dict' in checkpoint:
            # Training checkpoints prefix every key; drop the first 6 chars
            # (presumably a 'model.' prefix — TODO(review): confirm against
            # the released checkpoint).
            state_dict = {
                k[6:]: v
                for k, v in checkpoint['state_dict'].items()
            }
        else:
            state_dict = checkpoint
        self.model.load_state_dict(state_dict)
        self.model.eval()

    def forward(self, inputs):
        """Predict normals for inputs['imgs'], clamped to [0, 1]."""
        return self.model(inputs['imgs']).clamp(min=0, max=1)

    def postprocess(self, inputs):
        """Flip the prediction along dim 1 and wrap it for the pipeline."""
        return {OutputKeys.NORMALS: inputs.flip(1)}

    def inference(self, data):
        """Run a forward pass on already-preprocessed data."""
        return self.forward(data)

View File

@@ -25,6 +25,8 @@ class OutputKeys(object):
MASKS = 'masks'
DEPTHS = 'depths'
DEPTHS_COLOR = 'depths_color'
NORMALS = 'normals'
NORMALS_COLOR = 'normals_color'
LAYOUT = 'layout'
TEXT = 'text'
POLYGONS = 'polygons'

View File

@@ -0,0 +1,154 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import Any, Dict, Union
import cv2
import numpy as np
import PIL
import torch
from modelscope.metainfo import Pipelines
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Model, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
logger = get_logger()
@PIPELINES.register_module(
    Tasks.image_normal_estimation,
    module_name=Pipelines.image_normal_estimation)
class ImageNormalEstimationPipeline(Pipeline):
    r""" Image Normal Estimation Pipeline.
    Examples:
    >>> from modelscope.pipelines import pipeline
    >>> estimator = pipeline(
    >>>        Tasks.image_normal_estimation, model='Damo_XR_Lab/cv_omnidata_image-normal-estimation_normal')
    >>> estimator("https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_normal_estimation.jpg")
    >>>   {
    >>>    "normals": array([[[0.09233217, 0.07563387, 0.08025375, ..., 0.06992684,
    >>>         0.07490329, 0.14308228],
    >>>        [0.07833742, 0.06736029, 0.07296766, ..., 0.09184352,
    >>>         0.0800755 , 0.09726034],
    >>>        [0.07676302, 0.06631223, 0.07067154, ..., 0.09527256,
    >>>         0.09292313, 0.08056315],
    >>>        ...,
    >>>        [0.26432115, 0.29100573, 0.2956126 , ..., 0.2913087 ,
    >>>         0.29201347, 0.29539976],
    >>>        [0.24557455, 0.26430887, 0.28548756, ..., 0.2877307 ,
    >>>         0.28856137, 0.2937242 ],
    >>>        [0.26316068, 0.2718169 , 0.28436714, ..., 0.29435217,
    >>>         0.29842147, 0.2943223 ]],
    >>>       [[0.59257126, 0.6459297 , 0.66572756, ..., 0.68350476,
    >>>         0.6882835 , 0.66579086],
    >>>        [0.7054596 , 0.6592535 , 0.6728153 , ..., 0.6589912 ,
    >>>         0.64541686, 0.63954735],
    >>>        [0.6912665 , 0.6638877 , 0.67816293, ..., 0.6607329 ,
    >>>         0.6472897 , 0.64633334],
    >>>        ...,
    >>>        [0.04231769, 0.04427819, 0.04816979, ..., 0.04485315,
    >>>         0.04652229, 0.04869233],
    >>>        [0.04601872, 0.03706329, 0.04397734, ..., 0.04522909,
    >>>         0.04745695, 0.04823782],
    >>>        [0.06671816, 0.0520605 , 0.0563788 , ..., 0.04913886,
    >>>         0.04974678, 0.04954173]],
    >>>       [[0.4338835 , 0.43240184, 0.43519282, ..., 0.36894026,
    >>>         0.35207224, 0.33153164],
    >>>        [0.4786287 , 0.4399531 , 0.4350407 , ..., 0.34690523,
    >>>         0.3179497 , 0.26544768],
    >>>        [0.47692937, 0.4416514 , 0.437603  , ..., 0.34660107,
    >>>         0.3102659 , 0.27787644],
    >>>        ...,
    >>>        [0.49566334, 0.48355937, 0.48710674, ..., 0.4964854 ,
    >>>         0.48945957, 0.49413157],
    >>>        [0.490632  , 0.4706958 , 0.48100013, ..., 0.48724395,
    >>>         0.4799561 , 0.48129278],
    >>>        [0.49428058, 0.47433382, 0.4823783 , ..., 0.48930234,
    >>>         0.48616886, 0.47176325]]], dtype=float32),
    >>>    'normals_color': array([[[ 23, 151, 110],
    >>>        [ 19, 164, 110],
    >>>        [ 20, 169, 110],
    >>>        ...,
    >>>        [ 17, 174,  94],
    >>>        [ 19, 175,  89],
    >>>        [ 36, 169,  84]],
    >>>       [[ 19, 179, 122],
    >>>        [ 17, 168, 112],
    >>>        [ 18, 171, 110],
    >>>        ...,
    >>>        [ 23, 168,  88],
    >>>        [ 20, 164,  81],
    >>>        [ 24, 163,  67]],
    >>>       [[ 19, 176, 121],
    >>>        [ 16, 169, 112],
    >>>        [ 18, 172, 111],
    >>>        ...,
    >>>        [ 24, 168,  88],
    >>>        [ 23, 165,  79],
    >>>        [ 20, 164,  70]],
    >>>       ...,
    >>>       [[ 67,  10, 126],
    >>>        [ 74,  11, 123],
    >>>        [ 75,  12, 124],
    >>>        ...,
    >>>        [ 74,  11, 126],
    >>>        [ 74,  11, 124],
    >>>        [ 75,  12, 126]],
    >>>       [[ 62,  11, 125],
    >>>        [ 67,   9, 120],
    >>>        [ 72,  11, 122],
    >>>        ...,
    >>>        [ 73,  11, 124],
    >>>        [ 73,  12, 122],
    >>>        [ 74,  12, 122]],
    >>>       [[ 67,  17, 126],
    >>>        [ 69,  13, 120],
    >>>        [ 72,  14, 123],
    >>>        ...,
    >>>        [ 75,  12, 124],
    >>>        [ 76,  12, 123],
    >>>        [ 75,  12, 120]]], dtype=uint8)}
    """

    def __init__(self, model: str, **kwargs):
        """
        use `model` to create an image normal estimation pipeline for prediction
        Args:
            model: model id on modelscope hub.
        """
        super().__init__(model=model, **kwargs)
        logger.info('normal estimation model, pipeline init')

    def preprocess(self, input: Input) -> Dict[str, Any]:
        """Decode the input and resize it to the fixed 384x384 network size.

        NOTE(review): the aspect ratio is not preserved by the resize.
        """
        img = LoadImage.convert_to_ndarray(input).astype(np.float32)
        H, W = 384, 384
        img = cv2.resize(img, [W, H])
        # HWC -> CHW, scale to [0, 1], then add a batch dimension.
        img = img.transpose(2, 0, 1) / 255.0
        imgs = img[None, ...]
        data = {'imgs': imgs}
        return data

    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
        """Delegate inference to the underlying model."""
        results = self.model.inference(input)
        return results

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Run model postprocess, then build numpy + uint8 color outputs."""
        results = self.model.postprocess(inputs)
        normals = results[OutputKeys.NORMALS]
        if isinstance(normals, torch.Tensor):
            normals = normals.detach().cpu().squeeze().numpy()
        # CHW float in [0, 1] -> HWC uint8 visualization.
        normals_color = (np.transpose(normals,
                                      (1, 2, 0)) * 255).astype(np.uint8)
        outputs = {
            OutputKeys.NORMALS: normals,
            OutputKeys.NORMALS_COLOR: normals_color
        }
        return outputs

View File

@@ -57,6 +57,7 @@ class CVTasks(object):
semantic_segmentation = 'semantic-segmentation'
image_driving_perception = 'image-driving-perception'
image_depth_estimation = 'image-depth-estimation'
image_normal_estimation = 'image-normal-estimation'
indoor_layout_estimation = 'indoor-layout-estimation'
video_depth_estimation = 'video-depth-estimation'
panorama_depth_estimation = 'panorama-depth-estimation'

View File

@@ -1144,6 +1144,13 @@
"type": "object"
}
},
"image-normal-estimation": {
"input": {},
"parameters": {},
"output": {
"type": "object"
}
},
"image-driving-perception": {
"input": {
"type": "object",

View File

@@ -0,0 +1,33 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import unittest
import cv2
import numpy as np
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.test_utils import test_level
class ImageNormalEstimationTest(unittest.TestCase):
    """End-to-end smoke test for the image normal estimation pipeline."""

    def setUp(self) -> None:
        # Task name and default model shared by the tests in this class.
        self.task = 'image-normal-estimation'
        self.model_id = 'Damo_XR_Lab/cv_omnidata_image-normal-estimation_normal'

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_image_normal_estimation(self):
        # Build the pipeline, run one image, and persist the color
        # visualization so the artifact can be inspected manually.
        input_location = 'data/test/images/image_normal_estimation.jpg'
        estimator = pipeline(
            Tasks.image_normal_estimation, model=self.model_id)
        result = estimator(input_location)
        normals_vis = result[OutputKeys.NORMALS_COLOR]
        cv2.imwrite('result.jpg', normals_vis)
        print('test_image_normal_estimation DONE')


if __name__ == '__main__':
    unittest.main()